diff --git a/CMakeLists.txt b/CMakeLists.txt index fb763546..fcbea720 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -588,7 +588,7 @@ if(MGE_WITH_CUDA) set(CMAKE_CUDA_FLAGS_MINSIZEREL "-Os") if(MSVC OR WIN32) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xfatbin -compress-all") - set(CCBIN_FLAG "${CCBIN_FLAG} /wd4819 /wd4334 /wd4267 /wd4002 /wd4244 /wd4068 /std:c++14") + set(CCBIN_FLAG "${CCBIN_FLAG} /wd4819 /wd4334 /wd4267 /wd4002 /wd4244 /wd4068 /std:c++14 /bigobj") if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") set(CCBIN_FLAG "${CCBIN_FLAG} -D_ITERATOR_DEBUG_LEVEL=2 -MTd") endif() diff --git a/dnn/src/aarch64/relayout/opr_impl.cpp b/dnn/src/aarch64/relayout/opr_impl.cpp index cbfece6e..6827f4fa 100644 --- a/dnn/src/aarch64/relayout/opr_impl.cpp +++ b/dnn/src/aarch64/relayout/opr_impl.cpp @@ -365,27 +365,22 @@ void aarch64::RelayoutForwardImpl::exec( relayout::TransposeParam trans_param; bool trans = relayout::is_transpose(src.layout, dst.layout, trans_param, true); if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 1) { - auto sptr = static_cast(src.raw_ptr), - dptr = static_cast(dst.raw_ptr); MEGDNN_DISPATCH_CPU_KERN_OPR(transpose_fallback::transpose( - trans_param.batch, trans_param.m, trans_param.n, sptr, dptr, - trans_param.stride_m)); + trans_param.batch, trans_param.m, trans_param.n, + static_cast(src.raw_ptr()), + static_cast(dst.raw_ptr()), trans_param.stride_m)); return; } else if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 2) { - auto sptr = static_cast(src.raw_ptr), - dptr = static_cast(dst.raw_ptr); - MEGDNN_DISPATCH_CPU_KERN_OPR(transpose_fallback::transpose( - trans_param.batch, trans_param.m, trans_param.n, sptr, dptr, - trans_param.stride_m)); + trans_param.batch, trans_param.m, trans_param.n, + static_cast(src.raw_ptr()), + static_cast(dst.raw_ptr()), trans_param.stride_m)); return; } else if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 4) { - auto sptr = static_cast(src.raw_ptr), - dptr = static_cast(dst.raw_ptr); - MEGDNN_DISPATCH_CPU_KERN_OPR(transpose_fallback::transpose( - trans_param.batch, trans_param.m, trans_param.n, sptr, dptr, - trans_param.stride_m)); + trans_param.batch, trans_param.m, trans_param.n, + static_cast(src.raw_ptr()), + static_cast(dst.raw_ptr()), trans_param.stride_m)); return; } diff --git a/dnn/src/aarch64/rotate/opr_impl.cpp b/dnn/src/aarch64/rotate/opr_impl.cpp index b80783e2..e87430c2 100644 --- a/dnn/src/aarch64/rotate/opr_impl.cpp +++ b/dnn/src/aarch64/rotate/opr_impl.cpp @@ -358,11 +358,13 @@ void RotateImpl::exec( return fallback::RotateImpl::exec(src, dst, workspace); } + auto clockwise = param().clockwise; + MEGDNN_DISPATCH_CPU_KERN_OPR({ for (size_t i = 0; i < src.layout.shape[0]; ++i) { Mat src_mat = TensorND2Mat(src, i); Mat dst_mat = TensorND2Mat(dst, i); - rotate(src_mat, dst_mat, param().clockwise); + rotate(src_mat, dst_mat, clockwise); } }); } diff --git a/dnn/src/aarch64/warp_perspective/warp_perspective_cv.cpp b/dnn/src/aarch64/warp_perspective/warp_perspective_cv.cpp index 91108e23..46f832ec 100644 --- a/dnn/src/aarch64/warp_perspective/warp_perspective_cv.cpp +++ b/dnn/src/aarch64/warp_perspective/warp_perspective_cv.cpp @@ -205,16 +205,16 @@ void megdnn::aarch64::warp_perspective_cv_exec( megdnn_assert( ch == 1 || ch == 3 || ch == 2, "unsupported src channel: %zu, avaiable channel size: 1/2/3", ch); - const float* trans_ptr = trans.ptr(); - const int* midx_ptr = nullptr; - if (mat_idx.raw_ptr) { - megdnn_assert(mat_idx.layout.ndim == 1); - midx_ptr = mat_idx.ptr(); - } if 
(dst.layout.dtype.enumv() == DTypeEnum::Float32) { #define cb(_imode, _bmode, _ch) \ - auto task = [src, trans_ptr, midx_ptr, dst, border_value, parallelism_batch]( \ + auto task = [src, trans, mat_idx, dst, border_value, parallelism_batch]( \ size_t index, size_t) { \ + const float* trans_ptr = trans.ptr(); \ + const int* midx_ptr = nullptr; \ + if (mat_idx.raw_ptr()) { \ + megdnn_assert(mat_idx.layout.ndim == 1); \ + midx_ptr = mat_idx.ptr(); \ + } \ size_t batch_id = index / parallelism_batch; \ size_t task_id = index % parallelism_batch; \ size_t src_id = batch_id; \ @@ -240,8 +240,14 @@ void megdnn::aarch64::warp_perspective_cv_exec( #undef cb } else if (dst.layout.dtype.enumv() == DTypeEnum::Uint8) { #define cb(_imode, _bmode, _ch) \ - auto task = [src, trans_ptr, midx_ptr, dst, border_value, parallelism_batch]( \ + auto task = [src, trans, mat_idx, dst, border_value, parallelism_batch]( \ size_t index, size_t) { \ + const float* trans_ptr = trans.ptr(); \ + const int* midx_ptr = nullptr; \ + if (mat_idx.raw_ptr()) { \ + megdnn_assert(mat_idx.layout.ndim == 1); \ + midx_ptr = mat_idx.ptr(); \ + } \ size_t batch_id = index / parallelism_batch; \ size_t task_id = index % parallelism_batch; \ size_t src_id = batch_id; \ diff --git a/dnn/src/arm_common/conv_bias/int8x8x16/algos.cpp b/dnn/src/arm_common/conv_bias/int8x8x16/algos.cpp index a8c8ea09..ae00ac27 100644 --- a/dnn/src/arm_common/conv_bias/int8x8x16/algos.cpp +++ b/dnn/src/arm_common/conv_bias/int8x8x16/algos.cpp @@ -531,10 +531,10 @@ SmallVector ConvBiasImpl::AlgoI8x8x16Stride2Filter2:: megdnn_arm_common_conv_bias_int8816_kimpl, midout_iv("AlgoI8x8x16Stride2Filter2::dispatch_kerns"_hash)) { auto ncb_param = param; - ncb_param.src_ptr = param.src(0, ncb_index.ndrange_id[0]); - ncb_param.dst_ptr = param.dst(0, ncb_index.ndrange_id[0]); - ncb_param.filter_ptr = param.filter(ncb_index.ndrange_id[0]); - ncb_param.bias_ptr = param.bias(0, ncb_index.ndrange_id[0]); + ncb_param.src_ptr += param.src_offset(0, ncb_index.ndrange_id[0]); + ncb_param.dst_ptr += param.dst_offset(0, ncb_index.ndrange_id[0]); + ncb_param.filter_ptr += param.filter_offset(ncb_index.ndrange_id[0]); + ncb_param.bias_ptr += param.bias_offset(0, ncb_index.ndrange_id[0]); conv_bias::conv_int8x8x16_stride2_flt2(ncb_param); } MIDOUT_END(); diff --git a/dnn/src/arm_common/conv_bias/int8x8x16/direct_nchw_nchw44_algo.cpp b/dnn/src/arm_common/conv_bias/int8x8x16/direct_nchw_nchw44_algo.cpp index 231c2dfd..bf3279aa 100644 --- a/dnn/src/arm_common/conv_bias/int8x8x16/direct_nchw_nchw44_algo.cpp +++ b/dnn/src/arm_common/conv_bias/int8x8x16/direct_nchw_nchw44_algo.cpp @@ -133,7 +133,8 @@ static void pack_weight( constexpr int pack_oc = 8; if (kern_param.bias_mode == BiasMode::BROADCAST_CHANNEL_BIAS && oc % pack_oc != 0) { auto packed_bias = reinterpret_cast(bundle.get(2)); - memcpy(packed_bias, kern_param.bias_ptr, round_up(oc, 8) * sizeof(int16_t)); + memcpy(packed_bias, kern_param.bias_ptr.get_ptr(), + round_up(oc, 8) * sizeof(int16_t)); } } diff --git a/dnn/src/arm_common/cvt_color/opr_impl.cpp b/dnn/src/arm_common/cvt_color/opr_impl.cpp index 56ce613b..c30e78c2 100644 --- a/dnn/src/arm_common/cvt_color/opr_impl.cpp +++ b/dnn/src/arm_common/cvt_color/opr_impl.cpp @@ -1657,4 +1657,4 @@ void CvtColorImpl::exec( } // namespace arm_common } // namespace megdnn -// vim: syntax=cpp.doxygen +// vim: syntax=cpp.doxygen \ No newline at end of file diff --git a/dnn/src/arm_common/elemwise/binary/algo.cpp b/dnn/src/arm_common/elemwise/binary/algo.cpp index 2bdd0ea1..515d1ae0 100644 
--- a/dnn/src/arm_common/elemwise/binary/algo.cpp +++ b/dnn/src/arm_common/elemwise/binary/algo.cpp @@ -220,9 +220,9 @@ void ElemwiseImpl::AlgoBinaryVecVec::exec(const KernParam& kern_param) const { run = OpCallerBinary<_op<_type, _type>, BcastType::VEC_VEC>::run; \ MEGDNN_DISPATCH_CPU_KERN( \ static_cast(kern_param.handle), \ - run(static_cast(src0.raw_ptr), \ - static_cast(src1.raw_ptr), \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + run(static_cast(src0.raw_ptr()), \ + static_cast(src1.raw_ptr()), \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, dst.layout.dtype, \ src0.layout.total_nr_elems())); \ } \ @@ -254,9 +254,9 @@ void ElemwiseImpl::AlgoBinaryVecScalar::exec(const KernParam& kern_param) const _op<_type, _type>, BcastType::VEC_SCALAR>::run; \ MEGDNN_DISPATCH_CPU_KERN( \ static_cast(kern_param.handle), \ - run(static_cast(src0.raw_ptr), \ - static_cast(src1.raw_ptr)[0], \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + run(static_cast(src0.raw_ptr()), \ + static_cast(src1.raw_ptr())[0], \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, dst.layout.dtype, \ src0.layout.total_nr_elems())); \ } \ @@ -280,9 +280,9 @@ void ElemwiseImpl::AlgoBinaryVecScalar::exec(const KernParam& kern_param) const _op<_type, _type>, BcastType::SCALAR_VEC>::run; \ MEGDNN_DISPATCH_CPU_KERN( \ static_cast(kern_param.handle), \ - run(static_cast(src0.raw_ptr)[0], \ - static_cast(src1.raw_ptr), \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + run(static_cast(src0.raw_ptr())[0], \ + static_cast(src1.raw_ptr()), \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, dst.layout.dtype, \ src1.layout.total_nr_elems())); \ } \ @@ -318,9 +318,9 @@ void ElemwiseImpl::AlgoBinaryVecBcast101::exec(const KernParam& kern_param) cons _op<_type, _type>, BcastType::VEC_BCAST101>::run; \ MEGDNN_DISPATCH_CPU_KERN( \ static_cast(kern_param.handle), \ - run(static_cast(src0.raw_ptr), \ - static_cast(src1.raw_ptr), \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + run(static_cast(src0.raw_ptr()), \ + static_cast(src1.raw_ptr()), \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, dst.layout.dtype, binfo.x, binfo.y, \ binfo.z)); \ } \ @@ -347,9 +347,9 @@ void ElemwiseImpl::AlgoBinaryVecBcast101::exec(const KernParam& kern_param) cons _op<_type, _type>, BcastType::BCAST101_VEC>::run; \ MEGDNN_DISPATCH_CPU_KERN( \ static_cast(kern_param.handle), \ - run(static_cast(src0.raw_ptr), \ - static_cast(src1.raw_ptr), \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + run(static_cast(src0.raw_ptr()), \ + static_cast(src1.raw_ptr()), \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, dst.layout.dtype, binfo.x, binfo.y, \ binfo.z)); \ } \ @@ -384,9 +384,9 @@ void ElemwiseImpl::AlgoBinaryVecBcastX0X::exec(const KernParam& kern_param) cons _op<_type, _type>, BcastType::VEC_BCASTX0X>::run; \ MEGDNN_DISPATCH_CPU_KERN( \ static_cast(kern_param.handle), \ - run(static_cast(src0.raw_ptr), \ - static_cast(src1.raw_ptr), \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + run(static_cast(src0.raw_ptr()), \ + static_cast(src1.raw_ptr()), \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, dst.layout.dtype, binfo.x, binfo.y, \ binfo.z)); \ } \ @@ -413,9 +413,9 @@ void ElemwiseImpl::AlgoBinaryVecBcastX0X::exec(const KernParam& kern_param) cons _op<_type, _type>, BcastType::BCASTX0X_VEC>::run; \ MEGDNN_DISPATCH_CPU_KERN( 
\ static_cast(kern_param.handle), \ - run(static_cast(src0.raw_ptr), \ - static_cast(src1.raw_ptr), \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + run(static_cast(src0.raw_ptr()), \ + static_cast(src1.raw_ptr()), \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, dst.layout.dtype, binfo.x, binfo.y, \ binfo.z)); \ } \ @@ -450,9 +450,9 @@ void ElemwiseImpl::AlgoBinaryVecBcast111C::exec(const KernParam& kern_param) con _op<_type, _type>, BcastType::VEC_BCAST111C>::run; \ MEGDNN_DISPATCH_CPU_KERN( \ static_cast(kern_param.handle), \ - run(static_cast(src0.raw_ptr), \ - static_cast(src1.raw_ptr), \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + run(static_cast(src0.raw_ptr()), \ + static_cast(src1.raw_ptr()), \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, dst.layout.dtype, binfo.x, binfo.y, \ binfo.z)); \ } \ @@ -479,9 +479,9 @@ void ElemwiseImpl::AlgoBinaryVecBcast111C::exec(const KernParam& kern_param) con _op<_type, _type>, BcastType::BCAST111C_VEC>::run; \ MEGDNN_DISPATCH_CPU_KERN( \ static_cast(kern_param.handle), \ - run(static_cast(src0.raw_ptr), \ - static_cast(src1.raw_ptr), \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + run(static_cast(src0.raw_ptr()), \ + static_cast(src1.raw_ptr()), \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, dst.layout.dtype, binfo.x, binfo.y, \ binfo.z)); \ } \ @@ -519,9 +519,9 @@ void ElemwiseImpl::AlgoBinaryVecBcast101xX::exec(const KernParam& kern_param) co _op<_type, _type>, BcastType::VEC_BCAST101xX>::run; \ MEGDNN_DISPATCH_CPU_KERN( \ static_cast(kern_param.handle), \ - run(static_cast(src0.raw_ptr), \ - static_cast(src1.raw_ptr), \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + run(static_cast(src0.raw_ptr()), \ + static_cast(src1.raw_ptr()), \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, dst.layout.dtype, batch_size, binfo.x, \ binfo.y, binfo.z)); \ } \ @@ -551,9 +551,9 @@ void ElemwiseImpl::AlgoBinaryVecBcast101xX::exec(const KernParam& kern_param) co _op<_type, _type>, BcastType::BCAST101xX_VEC>::run; \ MEGDNN_DISPATCH_CPU_KERN( \ static_cast(kern_param.handle), \ - run(static_cast(src0.raw_ptr), \ - static_cast(src1.raw_ptr), \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + run(static_cast(src0.raw_ptr()), \ + static_cast(src1.raw_ptr()), \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, dst.layout.dtype, batch_size, binfo.x, \ binfo.y, binfo.z)); \ } \ diff --git a/dnn/src/arm_common/elemwise/ternary/algo.cpp b/dnn/src/arm_common/elemwise/ternary/algo.cpp index d5bb9359..1016f2e2 100644 --- a/dnn/src/arm_common/elemwise/ternary/algo.cpp +++ b/dnn/src/arm_common/elemwise/ternary/algo.cpp @@ -79,10 +79,10 @@ void ElemwiseImpl::AlgoTernaryFma3VecVecVec::exec(const KernParam& kern_param) c _op<_type, _type>, BcastType::VEC_VEC_VEC>::run; \ MEGDNN_DISPATCH_CPU_KERN( \ static_cast(kern_param.handle), \ - run(static_cast(src0.raw_ptr), \ - static_cast(src1.raw_ptr), \ - static_cast(src2.raw_ptr), \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + run(static_cast(src0.raw_ptr()), \ + static_cast(src1.raw_ptr()), \ + static_cast(src2.raw_ptr()), \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, src2.layout.dtype, dst.layout.dtype, \ src0.layout.total_nr_elems())); \ } \ @@ -113,10 +113,10 @@ void ElemwiseImpl::AlgoTernaryFma3VecVecScalar::exec( _op<_type, _type>, BcastType::VEC_VEC_SCALAR>::run; \ 
MEGDNN_DISPATCH_CPU_KERN( \ static_cast(kern_param.handle), \ - run(static_cast(src0.raw_ptr), \ - static_cast(src1.raw_ptr), \ - static_cast(src2.raw_ptr)[0], \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + run(static_cast(src0.raw_ptr()), \ + static_cast(src1.raw_ptr()), \ + static_cast(src2.raw_ptr())[0], \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, src2.layout.dtype, dst.layout.dtype, \ src0.layout.total_nr_elems())); \ } \ @@ -149,10 +149,10 @@ void ElemwiseImpl::AlgoTernaryFma3Bcast101VecBcast101::exec( _op<_type, _type>, BcastType::BCAST101_VEC_BCAST101>::run; \ MEGDNN_DISPATCH_CPU_KERN( \ static_cast(kern_param.handle), \ - run(static_cast(src0.raw_ptr), \ - static_cast(src1.raw_ptr), \ - static_cast(src2.raw_ptr), \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + run(static_cast(src0.raw_ptr()), \ + static_cast(src1.raw_ptr()), \ + static_cast(src2.raw_ptr()), \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, src2.layout.dtype, dst.layout.dtype, \ binfo.x, binfo.y, binfo.z)); \ } \ @@ -187,11 +187,11 @@ void ElemwiseImpl::AlgoTernaryFma3Bcast111CVecBcast111C::exec( BcastType::BCAST111C_VEC_BCAST111C>::run; \ MEGDNN_DISPATCH_CPU_KERN( \ static_cast(kern_param.handle), \ - run(static_cast(src0.raw_ptr), \ - static_cast(src1.raw_ptr), \ + run(static_cast(src0.raw_ptr()), \ + static_cast(src1.raw_ptr()), \ is_vector(src1.layout) ? 0 : src1.layout.stride[0] - binfo.z, \ - static_cast(src2.raw_ptr), \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + static_cast(src2.raw_ptr()), \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, src2.layout.dtype, dst.layout.dtype, \ binfo.x, binfo.y, binfo.z)); \ } \ @@ -228,10 +228,10 @@ void ElemwiseImpl::AlgoTernaryFma3Bcast101xXVecBcast101xX::exec( BcastType::BCAST101xX_VEC_BCAST101xX>::run; \ MEGDNN_DISPATCH_CPU_KERN( \ static_cast(kern_param.handle), \ - run(static_cast(src0.raw_ptr), \ - static_cast(src1.raw_ptr), \ - static_cast(src2.raw_ptr), \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + run(static_cast(src0.raw_ptr()), \ + static_cast(src1.raw_ptr()), \ + static_cast(src2.raw_ptr()), \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, src2.layout.dtype, dst.layout.dtype, \ batch_size, binfo.x, binfo.y, binfo.z)); \ } \ @@ -268,10 +268,10 @@ void ElemwiseImpl::AlgoTernaryFma3VecBcast101xXVec::exec( _op<_type, _type>, BcastType::VEC_BCAST101xX_VEC>::run; \ MEGDNN_DISPATCH_CPU_KERN( \ static_cast(kern_param.handle), \ - run(static_cast(src0.raw_ptr), \ - static_cast(src1.raw_ptr), \ - static_cast(src2.raw_ptr), \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + run(static_cast(src0.raw_ptr()), \ + static_cast(src1.raw_ptr()), \ + static_cast(src2.raw_ptr()), \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, src2.layout.dtype, dst.layout.dtype, \ batch_size, binfo.x, binfo.y, binfo.z)); \ } \ @@ -306,10 +306,10 @@ void ElemwiseImpl::AlgoTernaryFma3VecBcast101Vec::exec( _op<_type, _type>, BcastType::VEC_BCAST101_VEC>::run; \ MEGDNN_DISPATCH_CPU_KERN( \ static_cast(kern_param.handle), \ - run(static_cast(src0.raw_ptr), \ - static_cast(src1.raw_ptr), \ - static_cast(src2.raw_ptr), \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + run(static_cast(src0.raw_ptr()), \ + static_cast(src1.raw_ptr()), \ + static_cast(src2.raw_ptr()), \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, src2.layout.dtype, 
dst.layout.dtype, \ binfo.x, binfo.y, binfo.z)); \ } \ @@ -343,12 +343,12 @@ void ElemwiseImpl::AlgoTernaryFma3VecBcast111CVec::exec( _op<_type, _type>, BcastType::VEC_BCAST111C_VEC>::run; \ MEGDNN_DISPATCH_CPU_KERN( \ static_cast(kern_param.handle), \ - run(static_cast(src0.raw_ptr), \ + run(static_cast(src0.raw_ptr()), \ is_vector(src0.layout) ? 0 : src0.layout.stride[0] - binfo.z, \ - static_cast(src1.raw_ptr), \ - static_cast(src2.raw_ptr), \ + static_cast(src1.raw_ptr()), \ + static_cast(src2.raw_ptr()), \ is_vector(src2.layout) ? 0 : src2.layout.stride[0] - binfo.z, \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, src2.layout.dtype, dst.layout.dtype, \ binfo.x, binfo.y, binfo.z)); \ } \ @@ -380,10 +380,10 @@ void ElemwiseImpl::AlgoTernaryFma3VecScalarVec::exec( _op<_type, _type>, BcastType::VEC_SCALAR_VEC>::run; \ MEGDNN_DISPATCH_CPU_KERN( \ static_cast(kern_param.handle), \ - run(static_cast(src0.raw_ptr), \ - static_cast(src1.raw_ptr)[0], \ - static_cast(src2.raw_ptr), \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + run(static_cast(src0.raw_ptr()), \ + static_cast(src1.raw_ptr())[0], \ + static_cast(src2.raw_ptr()), \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, src2.layout.dtype, dst.layout.dtype, \ src0.layout.total_nr_elems())); \ } \ @@ -414,10 +414,10 @@ void ElemwiseImpl::AlgoTernaryFma3VecScalarScalar::exec( _op<_type, _type>, BcastType::VEC_SCALAR_SCALAR>::run; \ MEGDNN_DISPATCH_CPU_KERN( \ static_cast(kern_param.handle), \ - run(static_cast(src0.raw_ptr), \ - static_cast(src1.raw_ptr)[0], \ - static_cast(src2.raw_ptr)[0], \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + run(static_cast(src0.raw_ptr()), \ + static_cast(src1.raw_ptr())[0], \ + static_cast(src2.raw_ptr())[0], \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, src2.layout.dtype, dst.layout.dtype, \ src0.layout.total_nr_elems())); \ } \ diff --git a/dnn/src/arm_common/elemwise/unary/algo.cpp b/dnn/src/arm_common/elemwise/unary/algo.cpp index 35ab39e2..f4d48f93 100644 --- a/dnn/src/arm_common/elemwise/unary/algo.cpp +++ b/dnn/src/arm_common/elemwise/unary/algo.cpp @@ -76,8 +76,8 @@ void ElemwiseImpl::AlgoUnary::exec(const KernParam& kern_param) const { size_t offset = task_id * nr_elems_per_thread; \ size_t nr_elems_thread = \ std::min(nr_elems - offset, nr_elems_per_thread); \ - run(static_cast(src0.raw_ptr) + offset, \ - static_cast<_type*>(dst_tensor.raw_ptr) + offset, \ + run(static_cast(src0.raw_ptr()) + offset, \ + static_cast<_type*>(dst_tensor.raw_ptr()) + offset, \ src0.layout.dtype, dst_tensor.layout.dtype, nr_elems_thread); \ }; \ MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \ diff --git a/dnn/src/arm_common/elemwise_multi_type/opr_impl.cpp b/dnn/src/arm_common/elemwise_multi_type/opr_impl.cpp index da5acbf6..810c20c9 100644 --- a/dnn/src/arm_common/elemwise_multi_type/opr_impl.cpp +++ b/dnn/src/arm_common/elemwise_multi_type/opr_impl.cpp @@ -148,17 +148,17 @@ void ElemwiseMultiTypeImpl::neon_round_shr_saturate_bcast_scalar( template void ElemwiseMultiTypeImpl::dispatch_round_shr_saturate_iXxi8xi8_bcast_scalar( - const ElemwiseOpParamN<2>& param, megdnn::dt_int8* dst) { - auto a_ptr = param[0].ptr(); + const ElemwiseOpParamN<2>& param, const TensorND& dst) { auto k = param[1].ptr()[0]; size_t size = param.size; + auto src = param[0]; - MEGDNN_DISPATCH_CPU_KERN_OPR( - neon_round_shr_saturate_bcast_scalar(a_ptr, k, size, dst)); + 
MEGDNN_DISPATCH_CPU_KERN_OPR(neon_round_shr_saturate_bcast_scalar( + src.ptr(), k, size, static_cast(dst.raw_ptr()))); } void ElemwiseMultiTypeImpl::on_round_shr_saturate_iXxi8xi8( - const ElemwiseOpParamN<2>& param, megdnn::dt_int8* dst) { + const ElemwiseOpParamN<2>& param, const TensorND& dst) { if (is_vector(param[0].layout) && is_broadcasted_scalar(param[1].layout)) { switch (param[0].layout.dtype.enumv()) { #define cb(t) \ @@ -282,7 +282,7 @@ void neon_fuse_add_rmulh_round_shr_saturate_bcast_1c11_int32( } bool ElemwiseMultiTypeImpl::dispatch_fuse_add_rmulh_rshr( - const ElemwiseOpParamN<6>& param, megdnn::dt_int8* dst) { + const ElemwiseOpParamN<6>& param, const TensorND& dst) { BroadcastChannelInfo binfo; if (is_vector(param[0].layout) && is_broadcasted_channel_like(param[1].layout, binfo) && @@ -294,16 +294,18 @@ bool ElemwiseMultiTypeImpl::dispatch_fuse_add_rmulh_rshr( auto minv = param[4].ptr()[0]; auto maxv = param[5].ptr()[0]; switch (param[0].layout.dtype.enumv()) { -#define DISPATCH(stype, suffix) \ - case DTypeTrait::enumv: { \ - auto x_ptr = param[0].ptr::ctype>(); \ - auto b_ptr = param[1].ptr::ctype>(); \ - auto M = param[2].ptr::ctype>()[0]; \ - MEGDNN_DISPATCH_CPU_KERN_OPR( \ - neon_fuse_add_rmulh_round_shr_saturate_bcast_1c11_##suffix( \ - binfo.x, binfo.y, binfo.z, x_ptr, b_ptr, M, offset, minv, \ - maxv, param.size, dst)); \ - break; \ +#define DISPATCH(stype, suffix) \ + case DTypeTrait::enumv: { \ + auto M = param[2].ptr::ctype>()[0]; \ + auto src0 = param[0]; \ + auto src1 = param[1]; \ + MEGDNN_DISPATCH_CPU_KERN_OPR( \ + neon_fuse_add_rmulh_round_shr_saturate_bcast_1c11_##suffix( \ + binfo.x, binfo.y, binfo.z, \ + src0.ptr::ctype>(), \ + src1.ptr::ctype>(), M, offset, minv, maxv, \ + param.size, static_cast(dst.raw_ptr()))); \ + break; \ } DISPATCH(dtype::Int16, int16) DISPATCH(dtype::Int32, int32) @@ -317,7 +319,7 @@ bool ElemwiseMultiTypeImpl::dispatch_fuse_add_rmulh_rshr( } void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8( - const ElemwiseOpParamN<6>& param, megdnn::dt_int8* dst) { + const ElemwiseOpParamN<6>& param, const TensorND& dst) { if (dispatch_fuse_add_rmulh_rshr(param, dst)) return; fallback::ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8( @@ -325,7 +327,7 @@ void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8( } void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int32x32x32x8( - const ElemwiseOpParamN<6>& param, megdnn::dt_int8* dst) { + const ElemwiseOpParamN<6>& param, const TensorND& dst) { if (dispatch_fuse_add_rmulh_rshr(param, dst)) return; fallback::ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int32x32x32x8( diff --git a/dnn/src/arm_common/elemwise_multi_type/opr_impl.h b/dnn/src/arm_common/elemwise_multi_type/opr_impl.h index 96e55962..deb34883 100644 --- a/dnn/src/arm_common/elemwise_multi_type/opr_impl.h +++ b/dnn/src/arm_common/elemwise_multi_type/opr_impl.h @@ -23,18 +23,18 @@ class ElemwiseMultiTypeImpl : public fallback::ElemwiseMultiTypeImpl { template void dispatch_round_shr_saturate_iXxi8xi8_bcast_scalar( - const ElemwiseOpParamN<2>& param, megdnn::dt_int8* dst); + const ElemwiseOpParamN<2>& param, const TensorND& dst); bool dispatch_fuse_add_rmulh_rshr( - const ElemwiseOpParamN<6>& param, megdnn::dt_int8* dst); + const ElemwiseOpParamN<6>& param, const TensorND& dst); protected: void on_round_shr_saturate_iXxi8xi8( - const ElemwiseOpParamN<2>& param, dt_int8* dst) override; + const ElemwiseOpParamN<2>& param, const 
TensorND& dst) override; void on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8( - const ElemwiseOpParamN<6>& param, dt_int8* dst) override; + const ElemwiseOpParamN<6>& param, const TensorND& dst) override; void on_fuse_add_rmulh_round_shr_saturate_int32x32x32x8( - const ElemwiseOpParamN<6>& param, dt_int8* dst) override; + const ElemwiseOpParamN<6>& param, const TensorND& dst) override; void on_quantized_mode( const ElemwiseOpParamN<1>& param, const TensorND& dst, diff --git a/dnn/src/arm_common/pooling/algo.cpp b/dnn/src/arm_common/pooling/algo.cpp index 44b5db4b..201d2172 100644 --- a/dnn/src/arm_common/pooling/algo.cpp +++ b/dnn/src/arm_common/pooling/algo.cpp @@ -117,27 +117,27 @@ void PoolingImpl::AlgoFilterxModexStride1::exec(const PoolingKernParam& param) c auto PW = param.padding[1]; auto FH = param.filter[0]; - void* src_ptr = param.src_ptr; - void* dst_ptr = param.dst_ptr; - -#define DISPATCH_FUNC(Pooler, NeonPooler, window, midout_type_id) \ - MIDOUT_BEGIN( \ - megdnn_arm_common_pooling, midout_iv(0), midout_iv(midout_type_id), \ - Pooler::MIDOUT_CASE_NUM, NeonPooler::MIDOUT_CASE_NUM, window) { \ - auto run = [C, IH, IW, OH, OW, PH, PW, src_ptr, dst_ptr, \ - src_dtype = param.src_type](size_t index, size_t) { \ - size_t n = index / C; \ - size_t c = index % C; \ - do_pooling_compact( \ - static_cast(src_ptr) + \ - n * C * IH * IW + c * IH * IW, \ - static_cast(dst_ptr) + n * C * OH * OW + \ - c * OH * OW, \ - src_dtype, IH, IW, OH, OW, PH, PW); \ - }; \ - MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \ - static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \ - } \ + auto src_ptr = param.src_ptr; + auto dst_ptr = param.dst_ptr; + +#define DISPATCH_FUNC(Pooler, NeonPooler, window, midout_type_id) \ + MIDOUT_BEGIN( \ + megdnn_arm_common_pooling, midout_iv(0), midout_iv(midout_type_id), \ + Pooler::MIDOUT_CASE_NUM, NeonPooler::MIDOUT_CASE_NUM, window) { \ + auto run = [C, IH, IW, OH, OW, PH, PW, src_ptr, dst_ptr, \ + src_dtype = param.src_type](size_t index, size_t) { \ + size_t n = index / C; \ + size_t c = index % C; \ + do_pooling_compact( \ + static_cast(src_ptr.get_ptr()) + \ + n * C * IH * IW + c * IH * IW, \ + static_cast(dst_ptr.get_ptr()) + \ + n * C * OH * OW + c * OH * OW, \ + src_dtype, IH, IW, OH, OW, PH, PW); \ + }; \ + MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \ + static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \ + } \ MIDOUT_END() #define DISPATCH_WINDOW(Pooler, NeonPooler, dtype, ctype, comp_type, midout_type_id) \ @@ -213,26 +213,26 @@ void PoolingImpl::AlgoFilter2ModexStride2::exec(const PoolingKernParam& param) c auto PH = param.padding[0]; auto PW = param.padding[1]; - void* src_ptr = param.src_ptr; - void* dst_ptr = param.dst_ptr; -#define DISPATCH_FUNC(Pooler, mode, midout_type_id) \ - MIDOUT_BEGIN( \ - megdnn_arm_common_pooling, midout_iv(1), midout_iv(midout_type_id), \ - Pooler::MIDOUT_CASE_NUM) { \ - auto run = [C, IH, IW, OH, OW, PH, PW, src_ptr, dst_ptr, \ - src_dtype = param.src_type](size_t index, size_t) { \ - size_t n = index / C; \ - size_t c = index % C; \ - do_pooling_2x2( \ - static_cast(src_ptr) + \ - n * C * IH * IW + c * IH * IW, \ - static_cast(dst_ptr) + n * C * OH * OW + \ - c * OH * OW, \ - src_dtype, IH, IW, OH, OW, PH, PW); \ - }; \ - MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \ - static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \ - } \ + auto src_ptr = param.src_ptr; + auto dst_ptr = param.dst_ptr; +#define DISPATCH_FUNC(Pooler, mode, midout_type_id) \ + MIDOUT_BEGIN( \ + 
megdnn_arm_common_pooling, midout_iv(1), midout_iv(midout_type_id), \ + Pooler::MIDOUT_CASE_NUM) { \ + auto run = [C, IH, IW, OH, OW, PH, PW, src_ptr, dst_ptr, \ + src_dtype = param.src_type](size_t index, size_t) { \ + size_t n = index / C; \ + size_t c = index % C; \ + do_pooling_2x2( \ + static_cast(src_ptr.get_ptr()) + \ + n * C * IH * IW + c * IH * IW, \ + static_cast(dst_ptr.get_ptr()) + \ + n * C * OH * OW + c * OH * OW, \ + src_dtype, IH, IW, OH, OW, PH, PW); \ + }; \ + MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \ + static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \ + } \ MIDOUT_END() #define DISPATCH_MODE(dtype, ctype, comp_type, midout_type_id) \ @@ -286,8 +286,8 @@ void PoolingImpl::AlgoFilter3MaxStride2::exec(const PoolingKernParam& param) con auto PH = param.padding[0]; auto PW = param.padding[1]; - void* src_ptr = param.src_ptr; - void* dst_ptr = param.dst_ptr; + auto src_ptr = param.src_ptr; + auto dst_ptr = param.dst_ptr; #define DISPATCH_FUNC(type, func, midout_type_id) \ MIDOUT_BEGIN(megdnn_arm_common_pooling, midout_iv(2), midout_iv(midout_type_id)) { \ @@ -300,9 +300,11 @@ void PoolingImpl::AlgoFilter3MaxStride2::exec(const PoolingKernParam& param) con size_t n = index / C; \ size_t c = index % C; \ do_max_pooling_3x3_s2x2_##func##_NEON( \ - static_cast(src_ptr) + n * C * IH * IW + c * IH * IW, \ - static_cast(dst_ptr) + n * C * OH * OW + c * OH * OW, IH, \ - IW, OH, OW, PH, PW, ws); \ + static_cast(src_ptr.get_ptr()) + n * C * IH * IW + \ + c * IH * IW, \ + static_cast(dst_ptr.get_ptr()) + n * C * OH * OW + \ + c * OH * OW, \ + IH, IW, OH, OW, PH, PW, ws); \ }; \ MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \ static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \ @@ -339,8 +341,8 @@ void PoolingImpl::AlgoFilter3AverageStride2::exec(const PoolingKernParam& param) auto PH = param.padding[0]; auto PW = param.padding[1]; - void* src_ptr = param.src_ptr; - void* dst_ptr = param.dst_ptr; + auto src_ptr = param.src_ptr; + auto dst_ptr = param.dst_ptr; #define DISPATCH_FUNC(type, MEGDNN_SIMD_WIDTH, midout_type_id) \ MIDOUT_BEGIN(megdnn_arm_common_pooling, midout_iv(3), midout_iv(midout_type_id)) { \ @@ -353,9 +355,11 @@ void PoolingImpl::AlgoFilter3AverageStride2::exec(const PoolingKernParam& param) size_t n = index / C; \ size_t c = index % C; \ do_average_pooling_3x3_s2x2_NEON( \ - static_cast(src_ptr) + n * C * IH * IW + c * IH * IW, \ - static_cast(dst_ptr) + n * C * OH * OW + c * OH * OW, IH, \ - IW, OH, OW, PH, PW, ws, MEGDNN_SIMD_WIDTH); \ + static_cast(src_ptr.get_ptr()) + n * C * IH * IW + \ + c * IH * IW, \ + static_cast(dst_ptr.get_ptr()) + n * C * OH * OW + \ + c * OH * OW, \ + IH, IW, OH, OW, PH, PW, ws, MEGDNN_SIMD_WIDTH); \ }; \ MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \ static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \ @@ -392,8 +396,8 @@ void PoolingImpl::AlgoFilter4MaxStride2::exec(const PoolingKernParam& param) con auto PH = param.padding[0]; auto PW = param.padding[1]; - void* src_ptr = param.src_ptr; - void* dst_ptr = param.dst_ptr; + auto src_ptr = param.src_ptr; + auto dst_ptr = param.dst_ptr; #define DISPATCH_FUNC(type, func, midout_type_id) \ MIDOUT_BEGIN(megdnn_arm_common_pooling, midout_iv(4), midout_iv(midout_type_id)) { \ @@ -402,8 +406,10 @@ void PoolingImpl::AlgoFilter4MaxStride2::exec(const PoolingKernParam& param) con size_t n = index / C; \ size_t c = index % C; \ do_max_pooling_w4x4_s2x2_##func##_NEON( \ - static_cast(src_ptr) + n * C * IH * IW + c * IH * IW, \ - static_cast(dst_ptr) + n * C * OH * 
OW + c * OH * OW, \ + static_cast(src_ptr.get_ptr()) + n * C * IH * IW + \ + c * IH * IW, \ + static_cast(dst_ptr.get_ptr()) + n * C * OH * OW + \ + c * OH * OW, \ src_dtype, IH, IW, OH, OW, PH, PW); \ }; \ MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \ @@ -446,8 +452,8 @@ void PoolingImpl::AlgoFilter5MaxStride2::exec(const PoolingKernParam& param) con auto PH = param.padding[0]; auto PW = param.padding[1]; - void* src_ptr = param.src_ptr; - void* dst_ptr = param.dst_ptr; + auto src_ptr = param.src_ptr; + auto dst_ptr = param.dst_ptr; #define DISPATCH_FUNC(dtype, type, midout_type_id, MEGDNN_SIMD_WIDTH) \ MIDOUT_BEGIN(megdnn_arm_common_pooling, midout_iv(5), midout_iv(midout_type_id)) { \ @@ -460,9 +466,11 @@ void PoolingImpl::AlgoFilter5MaxStride2::exec(const PoolingKernParam& param) con size_t n = index / C; \ size_t c = index % C; \ do_max_pooling_w5x5_s2x2_NEON( \ - static_cast(src_ptr) + n * C * IH * IW + c * IH * IW, \ - static_cast(dst_ptr) + n * C * OH * OW + c * OH * OW, IH, \ - IW, OH, OW, PH, PW, ws, MEGDNN_SIMD_WIDTH); \ + static_cast(src_ptr.get_ptr()) + n * C * IH * IW + \ + c * IH * IW, \ + static_cast(dst_ptr.get_ptr()) + n * C * OH * OW + \ + c * OH * OW, \ + IH, IW, OH, OW, PH, PW, ws, MEGDNN_SIMD_WIDTH); \ }; \ MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \ static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \ @@ -593,8 +601,8 @@ void PoolingImpl::AlgoFilter3ModexStridexNCHW44::exec( auto PW = param.padding[1]; auto SW = param.stride[0]; - void* src_ptr = param.src_ptr; - void* dst_ptr = param.dst_ptr; + auto src_ptr = param.src_ptr; + auto dst_ptr = param.dst_ptr; #define DISPATCH_FUNC(type, func, i, mode) \ MIDOUT_BEGIN( \ @@ -608,9 +616,9 @@ void PoolingImpl::AlgoFilter3ModexStridexNCHW44::exec( size_t n = index / C; \ size_t c = index % C; \ do_##mode##_pooling_3x3_stride##i##_##func##_nchw44_NEON( \ - static_cast(src_ptr) + n * C * IH * IW * 4 + \ - c * IH * IW * 4, \ - static_cast(dst_ptr) + n * C * OH * OW * 4 + \ + static_cast(src_ptr.get_ptr()) + \ + n * C * IH * IW * 4 + c * IH * IW * 4, \ + static_cast(dst_ptr.get_ptr()) + n * C * OH * OW * 4 + \ c * OH * OW * 4, \ IH, IW, OH, OW, PH, PW, ws); \ }; \ @@ -685,8 +693,8 @@ void PoolingImpl::AlgoFilter2ModexStridexNCHW44::exec( auto PW = param.padding[1]; auto SW = param.stride[0]; - void* src_ptr = param.src_ptr; - void* dst_ptr = param.dst_ptr; + auto src_ptr = param.src_ptr; + auto dst_ptr = param.dst_ptr; #define DISPATCH_FUNC(type, func, i, mode) \ MIDOUT_BEGIN( \ @@ -700,9 +708,9 @@ void PoolingImpl::AlgoFilter2ModexStridexNCHW44::exec( size_t n = index / C; \ size_t c = index % C; \ do_##mode##_pooling_2x2_stride##i##_##func##_nchw44_NEON( \ - static_cast(src_ptr) + n * C * IH * IW * 4 + \ - c * IH * IW * 4, \ - static_cast(dst_ptr) + n * C * OH * OW * 4 + \ + static_cast(src_ptr.get_ptr()) + \ + n * C * IH * IW * 4 + c * IH * IW * 4, \ + static_cast(dst_ptr.get_ptr()) + n * C * OH * OW * 4 + \ c * OH * OW * 4, \ IH, IW, OH, OW, PH, PW, ws); \ }; \ @@ -778,8 +786,8 @@ void PoolingImpl::AlgoFilter4ModexStridexNCHW44::exec( auto PW = param.padding[1]; auto SW = param.stride[0]; - void* src_ptr = param.src_ptr; - void* dst_ptr = param.dst_ptr; + auto src_ptr = param.src_ptr; + auto dst_ptr = param.dst_ptr; #define DISPATCH_FUNC(type, func, i, mode) \ MIDOUT_BEGIN( \ @@ -793,9 +801,9 @@ void PoolingImpl::AlgoFilter4ModexStridexNCHW44::exec( size_t n = index / C; \ size_t c = index % C; \ do_##mode##_pooling_4x4_stride##i##_##func##_nchw44_NEON( \ - static_cast(src_ptr) + n * C * IH * IW * 4 + \ - c * IH * 
IW * 4, \ - static_cast(dst_ptr) + n * C * OH * OW * 4 + \ + static_cast(src_ptr.get_ptr()) + \ + n * C * IH * IW * 4 + c * IH * IW * 4, \ + static_cast(dst_ptr.get_ptr()) + n * C * OH * OW * 4 + \ c * OH * OW * 4, \ IH, IW, OH, OW, PH, PW, ws); \ }; \ @@ -870,8 +878,8 @@ void PoolingImpl::AlgoFilter5ModexStridexNCHW44::exec( auto PW = param.padding[1]; auto SW = param.stride[0]; - void* src_ptr = param.src_ptr; - void* dst_ptr = param.dst_ptr; + auto src_ptr = param.src_ptr; + auto dst_ptr = param.dst_ptr; #define DISPATCH_FUNC(type, func, i, mode) \ MIDOUT_BEGIN( \ @@ -885,9 +893,9 @@ void PoolingImpl::AlgoFilter5ModexStridexNCHW44::exec( size_t n = index / C; \ size_t c = index % C; \ do_##mode##_pooling_5x5_stride##i##_##func##_nchw44_NEON( \ - static_cast(src_ptr) + n * C * IH * IW * 4 + \ - c * IH * IW * 4, \ - static_cast(dst_ptr) + n * C * OH * OW * 4 + \ + static_cast(src_ptr.get_ptr()) + \ + n * C * IH * IW * 4 + c * IH * IW * 4, \ + static_cast(dst_ptr.get_ptr()) + n * C * OH * OW * 4 + \ c * OH * OW * 4, \ IH, IW, OH, OW, PH, PW, ws); \ }; \ diff --git a/dnn/src/arm_common/pooling/algo_fp32_pooling_nchw44.cpp b/dnn/src/arm_common/pooling/algo_fp32_pooling_nchw44.cpp index 3c341c68..17e2d71e 100644 --- a/dnn/src/arm_common/pooling/algo_fp32_pooling_nchw44.cpp +++ b/dnn/src/arm_common/pooling/algo_fp32_pooling_nchw44.cpp @@ -50,8 +50,8 @@ void PoolingImpl::AlgoFp32ModexStridexNCHW44::exec( int sh = param.stride[0]; int fh = param.filter[0]; - void* src_ptr = param.src_ptr; - void* dst_ptr = param.dst_ptr; + auto src_ptr = param.src_ptr; + auto dst_ptr = param.dst_ptr; #define DISPATCH_FUNC(filter, stride, mode) \ MIDOUT_BEGIN( \ @@ -60,9 +60,10 @@ void PoolingImpl::AlgoFp32ModexStridexNCHW44::exec( auto run = [ih, iw, oh, ow, ph, pw, src_ptr, dst_ptr](size_t index, size_t) { \ const int c_idx = index; \ pooling_fp32_nchw44( \ - static_cast(src_ptr) + c_idx * ih * iw * 4, \ - static_cast(dst_ptr) + c_idx * oh * ow * 4, ih, iw, oh, \ - ow, ph, pw); \ + static_cast(src_ptr.get_ptr()) + \ + c_idx * ih * iw * 4, \ + static_cast(dst_ptr.get_ptr()) + c_idx * oh * ow * 4, ih, \ + iw, oh, ow, ph, pw); \ }; \ MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \ static_cast<::megdnn::naive::HandleImpl*>(param.handle), n* ic, run); \ diff --git a/dnn/src/arm_common/pooling/opr_impl.cpp b/dnn/src/arm_common/pooling/opr_impl.cpp index 587a738d..223a438e 100644 --- a/dnn/src/arm_common/pooling/opr_impl.cpp +++ b/dnn/src/arm_common/pooling/opr_impl.cpp @@ -89,8 +89,8 @@ PoolingImpl::PoolingKernParam PoolingImpl::make_pooling_kern_param( PoolingKernParam ret; static_cast(ret) = make_pooling_kern_szie_param(opr, src.layout, dst.layout); - ret.src_ptr = src.raw_ptr; - ret.dst_ptr = dst.raw_ptr; + ret.src_ptr = src.get_ref_ptr(); + ret.dst_ptr = dst.get_ref_ptr(); ret.workspace_ptr = workspace.raw_ptr; ret.workspace_size = workspace.size; return ret; diff --git a/dnn/src/arm_common/pooling/opr_impl.h b/dnn/src/arm_common/pooling/opr_impl.h index 229259d0..9f2590e1 100644 --- a/dnn/src/arm_common/pooling/opr_impl.h +++ b/dnn/src/arm_common/pooling/opr_impl.h @@ -56,21 +56,21 @@ public: }; struct PoolingKernParam : public PoolingKernSizeParam { - void* src_ptr; - void* dst_ptr; + RefPtr src_ptr; + RefPtr dst_ptr; void* workspace_ptr; size_t workspace_size; template const T* src() const { src_type.assert_is_compatible_ctype(); - return static_cast(src_ptr); + return static_cast(src_ptr.get_ptr()); } template T* dst() const { dst_type.assert_is_compatible_ctype(); - return static_cast(dst_ptr); + return 
static_cast(dst_ptr.get_ptr()); } template diff --git a/dnn/src/arm_common/reduce/opr_impl.cpp b/dnn/src/arm_common/reduce/opr_impl.cpp index 815d0628..d67f96a4 100644 --- a/dnn/src/arm_common/reduce/opr_impl.cpp +++ b/dnn/src/arm_common/reduce/opr_impl.cpp @@ -816,8 +816,8 @@ void ReduceImpl::exec( MIDOUT_BEGIN( \ megdnn_arm_common_reduce, ctype, dtype, comp_type, midout_iv(1)) { \ MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \ - reinterpret_cast(src.raw_ptr), \ - reinterpret_cast(dst.raw_ptr), src_type, A, B, C)); \ + reinterpret_cast(src.raw_ptr()), \ + reinterpret_cast(dst.raw_ptr()), src_type, A, B, C)); \ execed = true; \ } \ MIDOUT_END(); \ @@ -828,8 +828,8 @@ void ReduceImpl::exec( MIDOUT_BEGIN( \ megdnn_arm_common_reduce, ctype, dtype, comp_type, midout_iv(1)) { \ MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \ - reinterpret_cast(src.raw_ptr), \ - reinterpret_cast(dst.raw_ptr), src_type, A, B, C)); \ + reinterpret_cast(src.raw_ptr()), \ + reinterpret_cast(dst.raw_ptr()), src_type, A, B, C)); \ execed = true; \ } \ MIDOUT_END(); \ diff --git a/dnn/src/arm_common/resize/direct_nchwxx.cpp b/dnn/src/arm_common/resize/direct_nchwxx.cpp index a575477a..6a212977 100644 --- a/dnn/src/arm_common/resize/direct_nchwxx.cpp +++ b/dnn/src/arm_common/resize/direct_nchwxx.cpp @@ -72,14 +72,14 @@ void resize_direct_nchwxx( void megdnn::arm_common::resize_direct_nearest_nchw44_fp32( const ResizeImpl::KernParam& kern_param) { resize_direct_nchwxx( - kern_param.sptr, kern_param.dptr, kern_param.n * kern_param.c / 4, + kern_param.src(), kern_param.dst(), kern_param.n * kern_param.c / 4, kern_param.ih, kern_param.iw, kern_param.oh, kern_param.ow); } void megdnn::arm_common::resize_direct_linear_nchw44_fp32( const ResizeImpl::KernParam& kern_param) { resize_direct_nchwxx( - kern_param.sptr, kern_param.dptr, kern_param.n * kern_param.c / 4, + kern_param.src(), kern_param.dst(), kern_param.n * kern_param.c / 4, kern_param.ih, kern_param.iw, kern_param.oh, kern_param.ow); } @@ -87,8 +87,8 @@ void megdnn::arm_common::resize_direct_linear_nchw44_fp32( void megdnn::arm_common::resize_direct_nearest_nchw88_fp16( const ResizeImpl::KernParam& kern_param) { - auto sptr = reinterpret_cast(kern_param.sptr); - auto dptr = reinterpret_cast<__fp16*>(kern_param.dptr); + auto sptr = reinterpret_cast(kern_param.sptr.get_ptr()); + auto dptr = reinterpret_cast<__fp16*>(kern_param.dptr.get_ptr()); resize_direct_nchwxx<__fp16, InterpolationMode::INTER_NEAREST>( sptr, dptr, kern_param.n * kern_param.c / 8, kern_param.ih, kern_param.iw, kern_param.oh, kern_param.ow); @@ -96,8 +96,8 @@ void megdnn::arm_common::resize_direct_nearest_nchw88_fp16( void megdnn::arm_common::resize_direct_linear_nchw88_fp16( const ResizeImpl::KernParam& kern_param) { - auto sptr = reinterpret_cast(kern_param.sptr); - auto dptr = reinterpret_cast<__fp16*>(kern_param.dptr); + auto sptr = reinterpret_cast(kern_param.sptr.get_ptr()); + auto dptr = reinterpret_cast<__fp16*>(kern_param.dptr.get_ptr()); resize_direct_nchwxx<__fp16, InterpolationMode::INTER_LINEAR>( sptr, dptr, kern_param.n * kern_param.c / 8, kern_param.ih, kern_param.iw, kern_param.oh, kern_param.ow); diff --git a/dnn/src/arm_common/resize/upsample2_nchw.cpp b/dnn/src/arm_common/resize/upsample2_nchw.cpp index 8f139701..40d26470 100644 --- a/dnn/src/arm_common/resize/upsample2_nchw.cpp +++ b/dnn/src/arm_common/resize/upsample2_nchw.cpp @@ -191,14 +191,14 @@ void nearest_upsample2_nchw( void megdnn::arm_common::resize_linear_upsample2_nchw_fp32( const ResizeImpl::KernParam& kern_param) { 
linear_upsample2_nchw( - kern_param.sptr, kern_param.dptr, kern_param.n * kern_param.c, + kern_param.src(), kern_param.dst(), kern_param.n * kern_param.c, kern_param.ih, kern_param.iw); } void megdnn::arm_common::resize_nearest_upsample2_nchw_fp32( const ResizeImpl::KernParam& kern_param) { nearest_upsample2_nchw( - kern_param.sptr, kern_param.dptr, kern_param.n * kern_param.c, + kern_param.src(), kern_param.dst(), kern_param.n * kern_param.c, kern_param.ih, kern_param.iw); } @@ -206,16 +206,16 @@ void megdnn::arm_common::resize_nearest_upsample2_nchw_fp32( void megdnn::arm_common::resize_linear_upsample2_nchw_fp16( const ResizeImpl::KernParam& kern_param) { - auto sptr = reinterpret_cast(kern_param.sptr); - auto dptr = reinterpret_cast<__fp16*>(kern_param.dptr); + auto sptr = reinterpret_cast(kern_param.sptr.get_ptr()); + auto dptr = reinterpret_cast<__fp16*>(kern_param.dptr.get_ptr()); linear_upsample2_nchw( sptr, dptr, kern_param.n * kern_param.c, kern_param.ih, kern_param.iw); } void megdnn::arm_common::resize_nearest_upsample2_nchw_fp16( const ResizeImpl::KernParam& kern_param) { - auto sptr = reinterpret_cast(kern_param.sptr); - auto dptr = reinterpret_cast<__fp16*>(kern_param.dptr); + auto sptr = reinterpret_cast(kern_param.sptr.get_ptr()); + auto dptr = reinterpret_cast<__fp16*>(kern_param.dptr.get_ptr()); nearest_upsample2_nchw( sptr, dptr, kern_param.n * kern_param.c, kern_param.ih, kern_param.iw); } diff --git a/dnn/src/arm_common/resize/upsample2_nchwxx.cpp b/dnn/src/arm_common/resize/upsample2_nchwxx.cpp index db5b2bc7..59a4b18b 100644 --- a/dnn/src/arm_common/resize/upsample2_nchwxx.cpp +++ b/dnn/src/arm_common/resize/upsample2_nchwxx.cpp @@ -158,14 +158,14 @@ void nearest_upsample2_nchwxx( void megdnn::arm_common::resize_linear_upsample2_nchw44_fp32( const ResizeImpl::KernParam& kern_param) { linear_upsample2_nchwxx( - kern_param.sptr, kern_param.dptr, kern_param.n * kern_param.c / 4, + kern_param.src(), kern_param.dst(), kern_param.n * kern_param.c / 4, kern_param.ih, kern_param.iw); } void megdnn::arm_common::resize_nearest_upsample2_nchw44_fp32( const ResizeImpl::KernParam& kern_param) { nearest_upsample2_nchwxx( - kern_param.sptr, kern_param.dptr, kern_param.n * kern_param.c / 4, + kern_param.src(), kern_param.dst(), kern_param.n * kern_param.c / 4, kern_param.ih, kern_param.iw); } @@ -173,16 +173,16 @@ void megdnn::arm_common::resize_nearest_upsample2_nchw44_fp32( void megdnn::arm_common::resize_linear_upsample2_nchw88_fp16( const ResizeImpl::KernParam& kern_param) { - auto sptr = reinterpret_cast(kern_param.sptr); - auto dptr = reinterpret_cast<__fp16*>(kern_param.dptr); + auto sptr = reinterpret_cast(kern_param.sptr.get_ptr()); + auto dptr = reinterpret_cast<__fp16*>(kern_param.dptr.get_ptr()); linear_upsample2_nchwxx( sptr, dptr, kern_param.n * kern_param.c / 8, kern_param.ih, kern_param.iw); } void megdnn::arm_common::resize_nearest_upsample2_nchw88_fp16( const ResizeImpl::KernParam& kern_param) { - auto sptr = reinterpret_cast(kern_param.sptr); - auto dptr = reinterpret_cast<__fp16*>(kern_param.dptr); + auto sptr = reinterpret_cast(kern_param.sptr.get_ptr()); + auto dptr = reinterpret_cast<__fp16*>(kern_param.dptr.get_ptr()); nearest_upsample2_nchwxx( sptr, dptr, kern_param.n * kern_param.c / 8, kern_param.ih, kern_param.iw); } diff --git a/dnn/src/arm_common/separable_filter/opr_impl.cpp b/dnn/src/arm_common/separable_filter/opr_impl.cpp index 1482cdfd..0e4c5e9c 100644 --- a/dnn/src/arm_common/separable_filter/opr_impl.cpp +++ 
b/dnn/src/arm_common/separable_filter/opr_impl.cpp @@ -78,9 +78,9 @@ void SeparableFilterImpl::separable_filter_exec_8u( megdnn_assert(src.layout.dtype == dtype::Uint8()); Mat kernel_column( - 1, filter_y.layout.shape[3], 1, static_cast(filter_y.raw_ptr)); + 1, filter_y.layout.shape[3], 1, static_cast(filter_y.raw_ptr())); Mat kernel_row( - 1, filter_x.layout.shape[3], 1, static_cast(filter_x.raw_ptr)); + 1, filter_x.layout.shape[3], 1, static_cast(filter_x.raw_ptr())); size_t src_channels = src.layout.shape[3]; @@ -128,9 +128,9 @@ void SeparableFilterImpl::separable_filter_exec( _megdnn_tensor_in src, _megdnn_tensor_in filter_x, _megdnn_tensor_in filter_y, _megdnn_tensor_out dst) { Mat kernel_column( - 1, filter_y.layout.shape[3], 1, static_cast(filter_y.raw_ptr)); + 1, filter_y.layout.shape[3], 1, static_cast(filter_y.raw_ptr())); Mat kernel_row( - 1, filter_x.layout.shape[3], 1, static_cast(filter_x.raw_ptr)); + 1, filter_x.layout.shape[3], 1, static_cast(filter_x.raw_ptr())); size_t src_channels = src.layout.shape[3]; T border_value[4] = {0, 0, 0, 0}; diff --git a/dnn/src/arm_common/type_cvt/opr_impl.cpp b/dnn/src/arm_common/type_cvt/opr_impl.cpp index b6eb5f27..831248c4 100644 --- a/dnn/src/arm_common/type_cvt/opr_impl.cpp +++ b/dnn/src/arm_common/type_cvt/opr_impl.cpp @@ -483,18 +483,18 @@ void TypeCvtImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst) { #undef DISPATCH_QUANTIZED #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#define DISPATCH_FLOAT(_stype_enumv, _stype, _dtype_enumv, _dtype, _midout_iv) \ - if (src_dtype.enumv() == DTypeTrait<_stype_enumv>::enumv && \ - dst_dtype.enumv() == DTypeTrait<_dtype_enumv>::enumv) { \ - MIDOUT_BEGIN(megdnn_arm_typecvt_float, midout_iv(_midout_iv)) { \ - using _TypeCvter = FloatTypeCvter<_stype, _dtype>; \ - MEGDNN_DISPATCH_CPU_KERN_OPR(do_typecvt<_TypeCvter>( \ - reinterpret_cast<_stype*>(src.raw_ptr), \ - reinterpret_cast<_dtype*>(dst.raw_ptr), src_dtype, dst_dtype, \ - nr_elems)); \ - execed = true; \ - } \ - MIDOUT_END(); \ +#define DISPATCH_FLOAT(_stype_enumv, _stype, _dtype_enumv, _dtype, _midout_iv) \ + if (src_dtype.enumv() == DTypeTrait<_stype_enumv>::enumv && \ + dst_dtype.enumv() == DTypeTrait<_dtype_enumv>::enumv) { \ + MIDOUT_BEGIN(megdnn_arm_typecvt_float, midout_iv(_midout_iv)) { \ + using _TypeCvter = FloatTypeCvter<_stype, _dtype>; \ + MEGDNN_DISPATCH_CPU_KERN_OPR(do_typecvt<_TypeCvter>( \ + reinterpret_cast<_stype*>(src.raw_ptr()), \ + reinterpret_cast<_dtype*>(dst.raw_ptr()), src_dtype, dst_dtype, \ + nr_elems)); \ + execed = true; \ + } \ + MIDOUT_END(); \ } DISPATCH_FLOAT(dt_float16, __fp16, float, float, 0); DISPATCH_FLOAT(float, float, dt_float16, __fp16, 1); diff --git a/dnn/src/arm_common/warp_perspective/warp_perspective_cv.cpp b/dnn/src/arm_common/warp_perspective/warp_perspective_cv.cpp index 795f81f8..72b8460b 100644 --- a/dnn/src/arm_common/warp_perspective/warp_perspective_cv.cpp +++ b/dnn/src/arm_common/warp_perspective/warp_perspective_cv.cpp @@ -167,21 +167,17 @@ void megdnn::arm_common::warp_perspective_cv_exec( megdnn_assert( ch == 1 || ch == 3 || ch == 2, "unsupported src channel: %zu, avaiable channel size: 1/2/3", ch); - const float* trans_ptr = trans.ptr(); - const int* midx_ptr = nullptr; - if (mat_idx.raw_ptr) { - megdnn_assert(mat_idx.layout.ndim == 1); - midx_ptr = mat_idx.ptr(); - } if (dst.layout.dtype.enumv() == DTypeEnum::Float32) { #define cb(_imode, _bmode, _ch) \ - auto task = [src, trans_ptr, midx_ptr, dst, border_value, parallelism_batch]( \ + auto task = [src, trans, mat_idx, dst, 
border_value, parallelism_batch]( \ size_t index, size_t) { \ size_t batch_id = index / parallelism_batch; \ size_t task_id = index % parallelism_batch; \ size_t src_id = batch_id; \ - if (midx_ptr) { \ - src_id = midx_ptr[batch_id]; \ + const float* trans_ptr = trans.ptr(); \ + if (mat_idx.raw_ptr()) { \ + megdnn_assert(mat_idx.layout.ndim == 1); \ + src_id = mat_idx.ptr()[batch_id]; \ megdnn_assert( \ src_id < src.layout.shape[0], \ "mat_idx out of bound: mat_idx[%zu]=%zu src_batch=%zu", batch_id, \ @@ -202,13 +198,15 @@ void megdnn::arm_common::warp_perspective_cv_exec( #undef cb } else if (dst.layout.dtype.enumv() == DTypeEnum::Uint8) { #define cb(_imode, _bmode, _ch) \ - auto task = [src, trans_ptr, midx_ptr, dst, border_value, parallelism_batch]( \ + auto task = [src, trans, mat_idx, dst, border_value, parallelism_batch]( \ size_t index, size_t) { \ size_t batch_id = index / parallelism_batch; \ size_t task_id = index % parallelism_batch; \ size_t src_id = batch_id; \ - if (midx_ptr) { \ - src_id = midx_ptr[batch_id]; \ + const float* trans_ptr = trans.ptr(); \ + if (mat_idx.raw_ptr()) { \ + megdnn_assert(mat_idx.layout.ndim == 1); \ + src_id = mat_idx.ptr()[batch_id]; \ megdnn_assert( \ src_id < src.layout.shape[0], \ "mat_idx out of bound: mat_idx[%zu]=%zu src_batch=%zu", batch_id, \ diff --git a/dnn/src/armv7/relayout/opr_impl.cpp b/dnn/src/armv7/relayout/opr_impl.cpp index 38f54d10..ae097db2 100644 --- a/dnn/src/armv7/relayout/opr_impl.cpp +++ b/dnn/src/armv7/relayout/opr_impl.cpp @@ -136,10 +136,10 @@ void armv7::RelayoutForwardImpl::exec( relayout::TransposeParam trans_param; bool trans = relayout::is_transpose(src.layout, dst.layout, trans_param); if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 1) { - auto sptr = static_cast(src.raw_ptr), - dptr = static_cast(dst.raw_ptr); MEGDNN_DISPATCH_CPU_KERN_OPR(transpose_fallback::transpose( - trans_param.batch, trans_param.m, trans_param.n, sptr, dptr)); + trans_param.batch, trans_param.m, trans_param.n, + static_cast(src.raw_ptr()), + static_cast(dst.raw_ptr()))); return; } exec_after_preprocess(src, dst, trans ? 
&trans_param : nullptr); diff --git a/dnn/src/armv7/rotate/opr_impl.cpp b/dnn/src/armv7/rotate/opr_impl.cpp index 4590a2ee..1d76d3d3 100644 --- a/dnn/src/armv7/rotate/opr_impl.cpp +++ b/dnn/src/armv7/rotate/opr_impl.cpp @@ -288,11 +288,13 @@ void RotateImpl::exec( return fallback::RotateImpl::exec(src, dst, workspace); } + auto clockwise = param().clockwise; + MEGDNN_DISPATCH_CPU_KERN_OPR({ for (size_t i = 0; i < src.layout.shape[0]; ++i) { Mat src_mat = TensorND2Mat(src, i); Mat dst_mat = TensorND2Mat(dst, i); - rotate(src_mat, dst_mat, param().clockwise); + rotate(src_mat, dst_mat, clockwise); } }); } diff --git a/dnn/src/atlas/checksum/opr_impl.cpp b/dnn/src/atlas/checksum/opr_impl.cpp index 880336ad..778a8174 100644 --- a/dnn/src/atlas/checksum/opr_impl.cpp +++ b/dnn/src/atlas/checksum/opr_impl.cpp @@ -36,7 +36,7 @@ ChecksumForward::Result ChecksumForwardImpl::exec( megcoreComputingHandle_t comp_handle = handle()->megcore_computing_handle(); megcoreGetDeviceHandle(comp_handle, &dev_handle); megcoreMemcpy( - comp_handle, cpu_data.data(), data.raw_ptr, cpu_data.size(), + comp_handle, cpu_data.data(), data.raw_ptr(), cpu_data.size(), megcoreMemcpyDeviceToHost); megcoreSynchronize(comp_handle); diff --git a/dnn/src/cambricon/checksum/opr_impl.cpp b/dnn/src/cambricon/checksum/opr_impl.cpp index 458eceb5..b16d572f 100644 --- a/dnn/src/cambricon/checksum/opr_impl.cpp +++ b/dnn/src/cambricon/checksum/opr_impl.cpp @@ -62,7 +62,7 @@ ChecksumForward::Result ChecksumForwardImpl::exec( check_exec(data.layout, workspace.size); auto queue = cnrt_queue(handle()); - auto ptr = static_cast(data.raw_ptr); + auto ptr = static_cast(data.raw_ptr()); size_t size_all = data.layout.shape[0], size_ints = size_all / sizeof(uint32_t); auto last_val_size = std::min(size_all, 4); cnrt_check(cnrtMemcpyAsync( @@ -72,7 +72,7 @@ ChecksumForward::Result ChecksumForwardImpl::exec( auto&& device_info = current_device_info(); bang_c_wrapper( reinterpret_cast(workspace.raw_ptr), - static_cast(data.raw_ptr), size_ints, queue, + static_cast(data.raw_ptr()), size_ints, queue, device_info.core_version); cnrt_check(cnrtMemcpyAsync( &result.checksum, workspace.raw_ptr, sizeof(result.checksum), queue, diff --git a/dnn/src/common/concat_split.cpp b/dnn/src/common/concat_split.cpp index 684a5cd5..6e7ce98b 100644 --- a/dnn/src/common/concat_split.cpp +++ b/dnn/src/common/concat_split.cpp @@ -38,10 +38,9 @@ void ConcatSplitBase::check_layout_common( megdnn_assert_eq_size_t(src.ndim, ndim); } // ensure param().axis is correct - auto errmsg = "param().axis=" + std::to_string(param().axis) + - ", ndim=" + std::to_string(ndim); - MEGDNN_MARK_USED_VAR(errmsg); - megdnn_assert(param().axis < static_cast(ndim), "%s", errmsg.c_str()); + megdnn_assert( + param().axis < static_cast(ndim), "param().axis=%u, ndim=%zu", + param().axis, ndim); // ensure shape size for each axis is correct for (size_t i = 0; i < ndim; ++i) { if (i == static_cast(param().axis)) { diff --git a/dnn/src/common/elemwise_multi_type/opr_impl_helper.cpp b/dnn/src/common/elemwise_multi_type/opr_impl_helper.cpp index 74d82743..c4a32d7d 100644 --- a/dnn/src/common/elemwise_multi_type/opr_impl_helper.cpp +++ b/dnn/src/common/elemwise_multi_type/opr_impl_helper.cpp @@ -24,28 +24,24 @@ void ElemwiseMultiTypeImplHelper::exec( _megdnn_in const TensorNDArray& src, _megdnn_tensor_out dst) { switch (m_param.mode) { case Mode::FUSE_MUL_ADD3_INT16x32x32x32: - on_fuse_mul_add3_int16x32x32x32( - make_elemwise_op_param<3>(src, dst), dst.ptr()); + 
on_fuse_mul_add3_int16x32x32x32(make_elemwise_op_param<3>(src, dst), dst); break; case Mode::FUSE_MUL_ADD3_IXxF32xF32xI8: - on_fuse_mul_add3_iXxf32xf32xi8( - make_elemwise_op_param<3>(src, dst), dst.ptr()); + on_fuse_mul_add3_iXxf32xf32xi8(make_elemwise_op_param<3>(src, dst), dst); break; case Mode::ROUND_SHR_SATURATE_IXxI8xI8: - on_round_shr_saturate_iXxi8xi8( - make_elemwise_op_param<2>(src, dst), dst.ptr()); + on_round_shr_saturate_iXxi8xi8(make_elemwise_op_param<2>(src, dst), dst); break; case Mode::FUSE_ADD_RMULH_ROUND_SHR_SATURATE_INT16x16x16x8: on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8( - make_elemwise_op_param<6>(src, dst), dst.ptr()); + make_elemwise_op_param<6>(src, dst), dst); break; case Mode::FUSE_ADD_RMULH_ROUND_SHR_SATURATE_INT32x32x32x8: on_fuse_add_rmulh_round_shr_saturate_int32x32x32x8( - make_elemwise_op_param<6>(src, dst), dst.ptr()); + make_elemwise_op_param<6>(src, dst), dst); break; case Mode::ROUND_SHR_SATURATE_IXxI8xI16: - on_round_shr_saturate_iXxi8xi16( - make_elemwise_op_param<2>(src, dst), dst.ptr()); + on_round_shr_saturate_iXxi8xi16(make_elemwise_op_param<2>(src, dst), dst); break; ON_QUANTIZED_MODE(RELU, 1); ON_QUANTIZED_MODE(ABS, 1); diff --git a/dnn/src/common/elemwise_multi_type/opr_impl_helper.h b/dnn/src/common/elemwise_multi_type/opr_impl_helper.h index 7496ef7c..72e163b8 100644 --- a/dnn/src/common/elemwise_multi_type/opr_impl_helper.h +++ b/dnn/src/common/elemwise_multi_type/opr_impl_helper.h @@ -33,22 +33,22 @@ class ElemwiseMultiTypeImplHelper : public ElemwiseMultiType, protected: virtual void on_fuse_mul_add3_int16x32x32x32( - const ElemwiseOpParamN<3>& param, dt_int32* dst) = 0; + const ElemwiseOpParamN<3>& param, const TensorND& dst) = 0; virtual void on_fuse_mul_add3_iXxf32xf32xi8( - const ElemwiseOpParamN<3>& param, dt_int8* dst) = 0; + const ElemwiseOpParamN<3>& param, const TensorND& dst) = 0; virtual void on_round_shr_saturate_iXxi8xi8( - const ElemwiseOpParamN<2>& param, dt_int8* dst) = 0; + const ElemwiseOpParamN<2>& param, const TensorND& dst) = 0; virtual void on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8( - const ElemwiseOpParamN<6>& param, dt_int8* dst) = 0; + const ElemwiseOpParamN<6>& param, const TensorND& dst) = 0; virtual void on_fuse_add_rmulh_round_shr_saturate_int32x32x32x8( - const ElemwiseOpParamN<6>& param, dt_int8* dst) = 0; + const ElemwiseOpParamN<6>& param, const TensorND& dst) = 0; virtual void on_round_shr_saturate_iXxi8xi16( - const ElemwiseOpParamN<2>& param, dt_int16* dst) = 0; + const ElemwiseOpParamN<2>& param, const TensorND& dst) = 0; virtual void on_quantized_mode( const ElemwiseOpParamN<1>& param, const TensorND& dst, diff --git a/dnn/src/common/local/local_def.inl b/dnn/src/common/local/local_def.inl index 79808b42..324f790d 100644 --- a/dnn/src/common/local/local_def.inl +++ b/dnn/src/common/local/local_def.inl @@ -29,9 +29,9 @@ template void local_xcorr_tpl(const LocalKParam& kparam) MEGDNN_SIMD_ATTRIBUTE_TARGET; template void local_xcorr_tpl(const LocalKParam& kparam) { - const float* src = static_cast(kparam.src); - const float* filter = static_cast(kparam.filter); - float* dst = static_cast(kparam.dst); + const float* src = static_cast(kparam.src.get_ptr()); + const float* filter = static_cast(kparam.filter.get_ptr()); + float* dst = static_cast(kparam.dst.get_ptr()); float* workspace = static_cast(kparam.workspace); const int IC = kparam.ic, IH = kparam.ih, IW = kparam.iw, OH = kparam.oh, OW = kparam.ow, FH = kparam.fh, FW = kparam.fw; @@ -191,9 +191,9 @@ template void 
local_conv_tpl(const LocalKParam& kparam) MEGDNN_SIMD_ATTRIBUTE_TARGET; template void local_conv_tpl(const LocalKParam& kparam) { - const float* src = static_cast(kparam.src); - const float* filter = static_cast(kparam.filter); - float* dst = static_cast(kparam.dst); + const float* src = static_cast(kparam.src.get_ptr()); + const float* filter = static_cast(kparam.filter.get_ptr()); + float* dst = static_cast(kparam.dst.get_ptr()); float* workspace = static_cast(kparam.workspace); const int IC = kparam.ic, IH = kparam.ih, IW = kparam.iw, OH = kparam.oh, OW = kparam.ow, FH = kparam.fh, FW = kparam.fw; diff --git a/dnn/src/common/reduce_helper.h b/dnn/src/common/reduce_helper.h index b72340f6..46fac414 100644 --- a/dnn/src/common/reduce_helper.h +++ b/dnn/src/common/reduce_helper.h @@ -11,9 +11,7 @@ #pragma once #include "megdnn/dtype.h" -#if MEGDNN_CC_HOST #include "megdnn/basic_types.h" -#endif namespace megdnn { namespace reduce { @@ -24,16 +22,14 @@ struct SumOp { const wtype INIT; - src_ctype* src; - dst_ctype* dst; + RefPtr src; + RefPtr dst; const size_t B; - MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; } - MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; } - static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) { - return lhs + rhs; - } - MEGDNN_HOST MEGDNN_DEVICE SumOp(src_ctype* src, dst_ctype* dst, size_t B) + wtype read(uint32_t idx) { return src.ptr()[idx]; } + void write(uint32_t idx, wtype val) { dst.ptr()[idx] = val; } + static wtype apply(wtype lhs, wtype rhs) { return lhs + rhs; } + SumOp(const RefPtr& src, const RefPtr& dst, size_t B) : INIT(wtype(0)), src(src), dst(dst), B(B) {} }; @@ -43,18 +39,16 @@ struct MeanOp { const wtype INIT; - src_ctype* src; - dst_ctype* dst; + RefPtr src; + RefPtr dst; const size_t B; - MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; } - MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { - dst[idx] = val / static_cast(B); - } - static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) { - return lhs + rhs; + wtype read(uint32_t idx) { return src.ptr()[idx]; } + void write(uint32_t idx, wtype val) { + dst.ptr()[idx] = val / static_cast(B); } - MEGDNN_HOST MEGDNN_DEVICE MeanOp(src_ctype* src, dst_ctype* dst, size_t B) + static wtype apply(wtype lhs, wtype rhs) { return lhs + rhs; } + MeanOp(const RefPtr& src, const RefPtr& dst, size_t B) : INIT(wtype(0)), src(src), dst(dst), B(B) {} }; @@ -64,18 +58,17 @@ struct SumSqrOp { const wtype INIT; - src_ctype* src; - dst_ctype* dst; + RefPtr src; + RefPtr dst; const size_t B; - MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { - return static_cast(src[idx]) * static_cast(src[idx]); + wtype read(uint32_t idx) { + return static_cast(src.ptr()[idx]) * + static_cast(src.ptr()[idx]); } - MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; } - static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) { - return lhs + rhs; - } - MEGDNN_HOST MEGDNN_DEVICE SumSqrOp(src_ctype* src, dst_ctype* dst, size_t B) + void write(uint32_t idx, wtype val) { dst.ptr()[idx] = val; } + static wtype apply(wtype lhs, wtype rhs) { return lhs + rhs; } + SumSqrOp(const RefPtr& src, const RefPtr& dst, size_t B) : INIT(wtype(0)), src(src), dst(dst), B(B) {} }; @@ -84,16 +77,14 @@ struct ProdOp { typedef wtype_ wtype; const wtype INIT; - src_ctype* src; - dst_ctype* dst; + RefPtr src; + RefPtr dst; const size_t B; - MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return 
src[idx]; } - MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; } - static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) { - return lhs * rhs; - } - MEGDNN_HOST MEGDNN_DEVICE ProdOp(src_ctype* src, dst_ctype* dst, size_t B) + wtype read(uint32_t idx) { return src.ptr()[idx]; } + void write(uint32_t idx, wtype val) { dst.ptr()[idx] = val; } + static wtype apply(wtype lhs, wtype rhs) { return lhs * rhs; } + ProdOp(const RefPtr& src, const RefPtr& dst, size_t B) : INIT(wtype(1)), src(src), dst(dst), B(B) {} }; @@ -102,20 +93,14 @@ struct MinOp { typedef wtype_ wtype; const wtype INIT; - src_ctype* src; - dst_ctype* dst; + RefPtr src; + RefPtr dst; const size_t B; - MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; } - MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; } - static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) { -#if defined(__CUDA_ARCH__) - return lhs < rhs ? lhs : rhs; -#else - return std::min(lhs, rhs); -#endif - } - MEGDNN_HOST MEGDNN_DEVICE MinOp(src_ctype* src, dst_ctype* dst, size_t B) + wtype read(uint32_t idx) { return src.ptr()[idx]; } + void write(uint32_t idx, wtype val) { dst.ptr()[idx] = val; } + static wtype apply(wtype lhs, wtype rhs) { return std::min(lhs, rhs); } + MinOp(const RefPtr& src, const RefPtr& dst, size_t B) : INIT(wtype(DTypeTrait::max())), src(src), dst(dst), B(B) {} }; @@ -124,20 +109,16 @@ struct MinOp { typedef dt_float32 wtype; const wtype INIT; - src_ctype* src; - dst_ctype* dst; + RefPtr src; + RefPtr dst; const size_t B; - MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; } - MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; } - static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) { -#if defined(__CUDA_ARCH__) - return (isnan(lhs) || lhs < rhs) ? lhs : rhs; -#else + wtype read(uint32_t idx) { return src.ptr()[idx]; } + void write(uint32_t idx, wtype val) { dst.ptr()[idx] = val; } + static wtype apply(wtype lhs, wtype rhs) { return (std::isnan(lhs) || lhs < rhs) ? lhs : rhs; -#endif } - MEGDNN_HOST MEGDNN_DEVICE MinOp(src_ctype* src, dst_ctype* dst, size_t B) + MinOp(const RefPtr& src, const RefPtr& dst, size_t B) : INIT(wtype(DTypeTrait::max())), src(src), dst(dst), B(B) {} }; @@ -146,20 +127,14 @@ struct MaxOp { typedef wtype_ wtype; const wtype INIT; - src_ctype* src; - dst_ctype* dst; + RefPtr src; + RefPtr dst; const size_t B; - MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; } - MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; } - static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) { -#if defined(__CUDA_ARCH__) - return lhs > rhs ? 
lhs : rhs; -#else - return std::max(lhs, rhs); -#endif - } - MEGDNN_HOST MEGDNN_DEVICE MaxOp(src_ctype* src, dst_ctype* dst, size_t B) + wtype read(uint32_t idx) { return src.ptr()[idx]; } + void write(uint32_t idx, wtype val) { dst.ptr()[idx] = val; } + static wtype apply(wtype lhs, wtype rhs) { return std::max(lhs, rhs); } + MaxOp(const RefPtr& src, const RefPtr& dst, size_t B) : INIT(wtype(DTypeTrait::min())), src(src), dst(dst), B(B) {} }; @@ -168,20 +143,16 @@ struct MaxOp { typedef dt_float32 wtype; const wtype INIT; - src_ctype* src; - dst_ctype* dst; + RefPtr src; + RefPtr dst; const size_t B; - MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; } - MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; } - static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) { -#if defined(__CUDA_ARCH__) - return (isnan(lhs) || lhs > rhs) ? lhs : rhs; -#else + wtype read(uint32_t idx) { return src.ptr()[idx]; } + void write(uint32_t idx, wtype val) { dst.ptr()[idx] = val; } + static wtype apply(wtype lhs, wtype rhs) { return (std::isnan(lhs) || lhs > rhs) ? lhs : rhs; -#endif } - MEGDNN_HOST MEGDNN_DEVICE MaxOp(src_ctype* src, dst_ctype* dst, size_t B) + MaxOp(const RefPtr& src, const RefPtr& dst, size_t B) : INIT(wtype(DTypeTrait::min())), src(src), dst(dst), B(B) {} }; @@ -190,28 +161,19 @@ struct CheckNonFiniteOp { typedef wtype_ wtype; const wtype INIT; - src_ctype* src; - dst_ctype* dst; + RefPtr src; + RefPtr dst; const size_t B; - MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { -#if defined(__CUDA_ARCH__) - return !isfinite(src[idx]); -#else - return !std::isfinite(src[idx]); -#endif - } - MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; } - static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) { - return lhs | rhs; - } - MEGDNN_HOST MEGDNN_DEVICE CheckNonFiniteOp(src_ctype* src, dst_ctype* dst, size_t B) + wtype read(uint32_t idx) { return !std::isfinite(src.ptr()[idx]); } + void write(uint32_t idx, wtype val) { dst.ptr()[idx] = val; } + static wtype apply(wtype lhs, wtype rhs) { return lhs | rhs; } + MEGDNN_HOST MEGDNN_DEVICE + CheckNonFiniteOp(const RefPtr& src, const RefPtr& dst, size_t B) : INIT(wtype(0)), src(src), dst(dst), B(B) {} }; -#if MEGDNN_CC_HOST void get_ABC(const TensorShape& shape, size_t& A, size_t& B, size_t& C, size_t axis); -#endif } // namespace reduce } // namespace megdnn diff --git a/dnn/src/common/reduce_helper_device.h b/dnn/src/common/reduce_helper_device.h new file mode 100644 index 00000000..31ceb194 --- /dev/null +++ b/dnn/src/common/reduce_helper_device.h @@ -0,0 +1,222 @@ +/** + * \file dnn/src/common/reduce_helper_device.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#pragma once +#include "megdnn/dtype.h" + +#if MEGDNN_CC_HOST +#include "megdnn/basic_types.h" +#endif + +namespace megdnn { +namespace device_reduce { + +template +struct SumOp { + typedef wtype_ wtype; + + const wtype INIT; + + src_ctype* src; + dst_ctype* dst; + const size_t B; + + MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; } + MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; } + static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) { + return lhs + rhs; + } + MEGDNN_HOST MEGDNN_DEVICE SumOp(src_ctype* src, dst_ctype* dst, size_t B) + : INIT(wtype(0)), src(src), dst(dst), B(B) {} +}; + +template +struct MeanOp { + typedef wtype_ wtype; + + const wtype INIT; + + src_ctype* src; + dst_ctype* dst; + const size_t B; + + MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; } + MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { + dst[idx] = val / static_cast(B); + } + static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) { + return lhs + rhs; + } + MEGDNN_HOST MEGDNN_DEVICE MeanOp(src_ctype* src, dst_ctype* dst, size_t B) + : INIT(wtype(0)), src(src), dst(dst), B(B) {} +}; + +template +struct SumSqrOp { + typedef wtype_ wtype; + + const wtype INIT; + + src_ctype* src; + dst_ctype* dst; + const size_t B; + + MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { + return static_cast(src[idx]) * static_cast(src[idx]); + } + MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; } + static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) { + return lhs + rhs; + } + MEGDNN_HOST MEGDNN_DEVICE SumSqrOp(src_ctype* src, dst_ctype* dst, size_t B) + : INIT(wtype(0)), src(src), dst(dst), B(B) {} +}; + +template +struct ProdOp { + typedef wtype_ wtype; + const wtype INIT; + + src_ctype* src; + dst_ctype* dst; + const size_t B; + + MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; } + MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; } + static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) { + return lhs * rhs; + } + MEGDNN_HOST MEGDNN_DEVICE ProdOp(src_ctype* src, dst_ctype* dst, size_t B) + : INIT(wtype(1)), src(src), dst(dst), B(B) {} +}; + +template +struct MinOp { + typedef wtype_ wtype; + const wtype INIT; + + src_ctype* src; + dst_ctype* dst; + const size_t B; + + MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; } + MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; } + static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) { +#if defined(__CUDA_ARCH__) + return lhs < rhs ? lhs : rhs; +#else + return std::min(lhs, rhs); +#endif + } + MEGDNN_HOST MEGDNN_DEVICE MinOp(src_ctype* src, dst_ctype* dst, size_t B) + : INIT(wtype(DTypeTrait::max())), src(src), dst(dst), B(B) {} +}; + +template +struct MinOp { + typedef dt_float32 wtype; + const wtype INIT; + + src_ctype* src; + dst_ctype* dst; + const size_t B; + + MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; } + MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; } + static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) { +#if defined(__CUDA_ARCH__) + return (isnan(lhs) || lhs < rhs) ? lhs : rhs; +#else + return (std::isnan(lhs) || lhs < rhs) ? 
lhs : rhs; +#endif + } + MEGDNN_HOST MEGDNN_DEVICE MinOp(src_ctype* src, dst_ctype* dst, size_t B) + : INIT(wtype(DTypeTrait::max())), src(src), dst(dst), B(B) {} +}; + +template +struct MaxOp { + typedef wtype_ wtype; + const wtype INIT; + + src_ctype* src; + dst_ctype* dst; + const size_t B; + + MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; } + MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; } + static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) { +#if defined(__CUDA_ARCH__) + return lhs > rhs ? lhs : rhs; +#else + return std::max(lhs, rhs); +#endif + } + MEGDNN_HOST MEGDNN_DEVICE MaxOp(src_ctype* src, dst_ctype* dst, size_t B) + : INIT(wtype(DTypeTrait::min())), src(src), dst(dst), B(B) {} +}; + +template +struct MaxOp { + typedef dt_float32 wtype; + const wtype INIT; + + src_ctype* src; + dst_ctype* dst; + const size_t B; + + MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; } + MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; } + static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) { +#if defined(__CUDA_ARCH__) + return (isnan(lhs) || lhs > rhs) ? lhs : rhs; +#else + return (std::isnan(lhs) || lhs > rhs) ? lhs : rhs; +#endif + } + MEGDNN_HOST MEGDNN_DEVICE MaxOp(src_ctype* src, dst_ctype* dst, size_t B) + : INIT(wtype(DTypeTrait::min())), src(src), dst(dst), B(B) {} +}; + +template +struct CheckNonFiniteOp { + typedef wtype_ wtype; + const wtype INIT; + + src_ctype* src; + dst_ctype* dst; + const size_t B; + + MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { +#if defined(__CUDA_ARCH__) + return !isfinite(src[idx]); +#else + return !std::isfinite(src[idx]); +#endif + } + MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; } + static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) { + return lhs | rhs; + } + MEGDNN_HOST MEGDNN_DEVICE CheckNonFiniteOp(src_ctype* src, dst_ctype* dst, size_t B) + : INIT(wtype(0)), src(src), dst(dst), B(B) {} +}; + +} // namespace device_reduce + +namespace reduce { +#if MEGDNN_CC_HOST +void get_ABC(const TensorShape& shape, size_t& A, size_t& B, size_t& C, size_t axis); +#endif +} // namespace reduce + +} // namespace megdnn + +// vim: syntax=cpp.doxygen \ No newline at end of file diff --git a/dnn/src/common/utils.h b/dnn/src/common/utils.h index b47f3df0..8f6dce9b 100644 --- a/dnn/src/common/utils.h +++ b/dnn/src/common/utils.h @@ -362,6 +362,10 @@ static inline void copy_plane_in_bytes( megcoreDeviceHandle_t get_device_handle(Handle* handle); +static inline void incr_refp(RefPtr& ptr, ptrdiff_t delta) { + ptr += (size_t)delta; +} + static inline void incr_voidp(void*& ptr, ptrdiff_t delta) { ptr = reinterpret_cast(reinterpret_cast(ptr) + delta); } @@ -674,7 +678,8 @@ struct CompTypeCvter { comp.layout.dtype.enumv() != DTypeTrait::enumv) { comp.layout.dtype = CompType(); comp.layout.init_contiguous_stride(); - comp.raw_ptr = m_workspace_bundle->get(m_workspace_idx++); + comp = TensorND{ + m_workspace_bundle->get(m_workspace_idx++), comp.layout}; if (src.layout.ndim) { m_cvt_opr->exec(src, comp); } @@ -699,7 +704,7 @@ struct CompTypeCvter { * \brief get TensorND raw_ptr+low_byte pointer. */ inline dt_byte* get_low_ptr(const TensorND* tensor) { - return static_cast(tensor->raw_ptr) + tensor->layout.span().low_byte; + return static_cast(tensor->raw_ptr()) + tensor->layout.span().low_byte; } /*! 
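
The hunks above all apply one refactor: TensorND's public raw_ptr member becomes a raw_ptr() accessor backed by a rebindable RefPtr, so code that used to overwrite or offset the raw member now rebinds the reference (reset_ptr, get_ref_ptr, incr_refp) or rebuilds the TensorND from a workspace pointer plus layout, and kernels read the address at execution time rather than capturing it up front (the same motivation behind hoisting param().clockwise out of MEGDNN_DISPATCH_CPU_KERN_OPR). The sketch below is a minimal, self-contained illustration of that ownership model under assumed names; RefPtrLike and TensorLike are illustrative stand-ins, not the actual MegDNN types.

    #include <cstddef>
    #include <memory>

    // Illustrative stand-in for a shared, late-bound pointer: the stored address
    // can be rebound after copies of the holder were made, and every copy sees
    // the final address when it is eventually dereferenced.
    struct RefPtrLike {
        std::shared_ptr<void*> addr{std::make_shared<void*>(nullptr)};
        void* get_ptr() const { return *addr; }
        void reset(void* p) { *addr = p; }
        RefPtrLike& operator+=(size_t delta) {
            *addr = static_cast<char*>(*addr) + delta;
            return *this;
        }
    };

    // Illustrative tensor holder: raw_ptr() resolves the address at call time,
    // which is why the hunks move raw-pointer reads into the dispatched kernels
    // and replace direct member assignment with rebinding the reference.
    struct TensorLike {
        RefPtrLike ref;
        void* raw_ptr() const { return ref.get_ptr(); }
        RefPtrLike& get_ref_ptr() { return ref; }
    };

    // Usage pattern mirrored by the hunks (hypothetical variables):
    //   t.get_ref_ptr().reset(workspace_ptr);  // was: t.raw_ptr = workspace_ptr;
    //   t.get_ref_ptr() += byte_offset;        // was: incr_voidp(t.raw_ptr, byte_offset);
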
diff --git a/dnn/src/cuda/argmxx/opr_impl.cpp b/dnn/src/cuda/argmxx/opr_impl.cpp index 78337028..4decd178 100644 --- a/dnn/src/cuda/argmxx/opr_impl.cpp +++ b/dnn/src/cuda/argmxx/opr_impl.cpp @@ -11,7 +11,7 @@ #include "src/cuda/argmxx/opr_impl.h" #include "src/common/argmxx_helper.h" -#include "src/common/reduce_helper.h" +#include "src/common/reduce_helper_device.h" #include "src/cuda/reduce_helper.cuh" #include "src/cuda/utils.h" diff --git a/dnn/src/cuda/batch_normalization/opr_impl.cpp b/dnn/src/cuda/batch_normalization/opr_impl.cpp index 999b9896..8deac1a1 100644 --- a/dnn/src/cuda/batch_normalization/opr_impl.cpp +++ b/dnn/src/cuda/batch_normalization/opr_impl.cpp @@ -117,32 +117,34 @@ void BNForwardImpl::exec( #if CUDNN_VERSION >= 7410 cudnn_check(cudnnBatchNormalizationForwardTrainingEx( handle, tensor_desc.bn_mode, CUDNN_BATCHNORM_OPS_BN, &alpha, - &beta, // one & zero - tensor_desc.xy_desc.desc, src.raw_ptr, // xDesc & x - nullptr, nullptr, // zDesc & z - tensor_desc.xy_desc.desc, dst.raw_ptr, // yDesc & y - tensor_desc.param_desc.desc, // bnScaleBiasMeanVarDesc - bn_scale.raw_ptr, bn_bias.raw_ptr, m_param.avg_factor, mean.raw_ptr, - variance.raw_ptr, m_param.epsilon, batch_mean.raw_ptr, - batch_inv_variance.raw_ptr, nullptr, workspace.raw_ptr, - workspace.size, reserve.raw_ptr, reserve.layout.access_bytes())); + &beta, // one & zero + tensor_desc.xy_desc.desc, src.raw_ptr(), // xDesc & x + nullptr, nullptr, // zDesc & z + tensor_desc.xy_desc.desc, dst.raw_ptr(), // yDesc & y + tensor_desc.param_desc.desc, // bnScaleBiasMeanVarDesc + bn_scale.raw_ptr(), bn_bias.raw_ptr(), m_param.avg_factor, + mean.raw_ptr(), variance.raw_ptr(), m_param.epsilon, + batch_mean.raw_ptr(), batch_inv_variance.raw_ptr(), nullptr, + workspace.raw_ptr, workspace.size, reserve.raw_ptr(), + reserve.layout.access_bytes())); #else cudnn_check(cudnnBatchNormalizationForwardTraining( handle, tensor_desc.bn_mode, &alpha, &beta, - tensor_desc.xy_desc.desc, src.raw_ptr, // xDesc & x - tensor_desc.xy_desc.desc, dst.raw_ptr, // yDesc & y - tensor_desc.param_desc.desc, // bnScaleBiasMeanVarDesc - bn_scale.raw_ptr, bn_bias.raw_ptr, m_param.avg_factor, mean.raw_ptr, - variance.raw_ptr, m_param.epsilon, batch_mean.raw_ptr, - batch_inv_variance.raw_ptr)); + tensor_desc.xy_desc.desc, src.raw_ptr(), // xDesc & x + tensor_desc.xy_desc.desc, dst.raw_ptr(), // yDesc & y + tensor_desc.param_desc.desc, // bnScaleBiasMeanVarDesc + bn_scale.raw_ptr(), bn_bias.raw_ptr(), m_param.avg_factor, + mean.raw_ptr(), variance.raw_ptr(), m_param.epsilon, + batch_mean.raw_ptr(), batch_inv_variance.raw_ptr())); #endif // CUDNN_VERSION >= 7410 break; case param::BN::FwdMode::INFERENCE: cudnn_check(cudnnBatchNormalizationForwardInference( handle, tensor_desc.bn_mode, &alpha, &beta, - tensor_desc.xy_desc.desc, src.raw_ptr, tensor_desc.xy_desc.desc, - dst.raw_ptr, tensor_desc.param_desc.desc, bn_scale.raw_ptr, - bn_bias.raw_ptr, mean.raw_ptr, variance.raw_ptr, m_param.epsilon)); + tensor_desc.xy_desc.desc, src.raw_ptr(), tensor_desc.xy_desc.desc, + dst.raw_ptr(), tensor_desc.param_desc.desc, bn_scale.raw_ptr(), + bn_bias.raw_ptr(), mean.raw_ptr(), variance.raw_ptr(), + m_param.epsilon)); break; default: megdnn_throw("Unknown forward mode type of batch normalization."); @@ -198,27 +200,27 @@ void BNBackwardImpl::exec( cudnn_check(cudnnBatchNormalizationBackwardEx( handle, tensor_desc.bn_mode, CUDNN_BATCHNORM_OPS_BN, &alpha, &beta, &alpha, &beta, tensor_desc.xy_desc.desc, - x.raw_ptr, // xDesc & x - nullptr, nullptr, // yDesc & y - 
tensor_desc.xy_desc.desc, dy.raw_ptr, // dyDesc & dy - nullptr, nullptr, // dzDesc & dz - tensor_desc.xy_desc.desc, dx.raw_ptr, // dxDesc & dx - tensor_desc.param_desc.desc, bn_scale.raw_ptr, // bnScale - nullptr, // bnBias - d_bn_scale.raw_ptr, d_bn_bias.raw_ptr, // dScale, dBias - m_param.epsilon, saved_batch_mean.raw_ptr, saved_batch_inv_variance.raw_ptr, - nullptr, workspace.raw_ptr, workspace.size, reserve.raw_ptr, - reserve.layout.access_bytes())); + x.raw_ptr(), // xDesc & x + nullptr, nullptr, // yDesc & y + tensor_desc.xy_desc.desc, dy.raw_ptr(), // dyDesc & dy + nullptr, nullptr, // dzDesc & dz + tensor_desc.xy_desc.desc, dx.raw_ptr(), // dxDesc & dx + tensor_desc.param_desc.desc, bn_scale.raw_ptr(), // bnScale + nullptr, // bnBias + d_bn_scale.raw_ptr(), d_bn_bias.raw_ptr(), // dScale, dBias + m_param.epsilon, saved_batch_mean.raw_ptr(), + saved_batch_inv_variance.raw_ptr(), nullptr, workspace.raw_ptr, + workspace.size, reserve.raw_ptr(), reserve.layout.access_bytes())); #else cudnn_check(cudnnBatchNormalizationBackward( handle, tensor_desc.bn_mode, &alpha, &beta, &alpha, &beta, - tensor_desc.xy_desc.desc, x.raw_ptr, // xDesc & x - tensor_desc.xy_desc.desc, dy.raw_ptr, // dyDesc & dy - tensor_desc.xy_desc.desc, dx.raw_ptr, // dxDesc & dx - tensor_desc.param_desc.desc, bn_scale.raw_ptr, // bnScale - d_bn_scale.raw_ptr, d_bn_bias.raw_ptr, // dScale, dBias - m_param.epsilon, saved_batch_mean.raw_ptr, - saved_batch_inv_variance.raw_ptr)); + tensor_desc.xy_desc.desc, x.raw_ptr(), // xDesc & x + tensor_desc.xy_desc.desc, dy.raw_ptr(), // dyDesc & dy + tensor_desc.xy_desc.desc, dx.raw_ptr(), // dxDesc & dx + tensor_desc.param_desc.desc, bn_scale.raw_ptr(), // bnScale + d_bn_scale.raw_ptr(), d_bn_bias.raw_ptr(), // dScale, dBias + m_param.epsilon, saved_batch_mean.raw_ptr(), + saved_batch_inv_variance.raw_ptr())); #endif } diff --git a/dnn/src/cuda/batched_matrix_mul/brute_force.cpp b/dnn/src/cuda/batched_matrix_mul/brute_force.cpp index 5f6e163f..eddcf10a 100644 --- a/dnn/src/cuda/batched_matrix_mul/brute_force.cpp +++ b/dnn/src/cuda/batched_matrix_mul/brute_force.cpp @@ -80,9 +80,9 @@ void BatchedMatrixMulForwardImpl::AlgoBruteForce::exec(const ExecArgs& args) con rep(n, N) { TensorND A_, B_, C_; auto tensor_n_from_batch = [n](const TensorND& in, TensorND& out) { - out.raw_ptr = static_cast( - static_cast(in.raw_ptr) + - n * in.layout.stride[0] * in.layout.dtype.size()); + out.reset_ptr(static_cast( + static_cast(in.raw_ptr()) + + n * in.layout.stride[0] * in.layout.dtype.size())); out.layout = in.layout.remove_axis(0); }; tensor_n_from_batch(args.tensor_a, A_); diff --git a/dnn/src/cuda/batched_matrix_mul/cublas.cpp b/dnn/src/cuda/batched_matrix_mul/cublas.cpp index 7a1a1e43..43885826 100644 --- a/dnn/src/cuda/batched_matrix_mul/cublas.cpp +++ b/dnn/src/cuda/batched_matrix_mul/cublas.cpp @@ -76,13 +76,13 @@ void BatchedMatrixMulForwardImpl::AlgoCublas::exec(const ExecArgs& args) const { static_cast(workspace.raw_ptr + 2 * batch * sizeof(uintptr_t))); arange( - As, reinterpret_cast(args.tensor_a.raw_ptr), + As, reinterpret_cast(args.tensor_a.raw_ptr()), args.layout_a.stride[0] * dtype.size(), batch, stream); arange( - Bs, reinterpret_cast(args.tensor_b.raw_ptr), + Bs, reinterpret_cast(args.tensor_b.raw_ptr()), args.layout_b.stride[0] * dtype.size(), batch, stream); arange( - Cs, reinterpret_cast(args.tensor_c.raw_ptr), + Cs, reinterpret_cast(args.tensor_c.raw_ptr()), args.layout_c.stride[0] * dtype.size(), batch, stream); auto io32_c32 = [&]() { diff --git 
a/dnn/src/cuda/batched_matrix_mul/cublas_lt.cpp b/dnn/src/cuda/batched_matrix_mul/cublas_lt.cpp index b7a56da1..c30647cd 100644 --- a/dnn/src/cuda/batched_matrix_mul/cublas_lt.cpp +++ b/dnn/src/cuda/batched_matrix_mul/cublas_lt.cpp @@ -62,10 +62,10 @@ void BatchedMatrixMulForwardImpl::AlgoCublasLt::exec(const ExecArgs& args) const "workspace bundle size should be 1(ws_algo)"); cublas_check(cublasLtMatmul( cublasLt_handle, desc.matmul_desc, one_half, - static_cast(args.tensor_b.raw_ptr), desc.layout_b, - static_cast(args.tensor_a.raw_ptr), desc.layout_a, - zero_half, static_cast(args.tensor_c.raw_ptr), - desc.layout_c, static_cast<__half*>(args.tensor_c.raw_ptr), + static_cast(args.tensor_b.raw_ptr()), desc.layout_b, + static_cast(args.tensor_a.raw_ptr()), desc.layout_a, + zero_half, static_cast(args.tensor_c.raw_ptr()), + desc.layout_c, static_cast<__half*>(args.tensor_c.raw_ptr()), desc.layout_c, &algo, ws_bundle.get(0), ws_bundle.get_size(0), stream)); }; auto batched_sgemm = [&]() { @@ -77,7 +77,7 @@ void BatchedMatrixMulForwardImpl::AlgoCublasLt::exec(const ExecArgs& args) const auto dev_a = (desc.dt_a == CUDA_R_16F) ? static_cast(args.tensor_a.ptr()) : static_cast(args.tensor_a.ptr()); - auto dev_c = static_cast(args.tensor_c.raw_ptr); + auto dev_c = static_cast(args.tensor_c.raw_ptr()); megdnn_assert( ws_bundle.nr_workspace() == 1, "workspace bundle size should be 1(ws_algo)"); @@ -104,14 +104,14 @@ void BatchedMatrixMulForwardImpl::AlgoCublasLt::exec(const ExecArgs& args) const transform_desc, CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE, &pm, sizeof(pm))); cublas_check(cublasLtMatrixTransform( - cublasLt_handle, transform_desc, one, args.tensor_b.raw_ptr, + cublasLt_handle, transform_desc, one, args.tensor_b.raw_ptr(), desc.layout_b, zero, nullptr, nullptr, ws_b, desc.layout_trans_b, stream)); cublas_check(cublasLtMatrixTransformDescSetAttribute( transform_desc, CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA, &trans_a, sizeof(trans_a))); cublas_check(cublasLtMatrixTransform( - cublasLt_handle, transform_desc, one, args.tensor_a.raw_ptr, + cublasLt_handle, transform_desc, one, args.tensor_a.raw_ptr(), desc.layout_a, zero, nullptr, nullptr, ws_a, desc.layout_trans_a, stream)); cublas_check(cublasLtMatmul( @@ -124,7 +124,7 @@ void BatchedMatrixMulForwardImpl::AlgoCublasLt::exec(const ExecArgs& args) const sizeof(trans_c))); cublas_check(cublasLtMatrixTransform( cublasLt_handle, transform_desc, one, ws_c, desc.layout_trans_c, zero, - nullptr, nullptr, args.tensor_c.raw_ptr, desc.layout_c, stream)); + nullptr, nullptr, args.tensor_c.raw_ptr(), desc.layout_c, stream)); cublas_check(cublasLtMatrixTransformDescDestroy(transform_desc)); }; diff --git a/dnn/src/cuda/check_non_finite/kern.cu b/dnn/src/cuda/check_non_finite/kern.cu index 6692e08d..5cd9f1d8 100644 --- a/dnn/src/cuda/check_non_finite/kern.cu +++ b/dnn/src/cuda/check_non_finite/kern.cu @@ -8,7 +8,7 @@ * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
*/ -#include "src/common/reduce_helper.h" +#include "src/common/reduce_helper_device.h" #include "megdnn/dtype.h" #include "src/cuda/reduce_helper.cuh" @@ -18,7 +18,9 @@ namespace cuda { #define COMMA , -INST_REDUCE(reduce::CheckNonFiniteOp, false); +INST_REDUCE( + device_reduce::CheckNonFiniteOp, + false); #undef COMMA } // namespace cuda diff --git a/dnn/src/cuda/check_non_finite/opr_impl.cpp b/dnn/src/cuda/check_non_finite/opr_impl.cpp index 47214983..94657921 100644 --- a/dnn/src/cuda/check_non_finite/opr_impl.cpp +++ b/dnn/src/cuda/check_non_finite/opr_impl.cpp @@ -15,12 +15,12 @@ #include "src/cuda/handle.h" #include "src/cuda/utils.h" -#include "src/common/reduce_helper.h" +#include "src/common/reduce_helper_device.h" namespace megdnn { namespace cuda { -using reduce::CheckNonFiniteOp; +using device_reduce::CheckNonFiniteOp; size_t CheckNonFiniteImpl::get_workspace_in_bytes( const TensorLayout& src, const TensorLayout& dst) { diff --git a/dnn/src/cuda/checksum/opr_impl.cpp b/dnn/src/cuda/checksum/opr_impl.cpp index 63926bea..1eb29a02 100644 --- a/dnn/src/cuda/checksum/opr_impl.cpp +++ b/dnn/src/cuda/checksum/opr_impl.cpp @@ -45,7 +45,7 @@ ChecksumForward::Result ChecksumForwardImpl::exec( check_exec(data.layout, workspace.size); auto stream = cuda_stream(handle()); - auto ptr = static_cast(data.raw_ptr); + auto ptr = static_cast(data.raw_ptr()); size_t size_all = data.layout.shape[0], size_ints = size_all / sizeof(uint32_t); auto last_val_size = std::min(size_all, 4); cuda_check(cudaMemcpyAsync( @@ -54,7 +54,7 @@ ChecksumForward::Result ChecksumForwardImpl::exec( if (size_ints) { checksum::calc( static_cast(wbundle.get(1)), - static_cast(data.raw_ptr), + static_cast(data.raw_ptr()), static_cast(wbundle.get(0)), size_ints, stream); cuda_check(cudaMemcpyAsync( &result.checksum, wbundle.get(1), sizeof(result.checksum), diff --git a/dnn/src/cuda/conv_bias/batched_matmul.cpp b/dnn/src/cuda/conv_bias/batched_matmul.cpp index 5ca12154..46a88fe0 100644 --- a/dnn/src/cuda/conv_bias/batched_matmul.cpp +++ b/dnn/src/cuda/conv_bias/batched_matmul.cpp @@ -135,9 +135,9 @@ size_t ConvBiasForwardImpl::AlgoBatchedMatmul::get_workspace_in_bytes( void ConvBiasForwardImpl::AlgoBatchedMatmul::exec(const ExecArgs& args) const { auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args); - auto conv_dst_tensor = *args.dst_tensor; + TensorND conv_dst_tensor = *args.dst_tensor; if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { - conv_dst_tensor.raw_ptr = bundle.get(1); + conv_dst_tensor = TensorND{bundle.get(1), args.dst_tensor->layout}; conv_dst_tensor.layout.dtype = DType(); args.opr->check_or_deduce_dtype_fwd( args.src_layout->dtype, args.filter_layout->dtype, @@ -150,9 +150,9 @@ void ConvBiasForwardImpl::AlgoBatchedMatmul::exec(const ExecArgs& args) const { { auto config = prepare_sub_opr(args); - TensorND A{args.filter_tensor->raw_ptr, config.first[0]}, - B{args.src_tensor->raw_ptr, config.first[1]}, - C{args.dst_tensor->raw_ptr, config.first[2]}; + TensorND A{args.filter_tensor->raw_ptr(), config.first[0]}, + B{args.src_tensor->raw_ptr(), config.first[1]}, + C{args.dst_tensor->raw_ptr(), config.first[2]}; config.second->exec(A, B, C, bundle.get_workspace(0)); } handle_bias_and_nonlinear( diff --git a/dnn/src/cuda/conv_bias/chanwise.cpp b/dnn/src/cuda/conv_bias/chanwise.cpp index a2ee6d5c..c20a2645 100644 --- a/dnn/src/cuda/conv_bias/chanwise.cpp +++ b/dnn/src/cuda/conv_bias/chanwise.cpp @@ -52,9 +52,9 @@ size_t ConvBiasForwardImpl::AlgoChanwise::get_workspace_in_bytes( 
void ConvBiasForwardImpl::AlgoChanwise::exec(const ExecArgs& args) const { WorkspaceBundle bundle{args.workspace.raw_ptr, {get_workspace_in_bytes(args)}}; - auto conv_dst_tensor = *args.dst_tensor; + TensorND conv_dst_tensor = *args.dst_tensor; if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { - conv_dst_tensor.raw_ptr = bundle.get(0); + conv_dst_tensor = TensorND{bundle.get(0), args.dst_tensor->layout}; conv_dst_tensor.layout.dtype = DType(); args.opr->check_or_deduce_dtype_fwd( args.src_layout->dtype, args.filter_layout->dtype, @@ -74,9 +74,9 @@ void ConvBiasForwardImpl::AlgoChanwise::exec(const ExecArgs& args) const { #if CUDA_VERSION >= 9000 if (is_compute_capability_required(5, 3)) { chanwise::run_fwd( - static_cast(conv_dst_tensor.raw_ptr), - static_cast(args.src_tensor->raw_ptr), - static_cast(args.filter_tensor->raw_ptr), kparam, + static_cast(conv_dst_tensor.raw_ptr()), + static_cast(args.src_tensor->raw_ptr()), + static_cast(args.filter_tensor->raw_ptr()), kparam, stream); } else { chanwise::run_fwd( diff --git a/dnn/src/cuda/conv_bias/chanwise_8x8x32.cpp b/dnn/src/cuda/conv_bias/chanwise_8x8x32.cpp index 5e5560b7..81da20f2 100644 --- a/dnn/src/cuda/conv_bias/chanwise_8x8x32.cpp +++ b/dnn/src/cuda/conv_bias/chanwise_8x8x32.cpp @@ -50,9 +50,9 @@ size_t ConvBiasForwardImpl::AlgoChanwise8x8x32::get_workspace_in_bytes( void ConvBiasForwardImpl::AlgoChanwise8x8x32::exec(const ExecArgs& args) const { WorkspaceBundle bundle{args.workspace.raw_ptr, {get_workspace_in_bytes(args)}}; - auto conv_dst_tensor = *args.dst_tensor; + TensorND conv_dst_tensor = *args.dst_tensor; if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { - conv_dst_tensor.raw_ptr = bundle.get(0); + conv_dst_tensor = TensorND{bundle.get(0), args.dst_tensor->layout}; conv_dst_tensor.layout.dtype = DType(); args.opr->check_or_deduce_dtype_fwd( args.src_layout->dtype, args.filter_layout->dtype, diff --git a/dnn/src/cuda/conv_bias/chanwise_small.cpp b/dnn/src/cuda/conv_bias/chanwise_small.cpp index b2097ca9..7cd01cd0 100644 --- a/dnn/src/cuda/conv_bias/chanwise_small.cpp +++ b/dnn/src/cuda/conv_bias/chanwise_small.cpp @@ -65,9 +65,9 @@ size_t ConvBiasForwardImpl::AlgoChanwiseSmall::get_workspace_in_bytes( void ConvBiasForwardImpl::AlgoChanwiseSmall::exec(const ExecArgs& args) const { WorkspaceBundle bundle{args.workspace.raw_ptr, {get_workspace_in_bytes(args)}}; - auto conv_dst_tensor = *args.dst_tensor; + TensorND conv_dst_tensor = *args.dst_tensor; if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { - conv_dst_tensor.raw_ptr = bundle.get(0); + conv_dst_tensor = TensorND{bundle.get(0), conv_dst_tensor.layout}; conv_dst_tensor.layout.dtype = DType(); args.opr->check_or_deduce_dtype_fwd( args.src_layout->dtype, args.filter_layout->dtype, @@ -85,9 +85,9 @@ void ConvBiasForwardImpl::AlgoChanwiseSmall::exec(const ExecArgs& args) const { #if CUDA_VERSION >= 9000 case DTypeEnum::Float16: chanwise::run_fwd_small( - static_cast(conv_dst_tensor.raw_ptr), - static_cast(args.src_tensor->raw_ptr), - static_cast(args.filter_tensor->raw_ptr), kparam, + static_cast(conv_dst_tensor.raw_ptr()), + static_cast(args.src_tensor->raw_ptr()), + static_cast(args.filter_tensor->raw_ptr()), kparam, stream); break; #endif diff --git a/dnn/src/cuda/conv_bias/cudnn_conv.cpp b/dnn/src/cuda/conv_bias/cudnn_conv.cpp index ae0d3450..5b735b9d 100644 --- a/dnn/src/cuda/conv_bias/cudnn_conv.cpp +++ b/dnn/src/cuda/conv_bias/cudnn_conv.cpp @@ -100,9 +100,9 @@ size_t 
ConvBiasForwardImpl::AlgoCUDNNConv::get_workspace_in_bytes( void ConvBiasForwardImpl::AlgoCUDNNConv::exec(const ExecArgs& args) const { auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args); - auto conv_dst_tensor = *args.dst_tensor; + TensorND conv_dst_tensor = *args.dst_tensor; if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { - conv_dst_tensor.raw_ptr = bundle.get(1); + conv_dst_tensor = TensorND{bundle.get(1), args.dst_tensor->layout}; conv_dst_tensor.layout.dtype = DType(); args.opr->check_or_deduce_dtype_fwd( args.src_layout->dtype, args.filter_layout->dtype, @@ -120,10 +120,10 @@ void ConvBiasForwardImpl::AlgoCUDNNConv::exec(const ExecArgs& args) const { float alpha = 1.0f, beta = 0.0f; auto status = cudnnConvolutionForward( conv_args.handle->cudnn_handle(), &alpha, D.src_desc.desc, - conv_args.src_tensor->raw_ptr, D.filter_desc.desc, - conv_args.filter_tensor->raw_ptr, D.conv_desc.conv_desc, m_cudnn_enum, + conv_args.src_tensor->raw_ptr(), D.filter_desc.desc, + conv_args.filter_tensor->raw_ptr(), D.conv_desc.conv_desc, m_cudnn_enum, conv_workspace.raw_ptr, conv_workspace.size, &beta, D.dst_desc.desc, - conv_args.dst_tensor->raw_ptr); + conv_args.dst_tensor->raw_ptr()); megdnn_assert( status == CUDNN_STATUS_SUCCESS, "conv fwd failed: %s; info: %s", cudnnGetErrorString(status), conv_args.to_string().c_str()); diff --git a/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp b/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp index cfe23fc3..9478262b 100644 --- a/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp +++ b/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp @@ -231,7 +231,7 @@ void ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::exec( auto workspace_ptr = args.workspace.raw_ptr; auto workspace_size = args.workspace.size; - auto bias_ptr = args.bias_tensor->raw_ptr; + auto bias_ptr = args.bias_tensor->raw_ptr(); if (args.bias_layout && args.bias_layout->dtype != dtype::Float32() && args.src_layout->dtype.category() != DTypeCategory::FLOAT) { auto cvt = args.handle->create_operator(); @@ -242,7 +242,7 @@ void ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::exec( auto bias_size_in_bytes = float_bias_layout.span().dist_byte(); megdnn_assert(args.workspace.size >= bias_size_in_bytes); cvt->exec( - {args.bias_tensor->raw_ptr, converted_bias_layout}, + {args.bias_tensor->raw_ptr(), converted_bias_layout}, TensorND{workspace_ptr, float_bias_layout}); bias_ptr = workspace_ptr; @@ -254,19 +254,19 @@ void ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::exec( if (args.z_layout->ndim == 0) { status = cudnnConvolutionBiasActivationForward( args.handle->cudnn_handle(), &alpha, D.src_desc.desc, - args.src_tensor->raw_ptr, D.filter_desc.desc, - args.filter_tensor->raw_ptr, D.conv_desc.conv_desc, m_cudnn_enum, + args.src_tensor->raw_ptr(), D.filter_desc.desc, + args.filter_tensor->raw_ptr(), D.conv_desc.conv_desc, m_cudnn_enum, workspace_ptr, workspace_size, &beta, D.dst_desc.desc, - args.dst_tensor->raw_ptr, D.bias_desc.desc, bias_ptr, - D.conv_desc.act_desc, D.dst_desc.desc, args.dst_tensor->raw_ptr); + args.dst_tensor->raw_ptr(), D.bias_desc.desc, bias_ptr, + D.conv_desc.act_desc, D.dst_desc.desc, args.dst_tensor->raw_ptr()); } else { status = cudnnConvolutionBiasActivationForward( args.handle->cudnn_handle(), &alpha, D.src_desc.desc, - args.src_tensor->raw_ptr, D.filter_desc.desc, - args.filter_tensor->raw_ptr, D.conv_desc.conv_desc, m_cudnn_enum, + args.src_tensor->raw_ptr(), D.filter_desc.desc, + args.filter_tensor->raw_ptr(), 
D.conv_desc.conv_desc, m_cudnn_enum, workspace_ptr, workspace_size, &beta, D.z_desc.desc, - args.z_tensor->raw_ptr, D.bias_desc.desc, bias_ptr, - D.conv_desc.act_desc, D.dst_desc.desc, args.dst_tensor->raw_ptr); + args.z_tensor->raw_ptr(), D.bias_desc.desc, bias_ptr, + D.conv_desc.act_desc, D.dst_desc.desc, args.dst_tensor->raw_ptr()); } megdnn_assert( diff --git a/dnn/src/cuda/conv_bias/group_conv.cpp b/dnn/src/cuda/conv_bias/group_conv.cpp index 1366c64f..f456ba88 100644 --- a/dnn/src/cuda/conv_bias/group_conv.cpp +++ b/dnn/src/cuda/conv_bias/group_conv.cpp @@ -142,9 +142,10 @@ size_t ConvBiasForwardImpl::AlgoGroupConvGeneral::get_workspace_in_bytes( void ConvBiasForwardImpl::AlgoGroupConvGeneral::exec(const ExecArgs& args) const { auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args); - auto conv_dst_tensor = *args.dst_tensor; + TensorND conv_dst_tensor = *args.dst_tensor; if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { - conv_dst_tensor.raw_ptr = bundle.get(bundle.nr_workspace() - 1); + conv_dst_tensor = TensorND{ + bundle.get(bundle.nr_workspace() - 1), args.dst_tensor->layout}; conv_dst_tensor.layout.dtype = DType(); args.opr->check_or_deduce_dtype_fwd( args.src_layout->dtype, args.filter_layout->dtype, @@ -156,11 +157,11 @@ void ConvBiasForwardImpl::AlgoGroupConvGeneral::exec(const ExecArgs& args) const sub_args.dst_layout = &conv_dst_tensor.layout; auto config = prepare_sub_opr(sub_args); - TensorND tsrc{args.src_tensor->raw_ptr, config.first[0]}; - TensorND tfilter{args.filter_tensor->raw_ptr, config.first[1]}; - TensorND tbias{args.bias_tensor->raw_ptr, config.first[2]}; - TensorND tz{args.z_tensor->raw_ptr, config.first[3]}; - TensorND tdst{conv_dst_tensor.raw_ptr, config.first[4]}; + TensorND tsrc{args.src_tensor->raw_ptr(), config.first[0]}; + TensorND tfilter{args.filter_tensor->raw_ptr(), config.first[1]}; + TensorND tbias{args.bias_tensor->raw_ptr(), config.first[2]}; + TensorND tz{args.z_tensor->raw_ptr(), config.first[3]}; + TensorND tdst{conv_dst_tensor.raw_ptr(), config.first[4]}; size_t c_pos; if (args.filter_meta.format == Param::Format::NCHW || @@ -187,9 +188,9 @@ void ConvBiasForwardImpl::AlgoGroupConvGeneral::exec(const ExecArgs& args) const for (uint32_t g = 0; g < grp; ++g) { config.second->exec( tsrc, tfilter, tbias, tz, tdst, nullptr, bundle.get_workspace(0)); - incr_voidp(tsrc.raw_ptr, strd_src); - incr_voidp(tdst.raw_ptr, strd_dst); - incr_voidp(tfilter.raw_ptr, strd_flt); + incr_refp(tsrc.get_ref_ptr(), strd_src); + incr_refp(tdst.get_ref_ptr(), strd_dst); + incr_refp(tfilter.get_ref_ptr(), strd_flt); } } handle_bias_and_nonlinear( diff --git a/dnn/src/cuda/conv_bias/helper.cpp b/dnn/src/cuda/conv_bias/helper.cpp index 0c884116..8953707d 100644 --- a/dnn/src/cuda/conv_bias/helper.cpp +++ b/dnn/src/cuda/conv_bias/helper.cpp @@ -189,19 +189,19 @@ SmallVector matmul_get_workspace_bundle(const BiasForwardSizeArgs& args) } void flip_filter( - const BiasForwardSizeArgs& args, const Workspace& workspace, void*& raw_ptr) { + const BiasForwardSizeArgs& args, const Workspace& workspace, RefPtr& ref_ptr) { auto&& fm = args.filter_meta; megdnn_assert(fm.group == 1 && fm.spatial_ndim == 2); auto OC = fm.ocpg, IC = fm.icpg, FH = fm.spatial[0], FW = fm.spatial[1]; auto dtype = fm.dtype; megdnn_assert(workspace.size >= dtype.size() * OC * IC * FH * FW); - TensorND src{raw_ptr, {{OC, IC, FH, FW}, dtype}}, + TensorND src{{{OC, IC, FH, FW}, dtype}, ref_ptr}, dst{workspace.raw_ptr + (FH * FW - 1) * dtype.size(), src.layout}; 
dst.layout.stride[2] = -dst.layout.stride[2]; dst.layout.stride[3] = -dst.layout.stride[3]; args.handle->relayout_opr()->exec(src, dst); - raw_ptr = workspace.raw_ptr; + ref_ptr.reset(workspace.raw_ptr); } } // namespace conv_bias diff --git a/dnn/src/cuda/conv_bias/helper.h b/dnn/src/cuda/conv_bias/helper.h index 9c141acf..f3d38c5c 100644 --- a/dnn/src/cuda/conv_bias/helper.h +++ b/dnn/src/cuda/conv_bias/helper.h @@ -58,7 +58,7 @@ SmallVector matmul_get_workspace_bundle(const BiasForwardSizeArgs& args) * change \p raw_ptr to workspace. */ void flip_filter( - const BiasForwardSizeArgs& args, const Workspace& workspace, void*& raw_ptr); + const BiasForwardSizeArgs& args, const Workspace& workspace, RefPtr& ref_ptr); struct CUDNNForwardDescs { TensorDesc src_desc, dst_desc, bias_desc, z_desc; diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nchw64_imma.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nchw64_imma.cpp index 24c2e404..4a6b6362 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nchw64_imma.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nchw64_imma.cpp @@ -39,7 +39,7 @@ SmallVector ConvBiasForwardImpl::AlgoInt4Int4NCHW64IMMAImplicitGem void ConvBiasForwardImpl::AlgoInt4Int4NCHW64IMMAImplicitGemm::exec_preprocess( const ExecArgs& args) const { megdnn_assert(args.preprocessed_filter->tensors.size() == 1); - void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr; + void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr(); reorder_filter(args, filter_ptr); } @@ -48,12 +48,12 @@ std::tuple ConvBiasForwardImpl::AlgoInt4Int4NCHW64IMMAImplicitGemm void* filter_ptr = nullptr; if (args.preprocessed_filter) { megdnn_assert(args.preprocessed_filter->tensors.size() == 1); - filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr; + filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr(); } else { filter_ptr = reinterpret_cast(args.workspace.raw_ptr); reorder_filter(args, filter_ptr); } - void* bias_ptr = args.bias_tensor->raw_ptr; + void* bias_ptr = args.bias_tensor->raw_ptr(); return {filter_ptr, bias_ptr}; } diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nhwc_imma.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nhwc_imma.cpp index c4db640d..6616d27d 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nhwc_imma.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nhwc_imma.cpp @@ -39,7 +39,7 @@ SmallVector ConvBiasForwardImpl::AlgoInt4Int4NHWCIMMAImplicitGemm: void ConvBiasForwardImpl::AlgoInt4Int4NHWCIMMAImplicitGemm::exec_preprocess( const ExecArgs& args) const { megdnn_assert(args.preprocessed_filter->tensors.size() == 1); - void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr; + void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr(); reorder_filter(args, m_algo_param.access_size, filter_ptr); } @@ -48,12 +48,12 @@ std::tuple ConvBiasForwardImpl::AlgoInt4Int4NHWCIMMAImplicitGemm:: void* filter_ptr = nullptr; if (args.preprocessed_filter) { megdnn_assert(args.preprocessed_filter->tensors.size() == 1); - filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr; + filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr(); } else { filter_ptr = reinterpret_cast(args.workspace.raw_ptr); reorder_filter(args, m_algo_param.access_size, filter_ptr); } - void* bias_ptr = args.bias_tensor->raw_ptr; + void* bias_ptr = args.bias_tensor->raw_ptr(); return {filter_ptr, bias_ptr}; } diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp 
b/dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp index 227339a3..7a2d672b 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp @@ -103,7 +103,7 @@ void ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::exec( std::tie(filter_ptr, bias_ptr) = prepare_filter_bias(args); if (args.z_layout->ndim > 0) - z_ptr = args.z_tensor->raw_ptr; + z_ptr = args.z_tensor->raw_ptr(); // \note these constants of cutlass epilogue will be passed to method // `execute_cutlass_conv_op` by pointer and interpreted as ElementCompute*, @@ -131,8 +131,8 @@ void ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::exec( use_conv_filter_unity_opt, without_shared_load); execute_cutlass_conv_op( - op, args.src_tensor->raw_ptr, filter_ptr, bias_ptr, z_ptr, - args.dst_tensor->raw_ptr, nullptr, n, hi, wi, ci, co, fh, fw, ho, wo, ph, + op, args.src_tensor->raw_ptr(), filter_ptr, bias_ptr, z_ptr, + args.dst_tensor->raw_ptr(), nullptr, n, hi, wi, ci, co, fh, fw, ho, wo, ph, pw, sh, sw, dh, dw, &alpha, &beta, &gamma, &delta, &theta, &threshold, &dst_scale, stream, &src_zero); @@ -159,7 +159,7 @@ void ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::reorder_filter( // filter: KCRS64 => CRSK64 and reorder oc cutlass_wrapper::reorder_ncxhwx_imma_filter<4, 64>( reinterpret_cast(reordered_filter), - reinterpret_cast(args.filter_tensor->raw_ptr), co, ci, fh, fw, + reinterpret_cast(args.filter_tensor->raw_ptr()), co, ci, fh, fw, true, stream); } #endif diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int4_nhwc_imma_base.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int4_nhwc_imma_base.cpp index b25d2690..aa3485cd 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int4_nhwc_imma_base.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int4_nhwc_imma_base.cpp @@ -115,7 +115,7 @@ void ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::exec( std::tie(filter_ptr, bias_ptr) = prepare_filter_bias(args); if (args.z_layout->ndim > 0) - z_ptr = args.z_tensor->raw_ptr; + z_ptr = args.z_tensor->raw_ptr(); // \note these constants of cutlass epilogue will be passed to method // `execute_cutlass_conv_op` by pointer and interpreted as ElementCompute*, @@ -151,8 +151,8 @@ void ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::exec( use_conv_filter_unity_opt, without_shared_load); execute_cutlass_conv_op( - op, args.src_tensor->raw_ptr, filter_ptr, bias_ptr, z_ptr, - args.dst_tensor->raw_ptr, nullptr, n, hi, wi, ci, co, fh, fw, ho, wo, ph, + op, args.src_tensor->raw_ptr(), filter_ptr, bias_ptr, z_ptr, + args.dst_tensor->raw_ptr(), nullptr, n, hi, wi, ci, co, fh, fw, ho, wo, ph, pw, sh, sw, dh, dw, &alpha, &beta, &gamma, &delta, &theta, &threshold, &dst_scale, stream, &src_zero); @@ -188,7 +188,7 @@ void ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::reorder_filter( cutlass_wrapper::reorder_nhwc_imma_filter<4>( reinterpret_cast(reordered_filter), - reinterpret_cast(args.filter_tensor->raw_ptr), co, ci, fh, fw, + reinterpret_cast(args.filter_tensor->raw_ptr()), co, ci, fh, fw, trans_oc, alignbits, oc_iterleaved, stream); } #endif diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma_reorder_filter.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma_reorder_filter.cpp index a925efb9..8080086d 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma_reorder_filter.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma_reorder_filter.cpp @@ -158,18 +158,15 @@ void 
ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemmReorderFilter::exec( UNPACK_CONV_BIAS_CHWN4_PARAM(*(args.src_layout), fm, *(args.dst_layout), param); // reorder filter { - TensorLayout in = *(args.filter_layout); - TensorLayout out = {{ci / 16, 4, fh, fw, co, 4}, in.dtype}; + TensorLayout out = { + {ci / 16, 4, fh, fw, co, 4}, args.filter_tensor->layout.dtype}; out.stride[0] = 16 * co * fh * fw; out.stride[1] = 4; out.stride[2] = fw * co * 16; out.stride[3] = co * 16; out.stride[4] = 16; out.stride[5] = 1; - TensorND ts_in, ts_out; - ts_in.layout = in, ts_out.layout = out; - ts_in.raw_ptr = args.filter_tensor->raw_ptr, - ts_out.raw_ptr = args.workspace.raw_ptr; + TensorND ts_in = *args.filter_tensor, ts_out{args.workspace.raw_ptr, out}; args.opr->handle()->create_operator()->exec(ts_in, ts_out); } diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma_unroll_width.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma_unroll_width.cpp index ac4666c7..1e707076 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma_unroll_width.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma_unroll_width.cpp @@ -160,18 +160,15 @@ void ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth::exec( UNPACK_CONV_BIAS_CHWN4_PARAM(*(args.src_layout), fm, *(args.dst_layout), param); // reorder filter { - TensorLayout in = *(args.filter_layout); - TensorLayout out = {{ci / 16, 4, fh, fw, co, 4}, in.dtype}; + TensorLayout out = { + {ci / 16, 4, fh, fw, co, 4}, args.filter_tensor->layout.dtype}; out.stride[0] = 16 * co * fh * fw; out.stride[1] = 4; out.stride[2] = fw * co * 16; out.stride[3] = co * 16; out.stride[4] = 16; out.stride[5] = 1; - TensorND ts_in, ts_out; - ts_in.layout = in, ts_out.layout = out; - ts_in.raw_ptr = args.filter_tensor->raw_ptr, - ts_out.raw_ptr = args.workspace.raw_ptr; + TensorND ts_in = *args.filter_tensor, ts_out{args.workspace.raw_ptr, out}; args.opr->handle()->create_operator()->exec(ts_in, ts_out); } diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp index 2bfdb0ea..9d96c554 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp @@ -125,11 +125,11 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec( filter_ptr = reinterpret_cast(args.workspace.raw_ptr); // filter: KCRS32 => CRSK32 and reorder oc cutlass_wrapper::reorder_ncxhwx_imma_filter<8, 32>( - filter_ptr, reinterpret_cast(args.filter_tensor->raw_ptr), co, - ci, fh, fw, trans_oc, stream); + filter_ptr, reinterpret_cast(args.filter_tensor->raw_ptr()), + co, ci, fh, fw, trans_oc, stream); } else { - filter_ptr = - reinterpret_cast(args.preprocessed_filter->tensors[0].raw_ptr); + filter_ptr = reinterpret_cast( + args.preprocessed_filter->tensors[0].raw_ptr()); } float src_scale = args.src_layout->dtype.param().scale, @@ -157,9 +157,9 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec( use_conv_filter_unity_opt, without_shared_load); execute_cutlass_conv_op( - op, args.src_tensor->raw_ptr, filter_ptr, args.bias_tensor->raw_ptr, - z_dev_ptr, args.dst_tensor->raw_ptr, nullptr, n, hi, wi, ci, co, fh, fw, ho, - wo, ph, pw, sh, sw, dh, dw, &alpha, &beta, &gamma, &delta, &theta, + op, args.src_tensor->raw_ptr(), filter_ptr, args.bias_tensor->raw_ptr(), + z_dev_ptr, args.dst_tensor->raw_ptr(), nullptr, n, hi, wi, ci, co, fh, fw, + ho, wo, ph, pw, sh, sw, dh, dw, &alpha, &beta, &gamma, &delta, &theta, 
&threshold, &dst_scale, stream); after_kernel_launch(); @@ -204,8 +204,8 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec_preprocess( cudaStream_t stream = cuda_stream(args.opr->handle()); // filter: KCRS32 => CRSK32 and reorder oc cutlass_wrapper::reorder_ncxhwx_imma_filter<8, 32>( - reinterpret_cast(args.preprocessed_filter->tensors[0].raw_ptr), - reinterpret_cast(args.filter_tensor->raw_ptr), co, ci, fh, fw, + reinterpret_cast(args.preprocessed_filter->tensors[0].raw_ptr()), + reinterpret_cast(args.filter_tensor->raw_ptr()), co, ci, fh, fw, trans_oc, stream); } #endif diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp index e9ee4822..7173bec4 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp @@ -155,16 +155,13 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( src.init_contiguous_stride(); TensorLayout dst = src; dst.stride[0] = 1, dst.stride[1] = dst[0]; - TensorND ts_src, ts_dst; - ts_src.raw_ptr = args.filter_tensor->raw_ptr; - ts_src.layout = src; - ts_dst.raw_ptr = args.workspace.raw_ptr; - ts_dst.layout = dst; + TensorND ts_src{args.filter_tensor->raw_ptr(), src}, + ts_dst{args.workspace.raw_ptr, dst}; auto&& transpose = args.opr->handle()->create_operator(); transpose->exec(ts_src, ts_dst); } else { - filter_ptr = - reinterpret_cast(args.preprocessed_filter->tensors[0].raw_ptr); + filter_ptr = reinterpret_cast( + args.preprocessed_filter->tensors[0].raw_ptr()); } float src_scale = args.src_layout->dtype.param().scale, @@ -190,7 +187,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( float delta = 0.f; void* z_ptr = nullptr; if (args.z_layout->ndim > 0) { - z_ptr = args.z_tensor->raw_ptr; + z_ptr = args.z_tensor->raw_ptr(); gamma = 1.f; if (args.z_layout->dtype.category() == DTypeCategory::QUANTIZED) { megdnn_assert( @@ -213,10 +210,10 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( use_conv_filter_unity_opt, without_shared_load); execute_cutlass_conv_op( - op, args.src_tensor->raw_ptr, filter_ptr, args.bias_tensor->raw_ptr, z_ptr, - args.dst_tensor->raw_ptr, nullptr, n, hi, wi, ci, co, fh, fw, ho, wo, ph, - pw, sh, sw, dh, dw, &alpha, &beta, &gamma, &delta, &theta, &threshold, - &dst_scale, stream); + op, args.src_tensor->raw_ptr(), filter_ptr, args.bias_tensor->raw_ptr(), + z_ptr, args.dst_tensor->raw_ptr(), nullptr, n, hi, wi, ci, co, fh, fw, ho, + wo, ph, pw, sh, sw, dh, dw, &alpha, &beta, &gamma, &delta, &theta, + &threshold, &dst_scale, stream); after_kernel_launch(); } @@ -261,11 +258,8 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec_preprocess( src.init_contiguous_stride(); TensorLayout dst = src; dst.stride[0] = 1, dst.stride[1] = dst[0]; - TensorND ts_src, ts_dst; - ts_src.raw_ptr = args.filter_tensor->raw_ptr; - ts_src.layout = src; - ts_dst.raw_ptr = args.preprocessed_filter->tensors[0].raw_ptr; - ts_dst.layout = dst; + TensorND ts_src{args.filter_tensor->raw_ptr(), src}, + ts_dst{args.preprocessed_filter->tensors[0].raw_ptr(), dst}; auto&& transpose = args.opr->handle()->create_operator(); transpose->exec(ts_src, ts_dst); } diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_imma.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_imma.cpp index f6a89f81..70856f6c 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_imma.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_imma.cpp @@ -96,11 
+96,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4IMMAImplicitGemm::exec( src.init_contiguous_stride(); TensorLayout dst = src; dst.stride[0] = 1, dst.stride[1] = dst[0]; - TensorND ts_src, ts_dst; - ts_src.raw_ptr = args.src_tensor->raw_ptr; - ts_src.layout = src; - ts_dst.raw_ptr = ws_src; - ts_dst.layout = dst; + TensorND ts_src{args.src_tensor->raw_ptr(), src}, ts_dst{ws_src, dst}; auto&& transpose = args.opr->handle()->create_operator(); transpose->exec(ts_src, ts_dst); } @@ -111,11 +107,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4IMMAImplicitGemm::exec( src.init_contiguous_stride(); TensorLayout dst = src; dst.stride[0] = 1, dst.stride[1] = dst[0]; - TensorND ts_src, ts_dst; - ts_src.raw_ptr = args.filter_tensor->raw_ptr; - ts_src.layout = src; - ts_dst.raw_ptr = ws_filter; - ts_dst.layout = dst; + TensorND ts_src{args.filter_tensor->raw_ptr(), src}, ts_dst{ws_filter, dst}; auto&& transpose = args.opr->handle()->create_operator(); transpose->exec(ts_src, ts_dst); } @@ -142,11 +134,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4IMMAImplicitGemm::exec( src.init_contiguous_stride(); TensorLayout dst = src; dst.stride[0] = 1, dst.stride[1] = dst[0]; - TensorND ts_src, ts_dst; - ts_src.raw_ptr = args.z_tensor->raw_ptr; - ts_src.layout = src; - ts_dst.raw_ptr = ws_z; - ts_dst.layout = dst; + TensorND ts_src{args.z_tensor->raw_ptr(), src}, ts_dst{ws_z, dst}; auto&& transpose = args.opr->handle()->create_operator(); transpose->exec(ts_src, ts_dst); z_dev_ptr = reinterpret_cast(ws_z); @@ -168,11 +156,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4IMMAImplicitGemm::exec( src.init_contiguous_stride(); TensorLayout dst = src; dst.stride[0] = 1, dst.stride[1] = dst[0]; - TensorND ts_src, ts_dst; - ts_src.raw_ptr = ws_dst; - ts_src.layout = src; - ts_dst.raw_ptr = args.dst_tensor->raw_ptr; - ts_dst.layout = dst; + TensorND ts_src{ws_dst, src}, ts_dst{args.dst_tensor->raw_ptr(), dst}; auto&& transpose = args.opr->handle()->create_operator(); transpose->exec(ts_src, ts_dst); } diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nhwc_imma.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nhwc_imma.cpp index 3efa1c05..ba166189 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nhwc_imma.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nhwc_imma.cpp @@ -114,7 +114,7 @@ SmallVector ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm:: void ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::exec_preprocess( const ExecArgs& args) const { - void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr; + void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr(); reorder_filter(args, m_algo_param.access_size, filter_ptr); } @@ -189,15 +189,15 @@ void ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::exec( void* z_ptr = nullptr; if (args.preprocessed_filter) { - filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr; + filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr(); } else { filter_ptr = reinterpret_cast(args.workspace.raw_ptr); reorder_filter(args, m_algo_param.access_size, filter_ptr); } - bias_ptr = args.bias_tensor->raw_ptr; + bias_ptr = args.bias_tensor->raw_ptr(); if (args.z_layout->ndim > 0) - z_ptr = args.z_tensor->raw_ptr; + z_ptr = args.z_tensor->raw_ptr(); // \note these constants of cutlass epilogue will be passed to method // `execute_cutlass_conv_op` by pointer and interpreted as ElementCompute*, @@ -233,8 +233,8 @@ void ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::exec( use_conv_filter_unity_opt, without_shared_load); execute_cutlass_conv_op( - op, 
args.src_tensor->raw_ptr, filter_ptr, bias_ptr, z_ptr, - args.dst_tensor->raw_ptr, nullptr, n, hi, wi, ci, co, fh, fw, ho, wo, ph, + op, args.src_tensor->raw_ptr(), filter_ptr, bias_ptr, z_ptr, + args.dst_tensor->raw_ptr(), nullptr, n, hi, wi, ci, co, fh, fw, ho, wo, ph, pw, sh, sw, dh, dw, &alpha, &beta, &gamma, &delta, &theta, &threshold, &dst_scale, stream); @@ -272,7 +272,7 @@ void ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::reorder_filter( cutlass_wrapper::reorder_nhwc_imma_filter<8>( reinterpret_cast(reordered_filter), - reinterpret_cast(args.filter_tensor->raw_ptr), co, ci, fh, fw, + reinterpret_cast(args.filter_tensor->raw_ptr()), co, ci, fh, fw, trans_oc, alignbits, oc_iterleaved, stream); } #endif diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nchw64_imma.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nchw64_imma.cpp index a8c68d44..fb5f0608 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nchw64_imma.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nchw64_imma.cpp @@ -52,8 +52,8 @@ SmallVector ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGe void ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm::exec_preprocess( const ExecArgs& args) const { megdnn_assert(args.preprocessed_filter->tensors.size() == 2); - void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr; - void* bias_ptr = args.preprocessed_filter->tensors[1].raw_ptr; + void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr(); + void* bias_ptr = args.preprocessed_filter->tensors[1].raw_ptr(); void* reduce_filter_ptr = reinterpret_cast(args.workspace.raw_ptr); void* reduce_workspace = reinterpret_cast( args.workspace.raw_ptr + args.bias_layout->span().dist_byte()); @@ -67,8 +67,8 @@ std::tuple ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGem void* bias_ptr = nullptr; if (args.preprocessed_filter) { megdnn_assert(args.preprocessed_filter->tensors.size() == 2); - filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr; - bias_ptr = args.preprocessed_filter->tensors[1].raw_ptr; + filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr(); + bias_ptr = args.preprocessed_filter->tensors[1].raw_ptr(); return {filter_ptr, bias_ptr}; } else { filter_ptr = reinterpret_cast(args.workspace.raw_ptr); @@ -130,7 +130,7 @@ void ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm::update_bias( int src_zero_point = args.src_tensor->layout.dtype.param().zero_point; do_dispatch_reduce_filter_and_update_bias_4bit( - reinterpret_cast(args.filter_tensor->raw_ptr), + reinterpret_cast(args.filter_tensor->raw_ptr()), args.bias_tensor->compatible_ptr(), co, ci * fh * fw / 8, reinterpret_cast(updated_bias), reinterpret_cast(reduce_workspace), src_zero_point, stream); diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nhwc_imma.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nhwc_imma.cpp index f6276d80..9ef73cca 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nhwc_imma.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nhwc_imma.cpp @@ -52,8 +52,8 @@ SmallVector ConvBiasForwardImpl::AlgoUInt4Int4NHWCIMMAImplicitGemm void ConvBiasForwardImpl::AlgoUInt4Int4NHWCIMMAImplicitGemm::exec_preprocess( const ExecArgs& args) const { megdnn_assert(args.preprocessed_filter->tensors.size() == 2); - void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr; - void* bias_ptr = args.preprocessed_filter->tensors[1].raw_ptr; + void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr(); + void* bias_ptr = 
args.preprocessed_filter->tensors[1].raw_ptr(); void* reduce_filter_ptr = reinterpret_cast(args.workspace.raw_ptr); void* reduce_workspace = reinterpret_cast( args.workspace.raw_ptr + args.bias_layout->span().dist_byte()); @@ -67,8 +67,8 @@ std::tuple ConvBiasForwardImpl::AlgoUInt4Int4NHWCIMMAImplicitGemm: void* bias_ptr = nullptr; if (args.preprocessed_filter) { megdnn_assert(args.preprocessed_filter->tensors.size() == 2); - filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr; - bias_ptr = args.preprocessed_filter->tensors[1].raw_ptr; + filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr(); + bias_ptr = args.preprocessed_filter->tensors[1].raw_ptr(); return {filter_ptr, bias_ptr}; } else { filter_ptr = reinterpret_cast(args.workspace.raw_ptr); @@ -146,7 +146,7 @@ void ConvBiasForwardImpl::AlgoUInt4Int4NHWCIMMAImplicitGemm::update_bias( int src_zero_point = args.src_tensor->layout.dtype.param().zero_point; do_dispatch_reduce_filter_and_update_bias_4bit( - reinterpret_cast(args.filter_tensor->raw_ptr), + reinterpret_cast(args.filter_tensor->raw_ptr()), args.bias_tensor->compatible_ptr(), co, ci * fh * fw / 8, reinterpret_cast(updated_bias), reinterpret_cast(reduce_workspace), src_zero_point, stream); diff --git a/dnn/src/cuda/conv_bias/inplace_matmul.cpp b/dnn/src/cuda/conv_bias/inplace_matmul.cpp index daf1d835..d6317078 100644 --- a/dnn/src/cuda/conv_bias/inplace_matmul.cpp +++ b/dnn/src/cuda/conv_bias/inplace_matmul.cpp @@ -40,9 +40,9 @@ size_t ConvBiasForwardImpl::AlgoInplaceMatmul::get_workspace_in_bytes( void ConvBiasForwardImpl::AlgoInplaceMatmul::exec(const ExecArgs& args) const { WorkspaceBundle bundle{args.workspace.raw_ptr, {get_workspace_in_bytes(args)}}; - auto conv_dst_tensor = *args.dst_tensor; + TensorND conv_dst_tensor = *args.dst_tensor; if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { - conv_dst_tensor.raw_ptr = bundle.get(0); + conv_dst_tensor = TensorND{bundle.get(0), args.dst_tensor->layout}; conv_dst_tensor.layout.dtype = DType(); args.opr->check_or_deduce_dtype_fwd( args.src_layout->dtype, args.filter_layout->dtype, diff --git a/dnn/src/cuda/conv_bias/matmul.cpp b/dnn/src/cuda/conv_bias/matmul.cpp index 7b96a917..ae0061af 100644 --- a/dnn/src/cuda/conv_bias/matmul.cpp +++ b/dnn/src/cuda/conv_bias/matmul.cpp @@ -115,9 +115,10 @@ size_t ConvBiasForwardImpl::AlgoMatmul::get_workspace_in_bytes( void ConvBiasForwardImpl::AlgoMatmul::exec(const ExecArgs& args) const { auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args); - auto conv_dst_tensor = *args.dst_tensor; + TensorND conv_dst_tensor = *args.dst_tensor; if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { - conv_dst_tensor.raw_ptr = bundle.get(bundle.nr_workspace() - 1); + conv_dst_tensor = TensorND{ + bundle.get(bundle.nr_workspace() - 1), args.dst_tensor->layout}; conv_dst_tensor.layout.dtype = DType(); args.opr->check_or_deduce_dtype_fwd( args.src_layout->dtype, args.filter_layout->dtype, @@ -168,7 +169,7 @@ void ConvBiasForwardImpl::AlgoMatmul::exec_internal( C(dst_t, config.first[2]); size_t matmul_ws_idx = 2; if (fm.should_flip) { - conv_bias::flip_filter(args, bundle.get_workspace(2), A.raw_ptr); + conv_bias::flip_filter(args, bundle.get_workspace(2), A.get_ref_ptr()); matmul_ws_idx = 3; } diff --git a/dnn/src/cuda/conv_bias/matmul_8x8x32.cpp b/dnn/src/cuda/conv_bias/matmul_8x8x32.cpp index 950a3829..0300f2d6 100644 --- a/dnn/src/cuda/conv_bias/matmul_8x8x32.cpp +++ b/dnn/src/cuda/conv_bias/matmul_8x8x32.cpp @@ -128,12 +128,10 @@ void 
ConvBiasForwardImpl::AlgoMatmul8x8x32::exec_internal(const ExecArgs& args) auto bundle = get_bundle(args); bundle.set(args.workspace.raw_ptr); - TensorND src_tensor, dst_tensor, filter_tensor; - if (format == Param::Format::NHWC) { - src_tensor = *args.src_tensor; - dst_tensor = *args.dst_tensor; - filter_tensor = *args.filter_tensor; - } else { + TensorND src_tensor = *args.src_tensor; + TensorND dst_tensor = *args.dst_tensor; + TensorND filter_tensor = *args.filter_tensor; + if (format == Param::Format::NCHW4) { // NCHW4 auto to_nhwc = [](const TensorLayout& layout, void* raw_ptr) -> TensorND { return {raw_ptr, @@ -147,7 +145,7 @@ void ConvBiasForwardImpl::AlgoMatmul8x8x32::exec_internal(const ExecArgs& args) auto N = src.layout[0], C = src.layout[1] * 4, H = src.layout[2], W = src.layout[3]; args.handle->relayout_opr()->exec( - {src.raw_ptr, + {src.raw_ptr(), TensorLayout{ {N, H, W, C / 4, 4}, {src.layout.stride[0], src.layout.stride[2], @@ -156,8 +154,8 @@ void ConvBiasForwardImpl::AlgoMatmul8x8x32::exec_internal(const ExecArgs& args) src.layout.dtype}}, {dst_ptr, TensorLayout{{N, H, W, C / 4, 4}, src.layout.dtype}}); }; - relayout(*args.src_tensor, src_tensor.raw_ptr); - relayout(*args.filter_tensor, filter_tensor.raw_ptr); + relayout(*args.src_tensor, src_tensor.raw_ptr()); + relayout(*args.filter_tensor, filter_tensor.raw_ptr()); } size_t N, IH, IW, IC; @@ -193,7 +191,7 @@ void ConvBiasForwardImpl::AlgoMatmul8x8x32::exec_internal(const ExecArgs& args) // copy (OC, FH*FW*IC) to (OC, FH*FW*IC) with stride=LD inp1 = static_cast(bundle.get(1)); cuda_check(cudaMemcpy2DAsync( - inp1, LD * sizeof(int8_t), filter_tensor.raw_ptr, + inp1, LD * sizeof(int8_t), filter_tensor.raw_ptr(), FH * FW * IC * sizeof(int8_t), FH * FW * IC * sizeof(int8_t), OC, cudaMemcpyDeviceToDevice, stream)); inp1_stride = LD; @@ -222,12 +220,13 @@ void ConvBiasForwardImpl::AlgoMatmul8x8x32::exec_internal(const ExecArgs& args) void ConvBiasForwardImpl::AlgoMatmul8x8x32::exec(const ExecArgs& args) const { ExecArgs conv_args = args; - auto conv_dst_tensor = *args.dst_tensor; + TensorND conv_dst_tensor = *args.dst_tensor; if (args.filter_meta.format == Param::Format::NHWC) { auto bundle = get_bundle(args); bundle.set(args.workspace.raw_ptr); if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { - conv_dst_tensor.raw_ptr = bundle.get(bundle.nr_workspace() - 1); + conv_dst_tensor = TensorND{ + bundle.get(bundle.nr_workspace() - 1), args.dst_tensor->layout}; conv_dst_tensor.layout.dtype = DType(); args.opr->check_or_deduce_dtype_fwd( args.src_layout->dtype, args.filter_layout->dtype, @@ -239,7 +238,8 @@ void ConvBiasForwardImpl::AlgoMatmul8x8x32::exec(const ExecArgs& args) const { auto bundle = get_bundle(args); bundle.set(args.workspace.raw_ptr); if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { - conv_dst_tensor.raw_ptr = bundle.get(bundle.nr_workspace() - 1); + conv_dst_tensor = TensorND{ + bundle.get(bundle.nr_workspace() - 1), args.dst_tensor->layout}; conv_dst_tensor.layout.dtype = DType(); args.opr->check_or_deduce_dtype_fwd( args.src_layout->dtype, args.filter_layout->dtype, diff --git a/dnn/src/cuda/conv_bias/quint4x4x32_wmma.cpp b/dnn/src/cuda/conv_bias/quint4x4x32_wmma.cpp index d7de5259..25e81b59 100644 --- a/dnn/src/cuda/conv_bias/quint4x4x32_wmma.cpp +++ b/dnn/src/cuda/conv_bias/quint4x4x32_wmma.cpp @@ -131,26 +131,26 @@ void ConvBiasForwardImpl::AlgoQUInt4x4x32WMMA::exec(const ExecArgs& args) const auto&& stream = cuda_stream(handle); // zp filter 
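// The matmul / inplace_matmul / matmul_8x8x32 hunks above all apply one pattern:
// since raw_ptr is no longer an assignable member, a tensor that must point into
// workspace is rebuilt from {pointer, layout} instead of having its pointer patched.
// A minimal sketch of that idea with stand-in types (not the real MegDNN
// TensorND/TensorLayout definitions):
namespace sketch {

struct TensorLayout { /* dtype, shape, strides ... */ };

class TensorND {
public:
    TensorND(void* ptr, const TensorLayout& l) : layout{l}, m_ptr{ptr} {}
    void* raw_ptr() const { return m_ptr; }  // read-only accessor, replaces .raw_ptr
    TensorLayout layout;

private:
    void* m_ptr;
};

// Redirect the convolution result into a workspace chunk while keeping dst's layout.
inline TensorND workspace_dst(void* workspace_chunk, const TensorND& dst) {
    // old style: conv_dst_tensor.raw_ptr = workspace_chunk;  // member no longer exposed
    // new style: rebuild the tensor from pointer + layout
    return TensorND{workspace_chunk, dst.layout};
}

}  // namespace sketch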
do_dispatch_reduce_with_scale_filter_4bit( - static_cast(args.filter_tensor->raw_ptr), -zp_data, OC, + static_cast(args.filter_tensor->raw_ptr()), -zp_data, OC, FH * FW * IC / 8, ws_zp_filter.ptr(), stream); // zp data do_dispatch_reduce_with_scale_data_u4( - ws_zp_data.ptr(), static_cast(args.src_tensor->raw_ptr), - N, IH, IW, OH, OW, PH, PW, FH, FW, SH, SW, IC, -zp_filter, - static_cast(zp_data), stream); + ws_zp_data.ptr(), + static_cast(args.src_tensor->raw_ptr()), N, IH, IW, OH, OW, PH, + PW, FH, FW, SH, SW, IC, -zp_filter, static_cast(zp_data), stream); // do conv if (use_kernel_fhxfw(args)) { wmma_conv_integer_subbyte::_do_wmma_conv_integer_subbyte_fhxfw( - static_cast(args.src_tensor->raw_ptr), - static_cast(args.filter_tensor->raw_ptr), + static_cast(args.src_tensor->raw_ptr()), + static_cast(args.filter_tensor->raw_ptr()), args.dst_tensor->compatible_ptr(), N, IH, IW, OH, OW, PH, PW, IC, OC, FH, FW, SH, SW, static_cast(zp_data), stream); } else { auto&& ws_relayout_filter = ws_bundle.get_workspace(2); wmma_conv_integer_subbyte::_do_wmma_conv_integer_subbyte_1xfw( - static_cast(args.src_tensor->raw_ptr), - static_cast(args.filter_tensor->raw_ptr), + static_cast(args.src_tensor->raw_ptr()), + static_cast(args.filter_tensor->raw_ptr()), args.dst_tensor->compatible_ptr(), ws_relayout_filter.ptr(), N, IH, IW, OH, OW, PH, PW, IC, OC, FH, FW, SH, SW, static_cast(zp_data), stream); diff --git a/dnn/src/cuda/convolution/backward_data/chanwise.cpp b/dnn/src/cuda/convolution/backward_data/chanwise.cpp index d156f605..78b8e508 100644 --- a/dnn/src/cuda/convolution/backward_data/chanwise.cpp +++ b/dnn/src/cuda/convolution/backward_data/chanwise.cpp @@ -60,9 +60,9 @@ void ConvolutionBackwardDataImpl::AlgoChanwise::exec(const ExecArgs& args) const #if CUDA_VERSION >= 9000 if (is_compute_capability_required(5, 3)) { return chanwise::run_bwd_data( - static_cast<__half*>(args.grad_tensor->raw_ptr), - static_cast<__half*>(args.diff_tensor->raw_ptr), - static_cast<__half*>(args.filter_tensor->raw_ptr), kparam, + static_cast<__half*>(args.grad_tensor->raw_ptr()), + static_cast<__half*>(args.diff_tensor->raw_ptr()), + static_cast<__half*>(args.filter_tensor->raw_ptr()), kparam, stream); } else { return chanwise::run_bwd_data( diff --git a/dnn/src/cuda/convolution/backward_data/chanwise_small.cpp b/dnn/src/cuda/convolution/backward_data/chanwise_small.cpp index 37b2f9eb..4439cc38 100644 --- a/dnn/src/cuda/convolution/backward_data/chanwise_small.cpp +++ b/dnn/src/cuda/convolution/backward_data/chanwise_small.cpp @@ -68,9 +68,9 @@ void ConvolutionBackwardDataImpl::AlgoChanwiseSmall::exec(const ExecArgs& args) #if CUDA_VERSION >= 9000 case DTypeEnum::Float16: return chanwise::run_bwd_data_small( - static_cast(args.grad_tensor->raw_ptr), - static_cast(args.diff_tensor->raw_ptr), - static_cast(args.filter_tensor->raw_ptr), kparam, stream); + static_cast(args.grad_tensor->raw_ptr()), + static_cast(args.diff_tensor->raw_ptr()), + static_cast(args.filter_tensor->raw_ptr()), kparam, stream); #endif default: break; diff --git a/dnn/src/cuda/convolution/backward_data/cudnn.cpp b/dnn/src/cuda/convolution/backward_data/cudnn.cpp index 48099d04..f994d968 100644 --- a/dnn/src/cuda/convolution/backward_data/cudnn.cpp +++ b/dnn/src/cuda/convolution/backward_data/cudnn.cpp @@ -71,9 +71,10 @@ void ConvolutionBackwardDataImpl::AlgoCUDNN::exec(const ExecArgs& args) const { float alpha = 1.0f, beta = 0.0f; auto status = cudnnConvolutionBackwardData( args.handle->cudnn_handle(), &alpha, D.filter_desc.desc, - 
args.filter_tensor->raw_ptr, D.diff_desc.desc, args.diff_tensor->raw_ptr, - D.conv_desc.desc, m_cudnn_enum, args.workspace.raw_ptr, args.workspace.size, - &beta, D.grad_desc.desc, args.grad_tensor->raw_ptr); + args.filter_tensor->raw_ptr(), D.diff_desc.desc, + args.diff_tensor->raw_ptr(), D.conv_desc.desc, m_cudnn_enum, + args.workspace.raw_ptr, args.workspace.size, &beta, D.grad_desc.desc, + args.grad_tensor->raw_ptr()); megdnn_assert( status == CUDNN_STATUS_SUCCESS, "conv bwd_data failed: %s; info: %s", cudnnGetErrorString(status), args.to_string().c_str()); diff --git a/dnn/src/cuda/convolution/backward_data/group_conv.cpp b/dnn/src/cuda/convolution/backward_data/group_conv.cpp index b57b0de1..df7d61bd 100644 --- a/dnn/src/cuda/convolution/backward_data/group_conv.cpp +++ b/dnn/src/cuda/convolution/backward_data/group_conv.cpp @@ -103,9 +103,9 @@ void ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::exec( auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args); { auto config = prepare_sub_opr(args); - TensorND tfilter{args.filter_tensor->raw_ptr, config.first[0]}; - TensorND tdiff{args.diff_tensor->raw_ptr, config.first[1]}; - TensorND tgrad{args.grad_tensor->raw_ptr, config.first[2]}; + TensorND tfilter{args.filter_tensor->raw_ptr(), config.first[0]}; + TensorND tdiff{args.diff_tensor->raw_ptr(), config.first[1]}; + TensorND tgrad{args.grad_tensor->raw_ptr(), config.first[2]}; size_t c_pos = 1; @@ -121,9 +121,9 @@ void ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::exec( auto grp = args.filter_meta.group; for (uint32_t g = 0; g < grp; ++g) { config.second->exec(tfilter, tdiff, tgrad, bundle.get_workspace(0)); - incr_voidp(tfilter.raw_ptr, strd_flt); - incr_voidp(tdiff.raw_ptr, strd_diff); - incr_voidp(tgrad.raw_ptr, strd_grad); + incr_refp(tfilter.get_ref_ptr(), strd_flt); + incr_refp(tdiff.get_ref_ptr(), strd_diff); + incr_refp(tgrad.get_ref_ptr(), strd_grad); } } } diff --git a/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp b/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp index 02ad9b15..1349ae32 100644 --- a/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp +++ b/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp @@ -140,7 +140,8 @@ void ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::exec( auto&& relayout = args.opr->handle()->create_operator(); relayout->exec( - {args.filter_tensor->raw_ptr, exec_src}, {inner_filter_ptr, exec_dst}); + {args.filter_tensor->raw_ptr(), exec_src}, + {inner_filter_ptr, exec_dst}); } { inner_diff_ptr = reinterpret_cast(bundle.get(1)); @@ -152,7 +153,7 @@ void ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::exec( auto&& relayout = args.opr->handle()->create_operator(); relayout->exec( - {args.diff_tensor->raw_ptr, exec_src}, {inner_diff_ptr, exec_dst}); + {args.diff_tensor->raw_ptr(), exec_src}, {inner_diff_ptr, exec_dst}); } int8_t* inner_grad_ptr = reinterpret_cast(bundle.get(2)); @@ -196,7 +197,7 @@ void ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::exec( auto&& relayout = args.opr->handle()->create_operator(); relayout->exec( - {inner_grad_ptr, exec_src}, {args.grad_tensor->raw_ptr, exec_dst}); + {inner_grad_ptr, exec_src}, {args.grad_tensor->raw_ptr(), exec_dst}); } } // vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution/backward_data/matmul.cpp b/dnn/src/cuda/convolution/backward_data/matmul.cpp index a328ca0b..03a9619f 100644 --- a/dnn/src/cuda/convolution/backward_data/matmul.cpp +++ 
b/dnn/src/cuda/convolution/backward_data/matmul.cpp @@ -143,7 +143,7 @@ void ConvolutionBackwardDataImpl::AlgoMatmul::exec_internal(const ExecArgs& args TensorND A(args.filter_tensor->ptr(), Al), B(col, Bl), C(diff_t, Cl); if (fm.should_flip) { convolution::flip_filter( - args.as_fwd_args(), wbundle.get_workspace(2), A.raw_ptr); + args.as_fwd_args(), wbundle.get_workspace(2), A.get_ref_ptr()); config.second->exec(A, C, B, wbundle.get_workspace(3)); } else { config.second->exec(A, C, B, wbundle.get_workspace(2)); diff --git a/dnn/src/cuda/convolution/backward_filter/chanwise.cpp b/dnn/src/cuda/convolution/backward_filter/chanwise.cpp index b18e44d5..c8d8e27e 100644 --- a/dnn/src/cuda/convolution/backward_filter/chanwise.cpp +++ b/dnn/src/cuda/convolution/backward_filter/chanwise.cpp @@ -50,9 +50,9 @@ void ConvolutionBackwardFilterImpl::AlgoChanwise::exec(const ExecArgs& args) con #if CUDA_VERSION >= 9000 if (is_compute_capability_required(5, 3)) { return chanwise::run_bwd_filter( - static_cast<__half*>(args.grad_tensor->raw_ptr), - static_cast<__half*>(args.src_tensor->raw_ptr), - static_cast<__half*>(args.diff_tensor->raw_ptr), kparam, + static_cast<__half*>(args.grad_tensor->raw_ptr()), + static_cast<__half*>(args.src_tensor->raw_ptr()), + static_cast<__half*>(args.diff_tensor->raw_ptr()), kparam, stream); } else { return chanwise::run_bwd_filter( diff --git a/dnn/src/cuda/convolution/backward_filter/cudnn.cpp b/dnn/src/cuda/convolution/backward_filter/cudnn.cpp index 14731fce..bb9821e4 100644 --- a/dnn/src/cuda/convolution/backward_filter/cudnn.cpp +++ b/dnn/src/cuda/convolution/backward_filter/cudnn.cpp @@ -71,9 +71,9 @@ void ConvolutionBackwardFilterImpl::AlgoCUDNN::exec(const ExecArgs& args) const float alpha = 1.0f, beta = 0.0f; auto status = cudnnConvolutionBackwardFilter( args.handle->cudnn_handle(), &alpha, D.src_desc.desc, - args.src_tensor->raw_ptr, D.diff_desc.desc, args.diff_tensor->raw_ptr, + args.src_tensor->raw_ptr(), D.diff_desc.desc, args.diff_tensor->raw_ptr(), D.conv_desc.desc, m_cudnn_enum, args.workspace.raw_ptr, args.workspace.size, - &beta, D.grad_desc.desc, args.grad_tensor->raw_ptr); + &beta, D.grad_desc.desc, args.grad_tensor->raw_ptr()); megdnn_assert( status == CUDNN_STATUS_SUCCESS, "conv bwd_data failed: %s; info: %s", cudnnGetErrorString(status), args.to_string().c_str()); diff --git a/dnn/src/cuda/convolution/backward_filter/group_conv.cpp b/dnn/src/cuda/convolution/backward_filter/group_conv.cpp index 4b3ef295..5d497910 100644 --- a/dnn/src/cuda/convolution/backward_filter/group_conv.cpp +++ b/dnn/src/cuda/convolution/backward_filter/group_conv.cpp @@ -101,9 +101,9 @@ void ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral::exec( { auto config = prepare_sub_opr(args); - TensorND tsrc{args.src_tensor->raw_ptr, config.first[0]}; - TensorND tdiff{args.diff_tensor->raw_ptr, config.first[1]}; - TensorND tgrad{args.grad_tensor->raw_ptr, config.first[2]}; + TensorND tsrc{args.src_tensor->raw_ptr(), config.first[0]}; + TensorND tdiff{args.diff_tensor->raw_ptr(), config.first[1]}; + TensorND tgrad{args.grad_tensor->raw_ptr(), config.first[2]}; size_t c_pos = 1; @@ -118,9 +118,9 @@ void ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral::exec( auto grp = fm.group; for (uint32_t g = 0; g < grp; ++g) { config.second->exec(tsrc, tdiff, tgrad, bundle.get_workspace(0)); - incr_voidp(tsrc.raw_ptr, strd_src); - incr_voidp(tdiff.raw_ptr, strd_diff); - incr_voidp(tgrad.raw_ptr, strd_grad); + incr_refp(tsrc.get_ref_ptr(), strd_src); + incr_refp(tdiff.get_ref_ptr(), 
strd_diff); + incr_refp(tgrad.get_ref_ptr(), strd_grad); } } } diff --git a/dnn/src/cuda/convolution/backward_filter/matmul.cpp b/dnn/src/cuda/convolution/backward_filter/matmul.cpp index 529d61e5..dd56fae8 100644 --- a/dnn/src/cuda/convolution/backward_filter/matmul.cpp +++ b/dnn/src/cuda/convolution/backward_filter/matmul.cpp @@ -133,7 +133,7 @@ void ConvolutionBackwardFilterImpl::AlgoMatmul::exec_internal(const ExecArgs& ar froml.stride[0] = args.diff_layout->stride[0]; tol.stride[0] = 1; tol.stride[1] = N; - TensorND from(args.diff_tensor->ptr(), froml), to(diff_t, tol); + TensorND from(args.diff_tensor->raw_ptr(), froml), to(diff_t, tol); args.handle->relayout_opr()->exec(from, to); } { @@ -149,13 +149,13 @@ void ConvolutionBackwardFilterImpl::AlgoMatmul::exec_internal(const ExecArgs& ar Cl({OC, OH * OW * N}, typename DTypeTrait::dtype()); TensorND A(args.grad_tensor->ptr(), Al), B(col, Bl), C(diff_t, Cl); if (fm.should_flip) { - A.raw_ptr = wbundle.get(2); + A.reset_ptr(wbundle.get(2)); config.second->exec(C, B, A, wbundle.get_workspace(3)); convolution::flip_filter( args.as_fwd_args(), - {static_cast(args.grad_tensor->raw_ptr), + {static_cast(args.grad_tensor->raw_ptr()), wbundle.get_size(2)}, - A.raw_ptr); + A.get_ref_ptr()); } else { config.second->exec(C, B, A, wbundle.get_workspace(2)); } diff --git a/dnn/src/cuda/convolution/helper.cpp b/dnn/src/cuda/convolution/helper.cpp index 20e2ffc3..dfab0eea 100644 --- a/dnn/src/cuda/convolution/helper.cpp +++ b/dnn/src/cuda/convolution/helper.cpp @@ -68,19 +68,19 @@ SmallVector convolution::matmul_get_workspace_bundle( } void convolution::flip_filter( - const ForwardSizeArgs& args, const Workspace& workspace, void*& raw_ptr) { + const ForwardSizeArgs& args, const Workspace& workspace, RefPtr& ref_ptr) { auto&& fm = args.filter_meta; megdnn_assert(fm.group == 1 && fm.spatial_ndim == 2); auto OC = fm.ocpg, IC = fm.icpg, FH = fm.spatial[0], FW = fm.spatial[1]; auto dtype = fm.dtype; megdnn_assert(workspace.size >= dtype.size() * OC * IC * FH * FW); - TensorND src{raw_ptr, {{OC, IC, FH, FW}, dtype}}, + TensorND src{{{OC, IC, FH, FW}, dtype}, ref_ptr}, dst{workspace.raw_ptr + (FH * FW - 1) * dtype.size(), src.layout}; dst.layout.stride[2] = -dst.layout.stride[2]; dst.layout.stride[3] = -dst.layout.stride[3]; args.handle->relayout_opr()->exec(src, dst); - raw_ptr = workspace.raw_ptr; + ref_ptr.reset(workspace.raw_ptr); } // vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution/helper.h b/dnn/src/cuda/convolution/helper.h index 2036f38a..95f51b2c 100644 --- a/dnn/src/cuda/convolution/helper.h +++ b/dnn/src/cuda/convolution/helper.h @@ -85,7 +85,7 @@ struct CUDNNBwdFilterDescs { * change \p raw_ptr to workspace. 
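// What the flip_filter body above does, sketched on plain arrays: the destination
// view starts at the last spatial element and uses negated FH/FW strides, so copying
// through that view rotates each filter window by 180 degrees. The toy loop below
// mimics it for a single window (float data and contiguous layout are assumptions,
// the real code is dtype-generic and goes through the relayout operator).
#include <cstddef>

namespace sketch {

inline void flip_window(const float* src, float* workspace, std::size_t FH, std::size_t FW) {
    // dst view: base = workspace + (FH*FW - 1), stride_h = -FW, stride_w = -1
    float* dst_base = workspace + (FH * FW - 1);
    for (std::size_t h = 0; h < FH; ++h)
        for (std::size_t w = 0; w < FW; ++w)
            dst_base[-static_cast<std::ptrdiff_t>(h * FW + w)] = src[h * FW + w];
}

}  // namespace sketch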
*/ void flip_filter( - const ForwardSizeArgs& args, const Workspace& workspace, void*& raw_ptr); + const ForwardSizeArgs& args, const Workspace& workspace, RefPtr& raw_ptr); } // namespace convolution } // namespace cuda diff --git a/dnn/src/cuda/convolution3d/backward_data/cudnn.cpp b/dnn/src/cuda/convolution3d/backward_data/cudnn.cpp index e7a8813e..d54f2163 100644 --- a/dnn/src/cuda/convolution3d/backward_data/cudnn.cpp +++ b/dnn/src/cuda/convolution3d/backward_data/cudnn.cpp @@ -55,9 +55,10 @@ void Convolution3DBackwardDataImpl::AlgoCUDNN::exec(const ExecArgs& args) const float alpha = 1.0f, beta = 0.0f; auto status = cudnnConvolutionBackwardData( args.handle->cudnn_handle(), &alpha, D.filter_desc.desc, - args.filter_tensor->raw_ptr, D.diff_desc.desc, args.diff_tensor->raw_ptr, - D.conv_desc.desc, m_cudnn_enum, args.workspace.raw_ptr, args.workspace.size, - &beta, D.grad_desc.desc, args.grad_tensor->raw_ptr); + args.filter_tensor->raw_ptr(), D.diff_desc.desc, + args.diff_tensor->raw_ptr(), D.conv_desc.desc, m_cudnn_enum, + args.workspace.raw_ptr, args.workspace.size, &beta, D.grad_desc.desc, + args.grad_tensor->raw_ptr()); megdnn_assert( status == CUDNN_STATUS_SUCCESS, "conv bwd_data failed: %s; info: %s", cudnnGetErrorString(status), args.to_string().c_str()); diff --git a/dnn/src/cuda/convolution3d/backward_data/group_conv.cpp b/dnn/src/cuda/convolution3d/backward_data/group_conv.cpp index ebaaf590..16aa6724 100644 --- a/dnn/src/cuda/convolution3d/backward_data/group_conv.cpp +++ b/dnn/src/cuda/convolution3d/backward_data/group_conv.cpp @@ -96,9 +96,9 @@ void Convolution3DBackwardDataImpl::AlgoGroupConvGeneral::exec( auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args); { auto config = prepare_sub_opr(args); - TensorND tfilter{args.filter_tensor->raw_ptr, config.first[0]}; - TensorND tdiff{args.diff_tensor->raw_ptr, config.first[1]}; - TensorND tgrad{args.grad_tensor->raw_ptr, config.first[2]}; + TensorND tfilter{args.filter_tensor->raw_ptr(), config.first[0]}; + TensorND tdiff{args.diff_tensor->raw_ptr(), config.first[1]}; + TensorND tgrad{args.grad_tensor->raw_ptr(), config.first[2]}; size_t c_pos = 1; auto grp = args.filter_meta.group; @@ -114,9 +114,9 @@ void Convolution3DBackwardDataImpl::AlgoGroupConvGeneral::exec( for (uint32_t g = 0; g < grp; ++g) { config.second->exec(tfilter, tdiff, tgrad, bundle.get_workspace(0)); - incr_voidp(tfilter.raw_ptr, strd_flt); - incr_voidp(tdiff.raw_ptr, strd_diff); - incr_voidp(tgrad.raw_ptr, strd_grad); + incr_refp(tfilter.get_ref_ptr(), strd_flt); + incr_refp(tdiff.get_ref_ptr(), strd_diff); + incr_refp(tgrad.get_ref_ptr(), strd_grad); } } } diff --git a/dnn/src/cuda/convolution3d/backward_filter/cudnn.cpp b/dnn/src/cuda/convolution3d/backward_filter/cudnn.cpp index 8fa38c96..2745970d 100644 --- a/dnn/src/cuda/convolution3d/backward_filter/cudnn.cpp +++ b/dnn/src/cuda/convolution3d/backward_filter/cudnn.cpp @@ -56,9 +56,9 @@ void Convolution3DBackwardFilterImpl::AlgoCUDNN::exec(const ExecArgs& args) cons float alpha = 1.0f, beta = 0.0f; auto status = cudnnConvolutionBackwardFilter( args.handle->cudnn_handle(), &alpha, D.src_desc.desc, - args.src_tensor->raw_ptr, D.diff_desc.desc, args.diff_tensor->raw_ptr, + args.src_tensor->raw_ptr(), D.diff_desc.desc, args.diff_tensor->raw_ptr(), D.conv_desc.desc, m_cudnn_enum, args.workspace.raw_ptr, args.workspace.size, - &beta, D.grad_desc.desc, args.grad_tensor->raw_ptr); + &beta, D.grad_desc.desc, args.grad_tensor->raw_ptr()); megdnn_assert( status == CUDNN_STATUS_SUCCESS, "conv bwd_data 
failed: %s; info: %s", cudnnGetErrorString(status), args.to_string().c_str()); diff --git a/dnn/src/cuda/convolution3d/backward_filter/group_conv.cpp b/dnn/src/cuda/convolution3d/backward_filter/group_conv.cpp index b67b5533..0ffd0911 100644 --- a/dnn/src/cuda/convolution3d/backward_filter/group_conv.cpp +++ b/dnn/src/cuda/convolution3d/backward_filter/group_conv.cpp @@ -98,9 +98,9 @@ void Convolution3DBackwardFilterImpl::AlgoGroupConvGeneral::exec( auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args); { auto config = prepare_sub_opr(args); - TensorND tsrc{args.src_tensor->raw_ptr, config.first[0]}; - TensorND tdiff{args.diff_tensor->raw_ptr, config.first[1]}; - TensorND tgrad{args.grad_tensor->raw_ptr, config.first[2]}; + TensorND tsrc{args.src_tensor->raw_ptr(), config.first[0]}; + TensorND tdiff{args.diff_tensor->raw_ptr(), config.first[1]}; + TensorND tgrad{args.grad_tensor->raw_ptr(), config.first[2]}; size_t c_pos = 1; auto grp = args.grad_filter_meta.group; @@ -116,9 +116,9 @@ void Convolution3DBackwardFilterImpl::AlgoGroupConvGeneral::exec( for (uint32_t g = 0; g < grp; ++g) { config.second->exec(tsrc, tdiff, tgrad, bundle.get_workspace(0)); - incr_voidp(tsrc.raw_ptr, strd_src); - incr_voidp(tdiff.raw_ptr, strd_diff); - incr_voidp(tgrad.raw_ptr, strd_grad); + incr_refp(tsrc.get_ref_ptr(), strd_src); + incr_refp(tdiff.get_ref_ptr(), strd_diff); + incr_refp(tgrad.get_ref_ptr(), strd_grad); } } } diff --git a/dnn/src/cuda/convolution3d/forward/1x1x1.cpp b/dnn/src/cuda/convolution3d/forward/1x1x1.cpp index a5cb327b..83fd0a8e 100644 --- a/dnn/src/cuda/convolution3d/forward/1x1x1.cpp +++ b/dnn/src/cuda/convolution3d/forward/1x1x1.cpp @@ -54,17 +54,17 @@ size_t Convolution3DForwardImpl::Algo1x1x1::get_workspace_in_bytes( void Convolution3DForwardImpl::Algo1x1x1::exec(const ExecArgs& args) const { TensorND A, B, C; extract_matmul_layouts(args, A.layout, B.layout, C.layout); - A.raw_ptr = args.filter_tensor->raw_ptr; - B.raw_ptr = args.src_tensor->raw_ptr; - C.raw_ptr = args.dst_tensor->raw_ptr; + A.reset_ptr(args.filter_tensor->raw_ptr()); + B.reset_ptr(args.src_tensor->raw_ptr()); + C.reset_ptr(args.dst_tensor->raw_ptr()); size_t batch = args.src_layout->shape[0]; auto mm = args.handle->matmul_opr(); auto strd_B = args.src_layout->stride[0] * args.src_layout->dtype.size(), strd_C = args.dst_layout->stride[0] * args.dst_layout->dtype.size(); for (size_t i = 0; i < batch; ++i) { mm->exec(A, B, C, args.workspace); - incr_voidp(B.raw_ptr, strd_B); - incr_voidp(C.raw_ptr, strd_C); + incr_refp(B.get_ref_ptr(), strd_B); + incr_refp(C.get_ref_ptr(), strd_C); } } // vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution3d/forward/cudnn.cpp b/dnn/src/cuda/convolution3d/forward/cudnn.cpp index a3ba334a..f91834dc 100644 --- a/dnn/src/cuda/convolution3d/forward/cudnn.cpp +++ b/dnn/src/cuda/convolution3d/forward/cudnn.cpp @@ -53,9 +53,10 @@ void Convolution3DForwardImpl::AlgoCUDNN::exec(const ExecArgs& args) const { float alpha = 1.0f, beta = 0.0f; auto status = cudnnConvolutionForward( args.handle->cudnn_handle(), &alpha, D.src_desc.desc, - args.src_tensor->raw_ptr, D.filter_desc.desc, args.filter_tensor->raw_ptr, - D.conv_desc.desc, m_cudnn_enum, args.workspace.raw_ptr, args.workspace.size, - &beta, D.dst_desc.desc, args.dst_tensor->raw_ptr); + args.src_tensor->raw_ptr(), D.filter_desc.desc, + args.filter_tensor->raw_ptr(), D.conv_desc.desc, m_cudnn_enum, + args.workspace.raw_ptr, args.workspace.size, &beta, D.dst_desc.desc, + args.dst_tensor->raw_ptr()); megdnn_assert( status == 
CUDNN_STATUS_SUCCESS, "conv fwd failed: %s; info: %s", cudnnGetErrorString(status), args.to_string().c_str()); diff --git a/dnn/src/cuda/convolution3d/forward/group_conv.cpp b/dnn/src/cuda/convolution3d/forward/group_conv.cpp index 6a85e326..3f63bd95 100644 --- a/dnn/src/cuda/convolution3d/forward/group_conv.cpp +++ b/dnn/src/cuda/convolution3d/forward/group_conv.cpp @@ -103,9 +103,9 @@ void Convolution3DForwardImpl::AlgoGroupConvGeneral::exec(const ExecArgs& args) auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args); { auto config = prepare_sub_opr(args); - TensorND tsrc{args.src_tensor->raw_ptr, config.first[0]}; - TensorND tfilter{args.filter_tensor->raw_ptr, config.first[1]}; - TensorND tdst{args.dst_tensor->raw_ptr, config.first[2]}; + TensorND tsrc{args.src_tensor->raw_ptr(), config.first[0]}; + TensorND tfilter{args.filter_tensor->raw_ptr(), config.first[1]}; + TensorND tdst{args.dst_tensor->raw_ptr(), config.first[2]}; size_t c_pos; if (args.filter_meta.format == Param::Format::NCDHW) { @@ -127,9 +127,9 @@ void Convolution3DForwardImpl::AlgoGroupConvGeneral::exec(const ExecArgs& args) for (uint32_t g = 0; g < grp; ++g) { config.second->exec(tsrc, tfilter, tdst, bundle.get_workspace(0)); - incr_voidp(tsrc.raw_ptr, strd_src); - incr_voidp(tdst.raw_ptr, strd_dst); - incr_voidp(tfilter.raw_ptr, strd_flt); + incr_refp(tsrc.get_ref_ptr(), strd_src); + incr_refp(tdst.get_ref_ptr(), strd_dst); + incr_refp(tfilter.get_ref_ptr(), strd_flt); } } } diff --git a/dnn/src/cuda/convolution3d/helper.cpp b/dnn/src/cuda/convolution3d/helper.cpp index 93e2d4fe..1e78196c 100644 --- a/dnn/src/cuda/convolution3d/helper.cpp +++ b/dnn/src/cuda/convolution3d/helper.cpp @@ -35,20 +35,20 @@ bool convolution3d::is_cudnn_supported(const ForwardSizeArgs& args) { } void convolution3d::flip_filter( - const ForwardSizeArgs& args, const Workspace& workspace, void*& raw_ptr) { + const ForwardSizeArgs& args, const Workspace& workspace, RefPtr& ref_ptr) { auto&& fm = args.filter_meta; megdnn_assert(fm.group == 1 && fm.spatial_ndim == 3); auto OC = fm.ocpg, IC = fm.icpg, FD = fm.spatial[0], FH = fm.spatial[1], FW = fm.spatial[2]; auto dtype = DType::from_enum(fm.dtype_enum); megdnn_assert(workspace.size >= dtype.size() * OC * IC * FD * FH * FW); - TensorND src{raw_ptr, {{OC, IC, FD, FH, FW}, dtype}}, + TensorND src{{{OC, IC, FD, FH, FW}, dtype}, ref_ptr}, dst{workspace.raw_ptr + (FD * FH * FW - 1) * dtype.size(), src.layout}; dst.layout.stride[2] = -dst.layout.stride[2]; dst.layout.stride[3] = -dst.layout.stride[3]; dst.layout.stride[4] = -dst.layout.stride[4]; args.handle->relayout_opr()->exec(src, dst); - raw_ptr = workspace.raw_ptr; + ref_ptr.reset(workspace.raw_ptr); } // vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution3d/helper.h b/dnn/src/cuda/convolution3d/helper.h index 4f85f161..a8ca8ebf 100644 --- a/dnn/src/cuda/convolution3d/helper.h +++ b/dnn/src/cuda/convolution3d/helper.h @@ -84,7 +84,7 @@ struct CUDNNBwdFilterDescs { * change \p raw_ptr to workspace. 
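// Why the pointer arguments above moved from void*& and raw member writes to
// RefPtr-based calls, sketched with an assumed (not the actual) RefPtr: the pointer
// sits in a slot shared by every TensorND built from the same RefPtr, so a callee
// such as flip_filter can retarget it to workspace, and the per-group loops can
// advance it with incr_refp, with all views following automatically.
#include <cstddef>
#include <cstdint>
#include <memory>

namespace sketch {

class RefPtr {
public:
    explicit RefPtr(void* ptr = nullptr) : m_slot{std::make_shared<void*>(ptr)} {}
    void reset(void* ptr) { *m_slot = ptr; }  // visible through every copy of the RefPtr
    void* get() const { return *m_slot; }

private:
    std::shared_ptr<void*> m_slot;  // shared with the tensors constructed from it
};

// Assumed semantics of incr_refp(t.get_ref_ptr(), n): advance the storage pointer
// by n bytes, e.g. one group's span, before the next sub-operator call.
inline void incr_refp(RefPtr& ref, std::ptrdiff_t delta_bytes) {
    ref.reset(static_cast<std::uint8_t*>(ref.get()) + delta_bytes);
}

}  // namespace sketch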
*/ void flip_filter( - const ForwardSizeArgs& args, const Workspace& workspace, void*& raw_ptr); + const ForwardSizeArgs& args, const Workspace& workspace, RefPtr& raw_ptr); inline bool cudnn_get_convolution_fwd_algo_helper( cudnnHandle_t cudnn_handle, const cudnnTensorDescriptor_t x_desc, diff --git a/dnn/src/cuda/convpooling/opr_impl.cpp b/dnn/src/cuda/convpooling/opr_impl.cpp index 1273dcad..229458f4 100644 --- a/dnn/src/cuda/convpooling/opr_impl.cpp +++ b/dnn/src/cuda/convpooling/opr_impl.cpp @@ -169,10 +169,10 @@ void ConvPoolingForwardImpl::exec( nonlineMode = IDENTITY; } - float *src_ptr = static_cast(src.raw_ptr), - *filter_ptr = static_cast(filter.raw_ptr), - *bias_ptr = static_cast(bias.raw_ptr), - *dst_ptr = static_cast(dst.raw_ptr); + float *src_ptr = static_cast(src.raw_ptr()), + *filter_ptr = static_cast(filter.raw_ptr()), + *bias_ptr = static_cast(bias.raw_ptr()), + *dst_ptr = static_cast(dst.raw_ptr()); switch (this->param().method) { case Param::Method::WITH_SHARED_MEM: diff --git a/dnn/src/cuda/cumsum/opr_impl.cpp b/dnn/src/cuda/cumsum/opr_impl.cpp index a98eb458..2a7d64df 100644 --- a/dnn/src/cuda/cumsum/opr_impl.cpp +++ b/dnn/src/cuda/cumsum/opr_impl.cpp @@ -12,7 +12,7 @@ #include "./opr_impl.h" #include "./kern.cuh" -#include "src/common/reduce_helper.h" +#include "src/common/reduce_helper_device.h" #include "src/cuda/utils.h" using namespace megdnn; diff --git a/dnn/src/cuda/dct/opr_impl.cpp b/dnn/src/cuda/dct/opr_impl.cpp index 09e141e8..f6753078 100644 --- a/dnn/src/cuda/dct/opr_impl.cpp +++ b/dnn/src/cuda/dct/opr_impl.cpp @@ -58,7 +58,7 @@ void DctChannelSelectForwardImpl::exec( megdnn_assert( param().format == Param::Format::NCHW4, "qint8 only support nchw4"); dct::call_kern_dct( - src.ptr(), (int8_t*)dst.raw_ptr, in, ic, ih, iw, oc, + src.ptr(), (int8_t*)dst.raw_ptr(), in, ic, ih, iw, oc, with_fix_32_mask, mask_offset_ptr, mask_val_ptr, stream, error_info, m_error_tracker, dst.layout.dtype.param<::megdnn::dtype::QuantizedS8>().scale); diff --git a/dnn/src/cuda/elemwise_helper.cpp b/dnn/src/cuda/elemwise_helper.cpp index a8c35fe9..90ad483f 100644 --- a/dnn/src/cuda/elemwise_helper.cpp +++ b/dnn/src/cuda/elemwise_helper.cpp @@ -227,7 +227,7 @@ INST(dt_quint8); template void ParamElemVisitor4bitBase::host_init( const TensorND& rv, int /*grid_size*/, int /*block_size*/) { - m_ptr = reinterpret_cast(rv.raw_ptr); + m_ptr = reinterpret_cast(rv.raw_ptr()); ptrdiff_t min_stride = std::numeric_limits::max(); for (size_t i = 0; i < rv.layout.ndim; ++i) { m_stride[i] = rv.layout.stride[i]; diff --git a/dnn/src/cuda/elemwise_multi_type/opr_impl.cpp b/dnn/src/cuda/elemwise_multi_type/opr_impl.cpp index d9d7f8d8..d9250d42 100644 --- a/dnn/src/cuda/elemwise_multi_type/opr_impl.cpp +++ b/dnn/src/cuda/elemwise_multi_type/opr_impl.cpp @@ -21,31 +21,31 @@ using namespace megdnn; using namespace cuda; void ElemwiseMultiTypeImpl::on_fuse_mul_add3_int16x32x32x32( - const ElemwiseOpParamN<3>& param, dt_int32* dst) { + const ElemwiseOpParamN<3>& param, const TensorND& dst) { BroadcastChannelInfo binfo0, binfo1; if (is_vector(param[0].layout) && is_broadcasted_channel_like(param[1].layout, binfo0) && is_broadcasted_channel_like(param[2].layout, binfo1) && binfo0 == binfo1) { elemwise_multi_type::fma3_int16x32x32x32_1c1( - param, dst, cuda_stream(this->handle())); + param, dst.ptr(), cuda_stream(this->handle())); return; } megdnn_throw("unsupported fma3 int16x32x32x32 layout"); } void ElemwiseMultiTypeImpl::on_fuse_mul_add3_iXxf32xf32xi8( - const ElemwiseOpParamN<3>& param, dt_int8* dst) { 
+ const ElemwiseOpParamN<3>& param, const TensorND& dst) { Broadcast1xInfo binfo0, binfo1; auto p1 = param[1].ptr(), p2 = param[2].ptr(); auto stream = cuda_stream(this->handle()); if (is_vector(param[0].layout) && is_broadcasted_1x(param[1].layout, binfo0) && is_broadcasted_1x(param[2].layout, binfo1) && binfo0 == binfo1) { switch (param[0].layout.dtype.enumv()) { -#define cb(t) \ - case DTypeTrait::enumv: \ - elemwise_multi_type::fma3_iXxf32xf32xi8_bcast_1x( \ - param[0].ptr::ctype>(), p1, p2, dst, binfo0.x, binfo0.y, \ - stream); \ +#define cb(t) \ + case DTypeTrait::enumv: \ + elemwise_multi_type::fma3_iXxf32xf32xi8_bcast_1x( \ + param[0].ptr::ctype>(), p1, p2, dst.ptr(), \ + binfo0.x, binfo0.y, stream); \ return; MEGDNN_FOREACH_COMPUTING_DTYPE_INT(cb) #undef cb @@ -58,14 +58,14 @@ void ElemwiseMultiTypeImpl::on_fuse_mul_add3_iXxf32xf32xi8( } void ElemwiseMultiTypeImpl::on_round_shr_saturate_iXxi8xi8( - const ElemwiseOpParamN<2>& param, dt_int8* dst) { + const ElemwiseOpParamN<2>& param, const TensorND& dst) { auto stream = cuda_stream(this->handle()); if (is_vector(param[0].layout) && is_broadcasted_scalar(param[1].layout)) { switch (param[0].layout.dtype.enumv()) { -#define DISPATCH(t) \ - case DTypeTrait::enumv: \ - elemwise_multi_type::round_shr_saturate_iXxi8xiX_scalar< \ - DTypeTrait::ctype, dt_int8>(param, dst, stream); \ +#define DISPATCH(t) \ + case DTypeTrait::enumv: \ + elemwise_multi_type::round_shr_saturate_iXxi8xiX_scalar< \ + DTypeTrait::ctype, dt_int8>(param, dst.ptr(), stream); \ return; DISPATCH(::megdnn::dtype::Int32) DISPATCH(::megdnn::dtype::Int16) @@ -85,7 +85,7 @@ void ElemwiseMultiTypeImpl::on_round_shr_saturate_iXxi8xi8( } void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8( - const ElemwiseOpParamN<6>& param, dt_int8* dst) { + const ElemwiseOpParamN<6>& param, const TensorND& dst) { auto stream = cuda_stream(this->handle()); BroadcastChannelInfo info; if (is_vector(param[0].layout) && @@ -95,7 +95,7 @@ void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8( is_broadcasted_scalar(param[4].layout) && is_broadcasted_scalar(param[5].layout)) { elemwise_multi_type::fuse_add_rmulh_round_shr_saturate_bcast_1c11( - param, dst, stream); + param, dst.ptr(), stream); return; } megdnn_throw( @@ -106,7 +106,7 @@ void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8( } void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int32x32x32x8( - const ElemwiseOpParamN<6>& param, dt_int8* dst) { + const ElemwiseOpParamN<6>& param, const TensorND& dst) { auto stream = cuda_stream(this->handle()); BroadcastChannelInfo info; if (is_vector(param[0].layout) && @@ -116,7 +116,7 @@ void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int32x32x32x8( is_broadcasted_scalar(param[4].layout) && is_broadcasted_scalar(param[5].layout)) { elemwise_multi_type::fuse_add_rmulh_round_shr_saturate_bcast_1c11( - param, dst, stream); + param, dst.ptr(), stream); return; } megdnn_throw( @@ -127,14 +127,14 @@ void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int32x32x32x8( } void ElemwiseMultiTypeImpl::on_round_shr_saturate_iXxi8xi16( - const ElemwiseOpParamN<2>& param, dt_int16* dst) { + const ElemwiseOpParamN<2>& param, const TensorND& dst) { auto stream = cuda_stream(this->handle()); if (is_vector(param[0].layout) && is_broadcasted_scalar(param[1].layout)) { switch (param[0].layout.dtype.enumv()) { -#define DISPATCH(t) \ - case DTypeTrait::enumv: \ - 
elemwise_multi_type::round_shr_saturate_iXxi8xiX_scalar< \ - DTypeTrait::ctype, dt_int16>(param, dst, stream); \ +#define DISPATCH(t) \ + case DTypeTrait::enumv: \ + elemwise_multi_type::round_shr_saturate_iXxi8xiX_scalar< \ + DTypeTrait::ctype, dt_int16>(param, dst.ptr(), stream); \ return; DISPATCH(::megdnn::dtype::Int32) DISPATCH(::megdnn::dtype::Int16) @@ -227,22 +227,22 @@ IMPL_MODE_DISPATCHER(2, dt_quint4, dt_qint32); #undef _cb_dispatch_mode -#define _cb_dispatch_mode(_m) \ - case param::Elemwise::Mode::_m: \ - do { \ - using KernImpl = ElemwiseKern< \ - megcorePlatformCUDA, param_enumv::Elemwise::Mode::_m, float>; \ - using Op = kern_ops_quantized::QuantizedMultiTypeOp< \ - arity, src_ctype, dst_ctype, KernImpl>; \ - using dst_storage = typename VectTypeTrait::Storage; \ - dst_storage* dst = reinterpret_cast(dst_tensor.raw_ptr); \ - Op op(src_params, dst, dst_param); \ - ElemwiseOpParamN<1> param_dst; \ - param_dst[0] = dst_tensor; \ - param_dst.init_from_given_tensor(); \ - run_elemwise( \ - param, param_dst, stream, op); \ - return; \ +#define _cb_dispatch_mode(_m) \ + case param::Elemwise::Mode::_m: \ + do { \ + using KernImpl = ElemwiseKern< \ + megcorePlatformCUDA, param_enumv::Elemwise::Mode::_m, float>; \ + using Op = kern_ops_quantized::QuantizedMultiTypeOp< \ + arity, src_ctype, dst_ctype, KernImpl>; \ + using dst_storage = typename VectTypeTrait::Storage; \ + dst_storage* dst = reinterpret_cast(dst_tensor.raw_ptr()); \ + Op op(src_params, dst, dst_param); \ + ElemwiseOpParamN<1> param_dst; \ + param_dst[0] = dst_tensor; \ + param_dst.init_from_given_tensor(); \ + run_elemwise( \ + param, param_dst, stream, op); \ + return; \ } while (0); #define FOREACH(cb) \ diff --git a/dnn/src/cuda/elemwise_multi_type/opr_impl.h b/dnn/src/cuda/elemwise_multi_type/opr_impl.h index 2d711c77..4ce88bde 100644 --- a/dnn/src/cuda/elemwise_multi_type/opr_impl.h +++ b/dnn/src/cuda/elemwise_multi_type/opr_impl.h @@ -18,22 +18,22 @@ namespace cuda { class ElemwiseMultiTypeImpl final : public ElemwiseMultiTypeImplHelper { void on_fuse_mul_add3_int16x32x32x32( - const ElemwiseOpParamN<3>& param, dt_int32* dst) override; + const ElemwiseOpParamN<3>& param, const TensorND& dst) override; void on_fuse_mul_add3_iXxf32xf32xi8( - const ElemwiseOpParamN<3>& param, dt_int8* dst) override; + const ElemwiseOpParamN<3>& param, const TensorND& dst) override; void on_round_shr_saturate_iXxi8xi8( - const ElemwiseOpParamN<2>& param, dt_int8* dst) override; + const ElemwiseOpParamN<2>& param, const TensorND& dst) override; void on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8( - const ElemwiseOpParamN<6>& param, dt_int8* dst) override; + const ElemwiseOpParamN<6>& param, const TensorND& dst) override; void on_fuse_add_rmulh_round_shr_saturate_int32x32x32x8( - const ElemwiseOpParamN<6>& param, dt_int8* dst) override; + const ElemwiseOpParamN<6>& param, const TensorND& dst) override; void on_round_shr_saturate_iXxi8xi16( - const ElemwiseOpParamN<2>& param, dt_int16* dst) override; + const ElemwiseOpParamN<2>& param, const TensorND& dst) override; void on_quantized_mode( const ElemwiseOpParamN<1>& param, const TensorND& dst, diff --git a/dnn/src/cuda/group_local/forward/opr_impl.cpp b/dnn/src/cuda/group_local/forward/opr_impl.cpp index 5776e82f..44cc7092 100644 --- a/dnn/src/cuda/group_local/forward/opr_impl.cpp +++ b/dnn/src/cuda/group_local/forward/opr_impl.cpp @@ -32,11 +32,6 @@ std::unique_ptr get_opr(Handle* handle, param::Convolution param) return std::move(opr); } -template -void incr_ptr(T*& dst, ptrdiff_t 
delta) { - dst = reinterpret_cast(reinterpret_cast(dst) + delta); -} - TensorLayout prepare_src_dst(const TensorLayout& input, size_t g) { TensorLayout ret = input; megdnn_assert(ret[1] % g == 0); @@ -84,18 +79,20 @@ void GroupLocalForwardImpl::exec( SH, SW, stream); } else { auto&& opr = get_opr(handle, param()); - TensorND src_g = {src.raw_ptr, prepare_src_dst(src.layout, G)}; - TensorND dst_g = {dst.raw_ptr, prepare_src_dst(dst.layout, G)}; - TensorND filter_g = {filter.raw_ptr, prepare_filter(filter.layout)}; + TensorND src_g = {src.raw_ptr(), prepare_src_dst(src.layout, G)}; + TensorND dst_g = {dst.raw_ptr(), prepare_src_dst(dst.layout, G)}; + TensorND filter_g = {filter.raw_ptr(), prepare_filter(filter.layout)}; for (size_t g = 0; g < G; ++g) { opr->exec(src_g, filter_g, dst_g, workspace); - incr_ptr( - src_g.raw_ptr, src_g.layout.stride[1] * src_g.layout.shape[1] * - src_g.layout.dtype.size()); - incr_ptr( - dst_g.raw_ptr, dst_g.layout.stride[1] * dst_g.layout.shape[1] * - dst_g.layout.dtype.size()); - incr_ptr(filter_g.raw_ptr, filter_g.layout.span().dist_byte()); + incr_refp( + src_g.get_ref_ptr(), src_g.layout.stride[1] * + src_g.layout.shape[1] * + src_g.layout.dtype.size()); + incr_refp( + dst_g.get_ref_ptr(), dst_g.layout.stride[1] * + dst_g.layout.shape[1] * + dst_g.layout.dtype.size()); + incr_refp(filter_g.get_ref_ptr(), filter_g.layout.span().dist_byte()); } } } diff --git a/dnn/src/cuda/local_share/backward_data/batched_matmul.cpp b/dnn/src/cuda/local_share/backward_data/batched_matmul.cpp index 155a9aa2..0424cc7d 100644 --- a/dnn/src/cuda/local_share/backward_data/batched_matmul.cpp +++ b/dnn/src/cuda/local_share/backward_data/batched_matmul.cpp @@ -106,7 +106,7 @@ void LocalShareBackwardDataImpl::AlgoBatchedMatMul::exec(const ExecArgs& args) c B1.stride[4] = wo; B1.stride[5] = 1; B1.stride[6] = co * ho * wo; - TensorND ts_B1{args.diff_tensor->raw_ptr, B1}; + TensorND ts_B1{args.diff_tensor->raw_ptr(), B1}; TensorLayout B2{ {groups * sgh * sgw, ocpg, ho / sgh * wo / sgw * n}, dtype::Float32()}; B2.init_contiguous_stride(); @@ -122,7 +122,7 @@ void LocalShareBackwardDataImpl::AlgoBatchedMatMul::exec(const ExecArgs& args) c TensorLayout C{ {groups * sgh * sgw, icpg * fh * fw, ho / sgh * wo / sgw * n}, dtype::Float32()}; - TensorND ts_A{args.filter_tensor->raw_ptr, A}; + TensorND ts_A{args.filter_tensor->raw_ptr(), A}; TensorND ts_B{ws_pretranspose, B}; TensorND ts_C{ws_col2im, C}; Workspace ws_wrapper; diff --git a/dnn/src/cuda/local_share/backward_filter/batched_matmul.cpp b/dnn/src/cuda/local_share/backward_filter/batched_matmul.cpp index 5ad6153a..ef1fcce1 100644 --- a/dnn/src/cuda/local_share/backward_filter/batched_matmul.cpp +++ b/dnn/src/cuda/local_share/backward_filter/batched_matmul.cpp @@ -113,7 +113,7 @@ void LocalShareBackwardFilterImpl::AlgoBatchedMatMul::exec(const ExecArgs& args) B1.stride[4] = co * ho * wo; B1.stride[5] = wo; B1.stride[6] = 1; - TensorND ts_B1{args.diff_tensor->raw_ptr, B1}; + TensorND ts_B1{args.diff_tensor->raw_ptr(), B1}; TensorLayout B2{ {groups * sgh * sgw, ocpg, ho / sgh * wo / sgw * n}, dtype::Float32()}; B2.init_contiguous_stride(); @@ -133,7 +133,7 @@ void LocalShareBackwardFilterImpl::AlgoBatchedMatMul::exec(const ExecArgs& args) TensorLayout C{{groups * sgh * sgw, icpg * fh * fw, ocpg}, dtype::Float32()}; TensorND ts_A{ws_im2col, A}; TensorND ts_B{ws_pretranspose, B}; - TensorND ts_C{args.grad_tensor->raw_ptr, C}; + TensorND ts_C{args.grad_tensor->raw_ptr(), C}; Workspace ws_wrapper; ws_wrapper.raw_ptr = 
reinterpret_cast(ws_matmul); ws_wrapper.size = ws.get_size(2); diff --git a/dnn/src/cuda/local_share/forward/batched_matmul.cpp b/dnn/src/cuda/local_share/forward/batched_matmul.cpp index d0eaf061..ad1535a2 100644 --- a/dnn/src/cuda/local_share/forward/batched_matmul.cpp +++ b/dnn/src/cuda/local_share/forward/batched_matmul.cpp @@ -100,7 +100,7 @@ void LocalShareForwardImpl::AlgoBatchedMatMul::exec(const ExecArgs& args) const TensorLayout C{ {groups * sgh * sgw, ho / sgh * wo / sgw * n, ocpg}, dtype::Float32()}; TensorND ts_A{ws_im2col, A}; - TensorND ts_B{args.filter_tensor->raw_ptr, B}; + TensorND ts_B{args.filter_tensor->raw_ptr(), B}; TensorND ts_C{ws_posttranspose, C}; Workspace ws_wrapper; ws_wrapper.raw_ptr = reinterpret_cast(ws_matmul); @@ -119,7 +119,7 @@ void LocalShareForwardImpl::AlgoBatchedMatMul::exec(const ExecArgs& args) const C1.stride[6] = ocpg; TensorLayout C2 = args.dst_layout; TensorND ts_C1{ws_posttranspose, C1}; - TensorND ts_C2{args.dst_tensor->raw_ptr, C2}; + TensorND ts_C2{args.dst_tensor->raw_ptr(), C2}; auto&& relayout_opr = args.opr->handle()->create_operator(); relayout_opr->exec(ts_C1, ts_C2); } diff --git a/dnn/src/cuda/lrn/opr_impl.cpp b/dnn/src/cuda/lrn/opr_impl.cpp index ffc37c23..e1aca07f 100644 --- a/dnn/src/cuda/lrn/opr_impl.cpp +++ b/dnn/src/cuda/lrn/opr_impl.cpp @@ -29,7 +29,7 @@ void LRNForwardImpl::exec( float alpha = 1.0f, beta = 0.0f; cudnn_check(cudnnLRNCrossChannelForward( handle, lrn_desc.desc, CUDNN_LRN_CROSS_CHANNEL_DIM1, &alpha, src_desc.desc, - src.raw_ptr, &beta, dst_desc.desc, dst.raw_ptr)); + src.raw_ptr(), &beta, dst_desc.desc, dst.raw_ptr())); } void LRNBackwardImpl::setup_descs( @@ -51,8 +51,8 @@ void LRNBackwardImpl::exec( float alpha = 1.0f, beta = 0.0f; cudnn_check(cudnnLRNCrossChannelBackward( handle, lrn_desc.desc, CUDNN_LRN_CROSS_CHANNEL_DIM1, &alpha, dst_desc.desc, - dst.raw_ptr, diff_desc.desc, diff.raw_ptr, src_desc.desc, src.raw_ptr, - &beta, grad_desc.desc, grad.raw_ptr)); + dst.raw_ptr(), diff_desc.desc, diff.raw_ptr(), src_desc.desc, src.raw_ptr(), + &beta, grad_desc.desc, grad.raw_ptr())); } } // namespace cuda diff --git a/dnn/src/cuda/matrix_inverse/opr_impl.cpp b/dnn/src/cuda/matrix_inverse/opr_impl.cpp index fe775708..31185417 100644 --- a/dnn/src/cuda/matrix_inverse/opr_impl.cpp +++ b/dnn/src/cuda/matrix_inverse/opr_impl.cpp @@ -37,11 +37,11 @@ void MatrixInverseImpl::exec( auto stream = handle->stream(); batched_matrix_mul::arange( reinterpret_cast(psrc_batch), - reinterpret_cast(src.raw_ptr), n * n * sizeof(float), batch, + reinterpret_cast(src.raw_ptr()), n * n * sizeof(float), batch, stream); batched_matrix_mul::arange( reinterpret_cast(pdst_batch), - reinterpret_cast(dst.raw_ptr), n * n * sizeof(float), batch, + reinterpret_cast(dst.raw_ptr()), n * n * sizeof(float), batch, stream); cublas_check(cublasSmatinvBatched( handle->cublas_handle(), n, psrc_batch, n, pdst_batch, n, info, batch)); diff --git a/dnn/src/cuda/matrix_mul/conv1x1.cpp b/dnn/src/cuda/matrix_mul/conv1x1.cpp index 3ef07a73..ef15192a 100644 --- a/dnn/src/cuda/matrix_mul/conv1x1.cpp +++ b/dnn/src/cuda/matrix_mul/conv1x1.cpp @@ -137,7 +137,7 @@ void MatrixMulForwardImpl::AlgoConv1X1CUDNN::exec(const ExecArgs& args) const { {ori_tensor.layout.shape[1], ori_tensor.layout.shape[0]}, ori_tensor.layout.dtype); dst_tensor = TensorND(bundle.get(workspace_pos), dst_layout); - TensorND src_tensor(ori_tensor.raw_ptr, dst_layout); + TensorND src_tensor(ori_tensor.raw_ptr(), dst_layout); src_tensor.layout.stride[0] = ori_tensor.layout.stride[1]; 
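// The conv1x1 view being set up right here is a zero-copy transpose: reuse the
// original storage via raw_ptr(), give the view the transposed shape, and swap the
// two strides; the following relayout then materialises the transpose. A sketch on
// a plain row-major matrix (MatView and the helper names are illustrative only):
#include <cstddef>
#include <vector>

namespace sketch {

struct MatView {
    const float* data;
    std::size_t rows, cols;           // logical (already transposed) shape
    std::ptrdiff_t stride0, stride1;  // in elements
    float at(std::size_t r, std::size_t c) const {
        return data[static_cast<std::ptrdiff_t>(r) * stride0 +
                    static_cast<std::ptrdiff_t>(c) * stride1];
    }
};

// View an m x n row-major matrix as its n x m transpose by swapping the strides.
inline MatView transposed_view(const float* data, std::size_t m, std::size_t n) {
    return MatView{data, n, m, /*stride0=*/1, /*stride1=*/static_cast<std::ptrdiff_t>(n)};
}

// Materialise the transpose, which is what the relayout operator does for the view.
inline std::vector<float> relayout(const MatView& v) {
    std::vector<float> out(v.rows * v.cols);
    for (std::size_t r = 0; r < v.rows; ++r)
        for (std::size_t c = 0; c < v.cols; ++c)
            out[r * v.cols + c] = v.at(r, c);
    return out;
}

}  // namespace sketch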
src_tensor.layout.stride[1] = ori_tensor.layout.stride[0]; @@ -156,11 +156,11 @@ void MatrixMulForwardImpl::AlgoConv1X1CUDNN::exec(const ExecArgs& args) const { TensorLayout filter_layout({m, k, 1, 1}, args.layout_a.dtype); TensorLayout dst_layout({1, m, 1, n}, args.layout_c.dtype); - TensorND src(B_dst_tensor.raw_ptr, src_layout); - TensorND filter(A_dst_tensor.raw_ptr, filter_layout); + TensorND src(B_dst_tensor.raw_ptr(), src_layout); + TensorND filter(A_dst_tensor.raw_ptr(), filter_layout); TensorND z(nullptr, TensorLayout(src_layout.dtype)); TensorND bias(nullptr, TensorLayout(src_layout.dtype)); - TensorND dst(args.tensor_c.raw_ptr, dst_layout); + TensorND dst(args.tensor_c.raw_ptr(), dst_layout); ConvBiasForwardImpl::AlgoBase::ExecArgs conv_exec_args( static_cast(conv_opr_ptr.get()), src, filter, bias, z, diff --git a/dnn/src/cuda/matrix_mul/cublas.cpp b/dnn/src/cuda/matrix_mul/cublas.cpp index bf689ea0..6f35ae2f 100644 --- a/dnn/src/cuda/matrix_mul/cublas.cpp +++ b/dnn/src/cuda/matrix_mul/cublas.cpp @@ -77,10 +77,10 @@ void MatrixMulForwardImpl::AlgoCuBlas::exec(const ExecArgs& args) const { auto sgemm_ex_err = cublasSgemmEx( cublas_handle, param.transposeB ? CUBLAS_OP_T : CUBLAS_OP_N, param.transposeA ? CUBLAS_OP_T : CUBLAS_OP_N, n, m, k, one, - args.tensor_b.raw_ptr, SE_CUDA_DATA_HALF, - args.tensor_b.layout.stride[0], args.tensor_a.raw_ptr, + args.tensor_b.raw_ptr(), SE_CUDA_DATA_HALF, + args.tensor_b.layout.stride[0], args.tensor_a.raw_ptr(), SE_CUDA_DATA_HALF, args.tensor_a.layout.stride[0], zero, - args.tensor_c.raw_ptr, SE_CUDA_DATA_HALF, + args.tensor_c.raw_ptr(), SE_CUDA_DATA_HALF, args.tensor_c.layout.stride[0]); cublas_check(sgemm_ex_err); #if CUDART_VERSION >= 9000 @@ -97,11 +97,11 @@ void MatrixMulForwardImpl::AlgoCuBlas::exec(const ExecArgs& args) const { auto hgemm_ex_err = cublasHgemm( cublas_handle, param.transposeB ? CUBLAS_OP_T : CUBLAS_OP_N, param.transposeA ? CUBLAS_OP_T : CUBLAS_OP_N, n, m, k, one_half, - static_cast(args.tensor_b.raw_ptr), + static_cast(args.tensor_b.raw_ptr()), args.tensor_b.layout.stride[0], - static_cast(args.tensor_a.raw_ptr), + static_cast(args.tensor_a.raw_ptr()), args.tensor_a.layout.stride[0], zero_half, - static_cast<__half*>(args.tensor_c.raw_ptr), + static_cast<__half*>(args.tensor_c.raw_ptr()), args.tensor_c.layout.stride[0]); cublas_check(hgemm_ex_err); #if CUDART_VERSION >= 9000 @@ -115,10 +115,10 @@ void MatrixMulForwardImpl::AlgoCuBlas::exec(const ExecArgs& args) const { cublas_check(cublasGemmEx( cublas_handle, param.transposeB ? CUBLAS_OP_T : CUBLAS_OP_N, param.transposeA ? 
CUBLAS_OP_T : CUBLAS_OP_N, n, m, k, one, - args.tensor_b.raw_ptr, CUDA_R_8I, args.tensor_b.layout.stride[0], - args.tensor_a.raw_ptr, CUDA_R_8I, args.tensor_a.layout.stride[0], zero, - args.tensor_c.raw_ptr, CUDA_R_32I, args.tensor_c.layout.stride[0], - CUBLAS_COMPUTE_32I, CUBLAS_GEMM_DFALT)); + args.tensor_b.raw_ptr(), CUDA_R_8I, args.tensor_b.layout.stride[0], + args.tensor_a.raw_ptr(), CUDA_R_8I, args.tensor_a.layout.stride[0], + zero, args.tensor_c.raw_ptr(), CUDA_R_32I, + args.tensor_c.layout.stride[0], CUBLAS_COMPUTE_32I, CUBLAS_GEMM_DFALT)); }; // Note that cublas takes column-major matrices as inputs, diff --git a/dnn/src/cuda/matrix_mul/cublas_lt.cpp b/dnn/src/cuda/matrix_mul/cublas_lt.cpp index e90e071b..80260da8 100644 --- a/dnn/src/cuda/matrix_mul/cublas_lt.cpp +++ b/dnn/src/cuda/matrix_mul/cublas_lt.cpp @@ -67,10 +67,10 @@ void MatrixMulForwardImpl::AlgoCuBlasLt::exec(const ExecArgs& args) const { "workspace bundle size should be 1(ws_algo)"); cublas_check(cublasLtMatmul( cublasLt_handle, desc.matmul_desc, one_half, - static_cast(args.tensor_b.raw_ptr), desc.layout_b, - static_cast(args.tensor_a.raw_ptr), desc.layout_a, - zero_half, static_cast(args.tensor_c.raw_ptr), - desc.layout_c, static_cast<__half*>(args.tensor_c.raw_ptr), + static_cast(args.tensor_b.raw_ptr()), desc.layout_b, + static_cast(args.tensor_a.raw_ptr()), desc.layout_a, + zero_half, static_cast(args.tensor_c.raw_ptr()), + desc.layout_c, static_cast<__half*>(args.tensor_c.raw_ptr()), desc.layout_c, &algo, ws_bundle.get(0), ws_bundle.get_size(0), stream)); }; auto igemm = [&]() { @@ -90,14 +90,14 @@ void MatrixMulForwardImpl::AlgoCuBlasLt::exec(const ExecArgs& args) const { transform_desc, CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE, &pm, sizeof(pm))); cublas_check(cublasLtMatrixTransform( - cublasLt_handle, transform_desc, one, args.tensor_b.raw_ptr, + cublasLt_handle, transform_desc, one, args.tensor_b.raw_ptr(), desc.layout_b, zero, nullptr, nullptr, ws_b, desc.layout_trans_b, stream)); cublas_check(cublasLtMatrixTransformDescSetAttribute( transform_desc, CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA, &trans_a, sizeof(trans_a))); cublas_check(cublasLtMatrixTransform( - cublasLt_handle, transform_desc, one, args.tensor_a.raw_ptr, + cublasLt_handle, transform_desc, one, args.tensor_a.raw_ptr(), desc.layout_a, zero, nullptr, nullptr, ws_a, desc.layout_trans_a, stream)); cublas_check(cublasLtMatmul( @@ -110,7 +110,7 @@ void MatrixMulForwardImpl::AlgoCuBlasLt::exec(const ExecArgs& args) const { sizeof(trans_c))); cublas_check(cublasLtMatrixTransform( cublasLt_handle, transform_desc, one, ws_c, desc.layout_trans_c, zero, - nullptr, nullptr, args.tensor_c.raw_ptr, desc.layout_c, stream)); + nullptr, nullptr, args.tensor_c.raw_ptr(), desc.layout_c, stream)); cublas_check(cublasLtMatrixTransformDescDestroy(transform_desc)); }; #if CUDA_VERSION >= 11000 diff --git a/dnn/src/cuda/matrix_mul/cutlass_float16_tensorop.cpp b/dnn/src/cuda/matrix_mul/cutlass_float16_tensorop.cpp index ca054be8..9afc2d08 100644 --- a/dnn/src/cuda/matrix_mul/cutlass_float16_tensorop.cpp +++ b/dnn/src/cuda/matrix_mul/cutlass_float16_tensorop.cpp @@ -133,10 +133,10 @@ void MatrixMulForwardImpl::AlgoFloat16TensorOp::do_exec(const ExecArgs& args) co GemmArguments gemm_args{ problem_size, - args.tensor_a.raw_ptr, - args.tensor_b.raw_ptr, - args.tensor_c.raw_ptr, - args.tensor_c.raw_ptr, + args.tensor_a.raw_ptr(), + args.tensor_b.raw_ptr(), + args.tensor_c.raw_ptr(), + args.tensor_c.raw_ptr(), lda, ldb, ldc, diff --git 
a/dnn/src/cuda/matrix_mul/cutlass_float16_tensorop_split_k.cpp b/dnn/src/cuda/matrix_mul/cutlass_float16_tensorop_split_k.cpp index cd44216b..61050696 100644 --- a/dnn/src/cuda/matrix_mul/cutlass_float16_tensorop_split_k.cpp +++ b/dnn/src/cuda/matrix_mul/cutlass_float16_tensorop_split_k.cpp @@ -144,10 +144,10 @@ void MatrixMulForwardImpl::AlgoFloat16TensorOpSplitK::do_exec( GemmArguments gemm_args{ problem_size, - args.tensor_a.raw_ptr, - args.tensor_b.raw_ptr, - args.tensor_c.raw_ptr, - args.tensor_c.raw_ptr, + args.tensor_a.raw_ptr(), + args.tensor_b.raw_ptr(), + args.tensor_c.raw_ptr(), + args.tensor_c.raw_ptr(), lda, ldb, ldc, diff --git a/dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp b/dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp index f9ce2d0d..7c03396c 100644 --- a/dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp +++ b/dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp @@ -97,10 +97,10 @@ void MatrixMulForwardImpl::AlgoFloat32SIMT::do_exec(const ExecArgs& args) const GemmArguments gemm_args{ problem_size, - args.tensor_a.raw_ptr, - args.tensor_b.raw_ptr, - args.tensor_c.raw_ptr, - args.tensor_c.raw_ptr, + args.tensor_a.raw_ptr(), + args.tensor_b.raw_ptr(), + args.tensor_c.raw_ptr(), + args.tensor_c.raw_ptr(), lda, ldb, ldc, diff --git a/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp b/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp index 74cfc465..7783e65a 100644 --- a/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp +++ b/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp @@ -103,10 +103,10 @@ void MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::do_exec(const ExecArgs& args) GemmArguments gemm_args{ problem_size, - args.tensor_a.raw_ptr, - args.tensor_b.raw_ptr, - args.tensor_c.raw_ptr, - args.tensor_c.raw_ptr, + args.tensor_a.raw_ptr(), + args.tensor_b.raw_ptr(), + args.tensor_c.raw_ptr(), + args.tensor_c.raw_ptr(), lda, ldb, ldc, diff --git a/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_base.cpp b/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_base.cpp index 30d11960..541144a7 100644 --- a/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_base.cpp +++ b/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_base.cpp @@ -51,15 +51,12 @@ void MatrixMulForwardImpl::AlgoCutlassMatrixMulBase::exec(const ExecArgs& args) if (!aligned.first) return do_exec(args); const auto& layouts = aligned.second; - auto tensor_a = args.tensor_a; - auto tensor_b = args.tensor_b; - auto workspace = args.workspace; size_t copy_size = 0; for (const auto& ly : layouts) copy_size += ly.span().dist_byte(); auto&& param = args.opr->param(); auto&& stream = cuda_stream(args.opr->handle()); - + auto workspace = args.workspace; cuda_check(cudaMemsetAsync(workspace.raw_ptr, 0, copy_size, stream)); auto&& relayout = args.opr->handle()->create_operator(); @@ -69,14 +66,15 @@ void MatrixMulForwardImpl::AlgoCutlassMatrixMulBase::exec(const ExecArgs& args) if (trans) std::swap(dst.stride[0], dst.stride[1]); }; + + TensorND tensor_a{workspace.raw_ptr, args.tensor_a.layout}; copy_stride(layouts[0], tensor_a.layout, param.transposeA); - tensor_a.raw_ptr = workspace.raw_ptr; relayout->exec(args.tensor_a, tensor_a); workspace.raw_ptr += layouts[0].span().dist_byte(); workspace.size -= layouts[0].span().dist_byte(); + TensorND tensor_b{workspace.raw_ptr, args.tensor_b.layout}; copy_stride(layouts[1], tensor_b.layout, param.transposeB); - tensor_b.raw_ptr = workspace.raw_ptr; relayout->exec(args.tensor_b, tensor_b); workspace.raw_ptr += layouts[1].span().dist_byte(); workspace.size -= 
layouts[1].span().dist_byte(); diff --git a/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/wmma_matrix_mul.cpp b/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/wmma_matrix_mul.cpp index 5ed4ed1b..89703bfc 100644 --- a/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/wmma_matrix_mul.cpp +++ b/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/wmma_matrix_mul.cpp @@ -29,13 +29,13 @@ void megdnn::cuda::matrix_mul::exec_wmma_matrix_mul_quint4_nt( int32_t zA = A.layout.dtype.param().zero_point, zB = B.layout.dtype.param().zero_point; exec_reduce_sum_with_scale_uint4( - static_cast(A.raw_ptr), -zB, M, K, ldA / 2, + static_cast(A.raw_ptr()), -zB, M, K, ldA / 2, workspace.ptr(), stream); exec_reduce_sum_with_scale_uint4( - static_cast(B.raw_ptr), -zA, N, K, ldB / 2, + static_cast(B.raw_ptr()), -zA, N, K, ldB / 2, workspace.ptr() + M, stream); exec_wmma_gemm_u4( - static_cast(A.raw_ptr), static_cast(B.raw_ptr), + static_cast(A.raw_ptr()), static_cast(B.raw_ptr()), C.compatible_ptr(), M, N, K, ldA, ldB, ldC, stream); exec_span_qsum( workspace.ptr(), M, workspace.ptr() + M, N, diff --git a/dnn/src/cuda/padding/padding.cu b/dnn/src/cuda/padding/padding.cu index d1a761b4..4bd91b78 100644 --- a/dnn/src/cuda/padding/padding.cu +++ b/dnn/src/cuda/padding/padding.cu @@ -223,7 +223,7 @@ void padding_backward_proxy( params.offsets[i * 2 + 1] = offsets[i * 2 + 1]; } - cudaMemset(dst.raw_ptr, 0, dst.layout.access_bytes()); + cudaMemset(dst.raw_ptr(), 0, dst.layout.access_bytes()); void (*bwd_kern)(const size_t, const size_t, const T* const, T* const, ShapeParams); diff --git a/dnn/src/cuda/param_pack/opr_impl.cpp b/dnn/src/cuda/param_pack/opr_impl.cpp index 8bb72851..e787055e 100644 --- a/dnn/src/cuda/param_pack/opr_impl.cpp +++ b/dnn/src/cuda/param_pack/opr_impl.cpp @@ -28,7 +28,7 @@ void ParamPackConcatImpl::exec_internal( size_t inp_size = srcs.layout.shape[0], out_size = dst.layout.total_nr_elems(); auto stream = cuda_stream(this->handle()); - auto src_cpu = static_cast(srcs.raw_ptr); + auto src_cpu = static_cast(srcs.raw_ptr()); megdnn_assert_internal(src_cpu); auto src_gpu = reinterpret_cast(workspace.raw_ptr); diff --git a/dnn/src/cuda/pooling/algo.cpp b/dnn/src/cuda/pooling/algo.cpp index a689f12e..e04360e5 100644 --- a/dnn/src/cuda/pooling/algo.cpp +++ b/dnn/src/cuda/pooling/algo.cpp @@ -149,7 +149,7 @@ void PoolingForwardImpl::AlgoCUDNN::exec(const ExecArgs& args) const { args.opr->param().stride_w)); cudnn_check(cudnnPoolingForward( args.handle->cudnn_handle(), cudnn_desc, &alpha, src_desc.desc, - src.raw_ptr, &beta, dst_desc.desc, dst.raw_ptr)); + src.raw_ptr(), &beta, dst_desc.desc, dst.raw_ptr())); cudnn_check(cudnnDestroyPoolingDescriptor(cudnn_desc)); } if (args.layout_src->dtype.enumv() == DTypeTrait::enumv) { @@ -218,7 +218,7 @@ void PoolingForwardImpl::AlgoCUDNNMAXDETERMINISTIC::exec(const ExecArgs& args) c args.opr->param().stride_w)); cudnn_check(cudnnPoolingForward( args.handle->cudnn_handle(), cudnn_desc, &alpha, src_desc.desc, - src.raw_ptr, &beta, dst_desc.desc, dst.raw_ptr)); + src.raw_ptr(), &beta, dst_desc.desc, dst.raw_ptr())); cudnn_check(cudnnDestroyPoolingDescriptor(cudnn_desc)); } if (args.layout_src->dtype.enumv() == DTypeTrait::enumv) { @@ -341,7 +341,7 @@ void PoolingForwardImpl::AlgoNHWC::exec(const ExecArgs& args) const { } auto&& stream = cuda_stream(args.handle); pooling2d::do_pooling2d_int4_nhwc( - (int8_t*)src.raw_ptr, (int8_t*)dst.raw_ptr, kern_param, stream, + (int8_t*)src.raw_ptr(), (int8_t*)dst.raw_ptr(), kern_param, stream, static_cast(args.opr->param().mode), uint_case, zero_point); } } @@ -411,8 
+411,8 @@ void PoolingForwardImpl::AlgoNCHW64::exec(const ExecArgs& args) const { get_inner_layout( *args.layout_src, *args.layout_dst, src.layout, dst.layout, handle_ptr, args.opr->param().format); - src.raw_ptr = wsb.get(0); - dst.raw_ptr = wsb.get(1); + src = TensorND{wsb.get(0), src.layout}; + dst = TensorND{wsb.get(1), dst.layout}; auto relayout_opr = handle_ptr->create_operator(); RelayoutFormat::Param trans_param; trans_param.mode = RelayoutFormat::Param::Mode::NCHW_NCHW64; @@ -441,7 +441,7 @@ void PoolingForwardImpl::AlgoNCHW64::exec(const ExecArgs& args) const { } auto&& stream = cuda_stream(args.handle); pooling2d::do_pooling2d_int4_ncdiv64hw64( - (int8_t*)src.raw_ptr, (int8_t*)dst.raw_ptr, kern_param, stream, + (int8_t*)src.raw_ptr(), (int8_t*)dst.raw_ptr(), kern_param, stream, static_cast(args.opr->param().mode), uint_case, zero_point); } if (args.layout_dst->ndim == 4) { @@ -600,8 +600,8 @@ void PoolingBackwardImpl::AlgoCUDNN::exec(const ExecArgs& args) const { args.opr->param().stride_w)); cudnn_check(cudnnPoolingBackward( args.handle->cudnn_handle(), cudnn_desc, &alpha, dst_desc.desc, - dst.raw_ptr, diff_desc.desc, diff.raw_ptr, src_desc.desc, src.raw_ptr, - &beta, grad_desc.desc, grad.raw_ptr)); + dst.raw_ptr(), diff_desc.desc, diff.raw_ptr(), src_desc.desc, + src.raw_ptr(), &beta, grad_desc.desc, grad.raw_ptr())); cudnn_check(cudnnDestroyPoolingDescriptor(cudnn_desc)); } if (args.layout_src->dtype.enumv() == DTypeTrait::enumv) { diff --git a/dnn/src/cuda/reduce/opr_impl.cpp b/dnn/src/cuda/reduce/opr_impl.cpp index 410b29e7..786ca01b 100644 --- a/dnn/src/cuda/reduce/opr_impl.cpp +++ b/dnn/src/cuda/reduce/opr_impl.cpp @@ -15,7 +15,7 @@ #include "src/cuda/handle.h" #include "src/cuda/utils.h" -#include "src/common/reduce_helper.h" +#include "src/common/reduce_helper_device.h" namespace { @@ -114,10 +114,10 @@ namespace cuda { void ReduceForwardImpl::exec( _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) { - using namespace reduce; + using namespace device_reduce; check_exec(src.layout, dst.layout, workspace.size); size_t A, B, C; - get_ABC(src.layout, A, B, C, param().axis); + reduce::get_ABC(src.layout, A, B, C, param().axis); auto stream = cuda_stream(this->handle()); #define CASE(_mode, _op) \ case _mode: \ @@ -142,9 +142,9 @@ size_t ReduceForwardImpl::get_workspace_in_bytes( megdnn_assert( param().data_type != Reduce::DataType::FLOAT_IO16xC32, "FLOAT_IO16xC32 is deprecated"); - using namespace reduce; + using namespace device_reduce; size_t A, B, C; - get_ABC(src, A, B, C, param().axis); + reduce::get_ABC(src, A, B, C, param().axis); #define CASE(_mode, _op) \ case _mode: { \ return dispatch_dtype_workspace<_op>(src, dst, A, B, C, param().data_type); \ diff --git a/dnn/src/cuda/reduce/reduce.cu b/dnn/src/cuda/reduce/reduce.cu index 87d7a8c1..d6784ee7 100644 --- a/dnn/src/cuda/reduce/reduce.cu +++ b/dnn/src/cuda/reduce/reduce.cu @@ -8,7 +8,7 @@ * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
*/ -#include "src/common/reduce_helper.h" +#include "src/common/reduce_helper_device.h" #include "megdnn/dtype.h" #include "src/cuda/reduce_helper.cuh" @@ -16,7 +16,7 @@ namespace megdnn { namespace cuda { -using namespace reduce; +using namespace device_reduce; #define COMMA , diff --git a/dnn/src/cuda/relayout/opr_impl.cpp b/dnn/src/cuda/relayout/opr_impl.cpp index b31c73a0..7249c5e2 100644 --- a/dnn/src/cuda/relayout/opr_impl.cpp +++ b/dnn/src/cuda/relayout/opr_impl.cpp @@ -52,14 +52,14 @@ bool RelayoutForwardImpl::Param::try_transpose() { size_t batch = transp.batch, m = transp.m, n = transp.n; size_t lda = n, ldb = m, stride_A = m * n, stride_B = m * n; auto&& stream = m_opr->stream(); -#define RUN(_dt) \ - do { \ - typedef DTypeTrait::ctype ctype; \ - copy_by_transpose( \ - reinterpret_cast(m_src.raw_ptr), \ - reinterpret_cast(m_dst.raw_ptr), batch, m, n, lda, ldb, \ - stride_A, stride_B, stream); \ - return true; \ +#define RUN(_dt) \ + do { \ + typedef DTypeTrait::ctype ctype; \ + copy_by_transpose( \ + reinterpret_cast(m_src.raw_ptr()), \ + reinterpret_cast(m_dst.raw_ptr()), batch, m, n, lda, ldb, \ + stride_A, stride_B, stream); \ + return true; \ } while (0) switch (dsize) { case 1: @@ -81,7 +81,7 @@ bool RelayoutForwardImpl::Param::try_copy_contig() { size_t copy_size = ldst.span().dist_byte(); cuda_check(cudaMemcpyAsync( - m_dst.raw_ptr, m_src.raw_ptr, copy_size, cudaMemcpyDeviceToDevice, + m_dst.raw_ptr(), m_src.raw_ptr(), copy_size, cudaMemcpyDeviceToDevice, m_opr->stream())); return true; } @@ -137,7 +137,7 @@ bool RelayoutForwardImpl::Param::try_copy_2d(bool cross_dev) { auto dsize = dtype_size(); cuda_check(cudaMemcpy2DAsync( - m_dst.raw_ptr, ldst.stride[0] * dsize, m_src.raw_ptr, + m_dst.raw_ptr(), ldst.stride[0] * dsize, m_src.raw_ptr(), lsrc.stride[0] * dsize, ldst.shape[1] * dsize, ldst.shape[0], cudaMemcpyDeviceToDevice, m_opr->stream())); diff --git a/dnn/src/cuda/relayout/param_visitor.cpp b/dnn/src/cuda/relayout/param_visitor.cpp index 00966050..1be519e7 100644 --- a/dnn/src/cuda/relayout/param_visitor.cpp +++ b/dnn/src/cuda/relayout/param_visitor.cpp @@ -71,7 +71,7 @@ template void ParamElemVisitor::host_init( const TensorND& rv, int /*grid_size*/, int /*block_size*/) { megdnn_assert(rv.layout.ndim && rv.layout.ndim <= ndim); - m_ptr = reinterpret_cast(rv.raw_ptr); + m_ptr = reinterpret_cast(rv.raw_ptr()); ptrdiff_t min_stride = std::numeric_limits::max(); for (size_t i = 0; i < rv.layout.ndim; ++i) { m_stride[i] = rv.layout.stride[i]; diff --git a/dnn/src/cuda/relayout_format/opr_impl.cpp b/dnn/src/cuda/relayout_format/opr_impl.cpp index a5030f03..b48d6c24 100644 --- a/dnn/src/cuda/relayout_format/opr_impl.cpp +++ b/dnn/src/cuda/relayout_format/opr_impl.cpp @@ -47,27 +47,27 @@ void RelayoutFormatImpl::exec( megdnn_assert(param().mode == param::RelayoutFormat::Mode::CHWN4_NCHW4); row = src.layout[0] * src.layout[1] * src.layout[2], col = src.layout[3]; } - TensorND trans_in, trans_out; - trans_in.raw_ptr = src.raw_ptr; - trans_in.layout = {{row, col}, dtype::Int32()}; - trans_in.layout.init_contiguous_stride(); - trans_out.raw_ptr = dst.raw_ptr; - trans_out.layout = trans_in.layout; - trans_out.layout.stride[0] = 1; - trans_out.layout.stride[1] = row; + TensorLayout layout_in, layout_out; + layout_in = {{row, col}, dtype::Int32()}; + layout_in.init_contiguous_stride(); + layout_out = layout_in; + layout_out.stride[0] = 1; + layout_out.stride[1] = row; + TensorND trans_in{src.raw_ptr(), layout_in}, + trans_out{dst.raw_ptr(), layout_out}; return 
handle()->create_operator()->exec(trans_in, trans_out); } if ((param().mode == Param::Mode::NCHW_NCHW4_IC_SMALL || param().mode == Param::Mode::NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT) && src.layout[1] % 4 != 0) { megdnn_assert( - src.raw_ptr != dst.raw_ptr && src.layout.ndim == 4, + src.raw_ptr() != dst.raw_ptr() && src.layout.ndim == 4, "The mode of NCHW_NCHW4 and NCHW_NCHW4_CONV_DENSE_WEIGHT " "of RelayoutFormat opr(cuda backend) does not support " "src.ptr == dst.ptr"); megdnn_assert(src.layout[1] <= 4); cuda_check(cudaMemsetAsync( - dst.raw_ptr, 0, dst.layout.span().dist_byte(), + dst.raw_ptr(), 0, dst.layout.span().dist_byte(), cuda_stream(this->handle()))); TensorLayout exec_dst_layout = dst.layout; exec_dst_layout[4] = src.layout[1]; @@ -77,7 +77,7 @@ void RelayoutFormatImpl::exec( src.layout[2], src.layout[3]}) .dimshuffle({0, 2, 3, 4, 1}); return handle()->create_operator()->exec( - {src.raw_ptr, exec_src_layout}, {dst.raw_ptr, exec_dst_layout}); + {src.raw_ptr(), exec_src_layout}, {dst.raw_ptr(), exec_dst_layout}); } bool is_trans_4bits = (param().mode == Param::Mode::NCHW_NCHW64 || param().mode == Param::Mode::NCHW64_NCHW || @@ -103,8 +103,8 @@ void RelayoutFormatImpl::exec( // fallback impls TensorLayout exec_src, exec_dst, exec_workspace; deduce_exec_layout(src.layout, dst.layout, exec_workspace, exec_src, exec_dst); - TensorND exec_src_nd{src.raw_ptr, exec_src}; - TensorND exec_dst_nd{dst.raw_ptr, exec_dst}; + TensorND exec_src_nd{src.raw_ptr(), exec_src}; + TensorND exec_dst_nd{dst.raw_ptr(), exec_dst}; handle()->create_operator()->exec(exec_src_nd, exec_dst_nd); } diff --git a/dnn/src/cuda/relayout_format/relayout_format.cu b/dnn/src/cuda/relayout_format/relayout_format.cu index 31655534..1e3410a4 100644 --- a/dnn/src/cuda/relayout_format/relayout_format.cu +++ b/dnn/src/cuda/relayout_format/relayout_format.cu @@ -339,8 +339,8 @@ void relayout_format::relayout_format_cuda_nchw_nchwx( const dim3 block_dim(DIVUP(hw, nr_threads* _pack_w), out_n); \ const dim3 thread_dim(nr_threads); \ return kernel<<>>( \ - (_src_c_type*)src.raw_ptr, (_dst_c_type*)dst.raw_ptr, in_n, ic, hw, \ - n_stride_src, ic_stride, n_stride_dst, oc_stride, \ + (_src_c_type*)src.raw_ptr(), (_dst_c_type*)dst.raw_ptr(), in_n, ic, \ + hw, n_stride_src, ic_stride, n_stride_dst, oc_stride, \ CudaPostProcess( \ src_scale, src_zero_point, dst_scale, dst_zero_point), \ src_zero_point, group, ocpg); \ @@ -407,8 +407,8 @@ void relayout_format::relayout_format_cuda_nchw_nchwx( n_stride_dst = n_stride_dst * _size_nbits / (8 * sizeof(_dst_c_type)); \ oc_stride = oc_stride * _size_nbits / (8 * sizeof(_dst_c_type)); \ typename RelayoutProblem_::Param param{ \ - SrcIterator_{(InnerDtype_*)src.raw_ptr, ic_stride, ic, w, w_pad}, \ - DstIterator_{(_dst_c_type*)dst.raw_ptr, oc_stride, oc, w, w_pad}, \ + SrcIterator_{(InnerDtype_*)src.raw_ptr(), ic_stride, ic, w, w_pad}, \ + DstIterator_{(_dst_c_type*)dst.raw_ptr(), oc_stride, oc, w, w_pad}, \ CudaPostProcess_{ \ src_scale, src_zero_point, dst_scale, dst_zero_point}, \ n_stride_src, \ @@ -505,8 +505,8 @@ void relayout_format::relayout_format_cuda_nchwx_nchw( n_stride_dst = n_stride_dst * _size_nbits / (8 * sizeof(InnerDtype_)); \ oc_stride = oc_stride * _size_nbits / (8 * sizeof(InnerDtype_)); \ typename RelayoutProblem_::Param param{ \ - SrcIterator_{(_src_c_type*)src.raw_ptr, ic_stride, ic, w, w_pad}, \ - DstIterator_{(InnerDtype_*)dst.raw_ptr, oc_stride, oc, w, w_pad}, \ + SrcIterator_{(_src_c_type*)src.raw_ptr(), ic_stride, ic, w, w_pad}, \ + 
DstIterator_{(InnerDtype_*)dst.raw_ptr(), oc_stride, oc, w, w_pad}, \ CudaPostProcess_{ \ src_scale, src_zero_point, dst_scale, dst_zero_point}, \ n_stride_src, \ @@ -554,7 +554,7 @@ void relayout_format::relayout_format_cuda_nchw4_nchw( const dim3 block_dim(DIVUP(hw, nr_threads * pack_w), n); const dim3 thread_dim(nr_threads); kern_nchw4_nchw<<>>( - (int8_t*)src.raw_ptr, (int8_t*)dst.raw_ptr, n, ic, oc, h, w, group); + (int8_t*)src.raw_ptr(), (int8_t*)dst.raw_ptr(), n, ic, oc, h, w, group); after_kernel_launch(); } @@ -581,7 +581,7 @@ void relayout_format::relayout_format_cuda_nchw_nchw4_weight( const dim3 thread_dim(nr_threads); kern_nchw_nchw4_weight<<>>( - (char*)src.raw_ptr, (char*)dst.raw_ptr, oc, ic, hw, oc_stride_src, + (char*)src.raw_ptr(), (char*)dst.raw_ptr(), oc, ic, hw, oc_stride_src, ic_stride, oc_stride_dst, group_stride_src, group_stride_dst, 0, {}); after_kernel_launch(); } diff --git a/dnn/src/cuda/relayout_format/relayout_format_nchw_nhwc.cu b/dnn/src/cuda/relayout_format/relayout_format_nchw_nhwc.cu index 2b6ff478..e3f600bd 100644 --- a/dnn/src/cuda/relayout_format/relayout_format_nchw_nhwc.cu +++ b/dnn/src/cuda/relayout_format/relayout_format_nchw_nhwc.cu @@ -82,8 +82,8 @@ void relayout_format::relayout_format_cuda_nchw_nhwc( n_stride_dst = n_stride_dst * _size_nbits / (8 * sizeof(_dst_c_type)); \ hw_stride = hw_stride * _size_nbits / (8 * sizeof(_dst_c_type)); \ typename RelayoutProblem_::Param param{ \ - SrcIterator_{(InnerDtype_*)src.raw_ptr, ic_stride, ic, w, w_pad}, \ - DstIterator_{(_dst_c_type*)dst.raw_ptr, hw_stride, oc, w, w_pad}, \ + SrcIterator_{(InnerDtype_*)src.raw_ptr(), ic_stride, ic, w, w_pad}, \ + DstIterator_{(_dst_c_type*)dst.raw_ptr(), hw_stride, oc, w, w_pad}, \ CudaPostProcess_{ \ src_scale, src_zero_point, dst_scale, dst_zero_point}, \ n_stride_src, \ @@ -167,8 +167,8 @@ void relayout_format::relayout_format_cuda_nhwc_nchw( n_stride_dst = n_stride_dst * _size_nbits / (8 * sizeof(InnerDtype_)); \ oc_stride = oc_stride * _size_nbits / (8 * sizeof(InnerDtype_)); \ typename RelayoutProblem_::Param param{ \ - SrcIterator_{(_src_c_type*)src.raw_ptr, hw_stride, ic, w, w_pad}, \ - DstIterator_{(InnerDtype_*)dst.raw_ptr, oc_stride, oc, w, w_pad}, \ + SrcIterator_{(_src_c_type*)src.raw_ptr(), hw_stride, ic, w, w_pad}, \ + DstIterator_{(InnerDtype_*)dst.raw_ptr(), oc_stride, oc, w, w_pad}, \ CudaPostProcess_{ \ src_scale, src_zero_point, dst_scale, dst_zero_point}, \ n_stride_src, \ diff --git a/dnn/src/cuda/repeat/opr_impl.cpp b/dnn/src/cuda/repeat/opr_impl.cpp index b6b0834e..e66bdabd 100644 --- a/dnn/src/cuda/repeat/opr_impl.cpp +++ b/dnn/src/cuda/repeat/opr_impl.cpp @@ -69,6 +69,7 @@ void RepeatBackwardImpl::exec_internal( diff_.ptr(), grad_.ptr(), workspace0, workspace1, current, next, state, nr_reduces); + TensorND reduce_src, reduce_dst; for (size_t j = 0; j < ndim; ++j) { size_t i = j + 1; if (times.shape[j] != 1) { @@ -82,11 +83,10 @@ void RepeatBackwardImpl::exec_internal( // forward is repeat (m, n) to (m*times, n) // backward is reduce (m, times, n) to (m, 1, n) m_opr->param().axis = 1; - TensorND reduce_src; - reduce_src.raw_ptr = current; + + reduce_src.reset_ptr(current); reduce_src.layout = TensorLayout(TensorShape{m, times[j], n}, dtype); - TensorND reduce_dst; - reduce_dst.raw_ptr = next; + reduce_dst.reset_ptr(next); reduce_dst.layout = TensorLayout(TensorShape{m, 1u, n}, dtype); m_opr->exec(reduce_src, reduce_dst, Workspace()); update_tile_repeat_state( diff --git a/dnn/src/cuda/roi_align/opr_impl.cpp 
b/dnn/src/cuda/roi_align/opr_impl.cpp index d14a4fb0..e8223201 100644 --- a/dnn/src/cuda/roi_align/opr_impl.cpp +++ b/dnn/src/cuda/roi_align/opr_impl.cpp @@ -78,7 +78,7 @@ void ROIAlignBackwardImpl::exec( using namespace ::megdnn::roi_align; using namespace ::megdnn::cuda::roi_align; cuda_check(cudaMemsetAsync( - grad.raw_ptr, 0, grad.layout.total_nr_elems() * grad.layout.dtype.size(), + grad.raw_ptr(), 0, grad.layout.total_nr_elems() * grad.layout.dtype.size(), stream)); #define cb(DType) \ if (diff.layout.dtype == DType()) { \ diff --git a/dnn/src/cuda/roi_copy/opr_impl.cpp b/dnn/src/cuda/roi_copy/opr_impl.cpp index 4a7590b3..2e97e1dd 100644 --- a/dnn/src/cuda/roi_copy/opr_impl.cpp +++ b/dnn/src/cuda/roi_copy/opr_impl.cpp @@ -33,7 +33,7 @@ void ROICopyImpl::exec( {N, OH, OW, OC}, {istride0, istride1, istride2, istride3}, src.layout.dtype); TensorND relayout_src( - static_cast(src.raw_ptr) + + static_cast(src.raw_ptr()) + (param().row_from * istride1 + param().col_from * istride2) * src.layout.dtype.size(), relayout_src_layout); diff --git a/dnn/src/cuda/svd/opr_impl.cpp b/dnn/src/cuda/svd/opr_impl.cpp index eb4ee173..7b485750 100644 --- a/dnn/src/cuda/svd/opr_impl.cpp +++ b/dnn/src/cuda/svd/opr_impl.cpp @@ -42,7 +42,7 @@ void transpose( megdnn::cuda::HandleImpl* handle, const TensorND& src, const TensorND& dst) { TensorLayout t = transposed_layout(src.layout); megdnn_assert(t.total_nr_elems() == dst.layout.total_nr_elems()); - handle->relayout_opr()->exec({src.raw_ptr, t}, dst); + handle->relayout_opr()->exec({src.raw_ptr(), t}, dst); } } // namespace @@ -118,8 +118,8 @@ void SVDForwardImpl::exec( wbundle.get_workspace(4).raw_ptr, {transposed_shape(vt_shape), dtype::Float32()}}; } else { - cur_v = {u.raw_ptr, u.layout.reshape(u_shape)}; - cur_u = {vt.raw_ptr, vt.layout.reshape(vt_shape)}; + cur_v = {u.raw_ptr(), u.layout.reshape(u_shape)}; + cur_u = {vt.raw_ptr(), vt.layout.reshape(vt_shape)}; } } else { cur_u = cur_v = {nullptr, {{0, 0}, dtype::Float32()}}; @@ -131,7 +131,7 @@ void SVDForwardImpl::exec( float* cusolver_ws = wbundle.get_workspace(1).ptr(); size_t cusolver_ws_size = wbundle.get_workspace(1).size / sizeof(float); int* info = wbundle.get_workspace(2).ptr(); - TensorND s_blk(s.raw_ptr, s.layout.reshape({block_cnt, min_mn})); + TensorND s_blk(s.raw_ptr(), s.layout.reshape({block_cnt, min_mn})); if (need_transpose) { ::transpose(handle, src, inp_copy); diff --git a/dnn/src/cuda/tile/opr_impl.cpp b/dnn/src/cuda/tile/opr_impl.cpp index b03c3e6c..1525fe92 100644 --- a/dnn/src/cuda/tile/opr_impl.cpp +++ b/dnn/src/cuda/tile/opr_impl.cpp @@ -52,7 +52,7 @@ void TileBackwardImpl::exec_internal( auto dtype = diff_.layout.dtype; if (nr_reduces == 0) { cuda_check(cudaMemcpyAsync( - grad_.raw_ptr, diff_.raw_ptr, sizeof(T) * diff.total_nr_elems(), + grad_.raw_ptr(), diff_.raw_ptr(), sizeof(T) * diff.total_nr_elems(), cudaMemcpyDeviceToDevice, stream)); } else { auto ndim = times.ndim; @@ -68,7 +68,7 @@ void TileBackwardImpl::exec_internal( init_tile_repeat_state( diff_.ptr(), grad_.ptr(), workspace0, workspace1, current, next, state, nr_reduces); - + TensorND reduce_src, reduce_dst; for (size_t j = 0; j < ndim; ++j) { size_t i = j + 1; if (times.shape[j] != 1) { @@ -87,11 +87,10 @@ void TileBackwardImpl::exec_internal( TensorND reduce_src(current, TensorShape{m, times[j], n}); TensorND reduce_dst(next, TensorShape{m, 1u, n}); */ - TensorND reduce_src; - reduce_src.raw_ptr = current; + + reduce_src.reset_ptr(current); reduce_src.layout = TensorLayout(TensorShape{m, times[j], n}, dtype); - 
TensorND reduce_dst; - reduce_dst.raw_ptr = next; + reduce_dst.reset_ptr(next); reduce_dst.layout = TensorLayout(TensorShape{m, 1u, n}, dtype); m_opr->exec(reduce_src, reduce_dst, Workspace()); update_tile_repeat_state( diff --git a/dnn/src/cuda/type_cvt/kern.cu b/dnn/src/cuda/type_cvt/kern.cu index 88afc0df..d786df19 100644 --- a/dnn/src/cuda/type_cvt/kern.cu +++ b/dnn/src/cuda/type_cvt/kern.cu @@ -355,7 +355,7 @@ template void typecvt_kern_n2q( megdnn_assert(DTypeTrait::enumv == dest.layout.dtype.enumv().ev); \ using dst_storage = typename VectTypeTrait::Storage; \ Op op; \ - op.dest = reinterpret_cast(dest.raw_ptr); \ + op.dest = reinterpret_cast(dest.raw_ptr()); \ body; \ run_elemwise(param_src, param_dst, stream, op); \ return; \ diff --git a/dnn/src/cuda/warp_perspective/backward_data.cpp b/dnn/src/cuda/warp_perspective/backward_data.cpp index a59d4f5f..f80e4902 100644 --- a/dnn/src/cuda/warp_perspective/backward_data.cpp +++ b/dnn/src/cuda/warp_perspective/backward_data.cpp @@ -63,7 +63,7 @@ void WarpPerspectiveBackwardDataImpl::exec( IH = grad.layout.shape[2], IW = grad.layout.shape[3], OH = diff.layout.shape[2], OW = diff.layout.shape[3]; int* midx_ptr = nullptr; - if (mat_idx.raw_ptr) { + if (mat_idx.raw_ptr()) { megdnn_assert(mat_idx.layout.ndim == 1); N = mat_idx.layout.shape[0]; midx_ptr = mat_idx.ptr(); diff --git a/dnn/src/cuda/warp_perspective/backward_mat.cpp b/dnn/src/cuda/warp_perspective/backward_mat.cpp index 87b6a7e1..83b7973f 100644 --- a/dnn/src/cuda/warp_perspective/backward_mat.cpp +++ b/dnn/src/cuda/warp_perspective/backward_mat.cpp @@ -67,7 +67,7 @@ void WarpPerspectiveBackwardMatImpl::exec( IW = src.layout.shape[3], OH = diff.layout.shape[2], OW = diff.layout.shape[3]; int* midx_ptr = nullptr; - if (mat_idx.raw_ptr) { + if (mat_idx.raw_ptr()) { megdnn_assert(mat_idx.layout.ndim == 1); N = mat_idx.layout.shape[0]; midx_ptr = mat_idx.ptr(); diff --git a/dnn/src/cuda/warp_perspective/forward.cpp b/dnn/src/cuda/warp_perspective/forward.cpp index 363e8e3b..f7697648 100644 --- a/dnn/src/cuda/warp_perspective/forward.cpp +++ b/dnn/src/cuda/warp_perspective/forward.cpp @@ -181,8 +181,8 @@ void WarpPerspectiveForwardImpl::exec( get_inner_layout( ssrc.layout, sdst.layout, src.layout, dst.layout, handle_ptr, param().format); - src.raw_ptr = bundle.get(0); - dst.raw_ptr = bundle.get(1); + src = TensorND{bundle.get(0), src.layout}; + dst = TensorND{bundle.get(1), dst.layout}; auto relayout_opr = handle_ptr->create_operator(); RelayoutFormat::Param trans_param; trans_param.mode = RelayoutFormat::Param::Mode::NCHW_NCHW64; @@ -198,7 +198,7 @@ void WarpPerspectiveForwardImpl::exec( if (is_nhwc && param().imode != Param::InterpolationMode::LINEAR) { // use opencv impl only for nhwc and non-linear interp megdnn_assert( - !mat_idx.raw_ptr, + !mat_idx.raw_ptr(), "mat_idx is not supported in NHWC case with " "non-linear interpolation"); warp_perspective::warp_perspective_cv_exec( @@ -272,7 +272,7 @@ void WarpPerspectiveForwardImpl::exec( if (src.layout.dtype == dtype::Float32{}) { warp_perspective::forward_proxy( is_nhwc, src.ptr(), mat.ptr(), - mat_idx.raw_ptr ? mat_idx.ptr() : nullptr, + mat_idx.raw_ptr() ? mat_idx.ptr() : nullptr, dst.ptr(), src.layout[0], mat.layout[0], C, IH, IW, OH, OW, bval, bmode, async_error_info(handle()), m_error_tracker, stream); @@ -281,7 +281,7 @@ void WarpPerspectiveForwardImpl::exec( #ifndef MEGDNN_DISABLE_FLOAT16 warp_perspective::forward_proxy( is_nhwc, src.ptr(), mat.ptr(), - mat_idx.raw_ptr ? mat_idx.ptr() : nullptr, + mat_idx.raw_ptr() ? 
mat_idx.ptr() : nullptr, dst.ptr(), src.layout[0], mat.layout[0], C, IH, IW, OH, OW, static_cast(bval), bmode, async_error_info(handle()), m_error_tracker, stream); @@ -289,7 +289,7 @@ void WarpPerspectiveForwardImpl::exec( } else if (src.layout.dtype == dtype::Uint8()) { warp_perspective::forward_proxy( is_nhwc, src.ptr(), mat.ptr(), - mat_idx.raw_ptr ? mat_idx.ptr() : nullptr, + mat_idx.raw_ptr() ? mat_idx.ptr() : nullptr, dst.ptr(), src.layout[0], mat.layout[0], C, IH, IW, OH, OW, bval, bmode, async_error_info(handle()), m_error_tracker, stream); @@ -300,7 +300,7 @@ void WarpPerspectiveForwardImpl::exec( "NHWC + Int8"); warp_perspective::forward_proxy( false, src.ptr(), mat.ptr(), - mat_idx.raw_ptr ? mat_idx.ptr() : nullptr, + mat_idx.raw_ptr() ? mat_idx.ptr() : nullptr, dst.ptr(), src.layout[0], mat.layout[0], C, IH, IW, OH, OW, bval /* implicit float -> int8 conversion, should be safe */ @@ -313,7 +313,7 @@ void WarpPerspectiveForwardImpl::exec( "QuantizedS8 only"); warp_perspective::forward_proxy_nchw4( src.compatible_ptr(), mat.ptr(), - mat_idx.raw_ptr ? mat_idx.ptr() : nullptr, + mat_idx.raw_ptr() ? mat_idx.ptr() : nullptr, dst.compatible_ptr(), src.layout[0], mat.layout[0], C, IH, IW, OH, OW, bval, bmode, async_error_info(handle()), m_error_tracker, stream); @@ -325,7 +325,7 @@ void WarpPerspectiveForwardImpl::exec( bval = fmin(fmax(-8.f, bval), 7.f); warp_perspective::forward_proxy_nchw64( src.compatible_ptr(), mat.ptr(), - mat_idx.raw_ptr ? mat_idx.ptr() : nullptr, + mat_idx.raw_ptr() ? mat_idx.ptr() : nullptr, dst.compatible_ptr(), src.layout[0], mat.layout[0], C, IH, IW, OH, OW, static_cast(bval), bmode, @@ -346,7 +346,7 @@ void WarpPerspectiveForwardImpl::exec( bval = fmin(fmax(0, bval), 15); warp_perspective::forward_proxy_nchw64( src.compatible_ptr(), mat.ptr(), - mat_idx.raw_ptr ? mat_idx.ptr() : nullptr, + mat_idx.raw_ptr() ? mat_idx.ptr() : nullptr, dst.compatible_ptr(), src.layout[0], mat.layout[0], C, IH, IW, OH, OW, static_cast(bval), bmode, @@ -371,7 +371,7 @@ void WarpPerspectiveForwardImpl::exec( if (C % 16 == 0) { warp_perspective::forward_proxy_nhwc_bit4( src.ptr(), mat.ptr(), - mat_idx.raw_ptr ? mat_idx.ptr() : nullptr, + mat_idx.raw_ptr() ? mat_idx.ptr() : nullptr, dst.ptr(), src.layout[0], mat.layout[0], C, IH, IW, OH, OW, static_cast(bval), bmode, async_error_info(handle()), m_error_tracker, @@ -379,7 +379,7 @@ void WarpPerspectiveForwardImpl::exec( } else { warp_perspective::forward_proxy_nhwc_bit4( src.ptr(), mat.ptr(), - mat_idx.raw_ptr ? mat_idx.ptr() : nullptr, + mat_idx.raw_ptr() ? mat_idx.ptr() : nullptr, dst.ptr(), src.layout[0], mat.layout[0], C, IH, IW, OH, OW, static_cast(bval), bmode, async_error_info(handle()), m_error_tracker, @@ -390,7 +390,7 @@ void WarpPerspectiveForwardImpl::exec( if (C % 16 == 0) { warp_perspective::forward_proxy_nhwc_bit4( src.ptr(), mat.ptr(), - mat_idx.raw_ptr ? mat_idx.ptr() : nullptr, + mat_idx.raw_ptr() ? mat_idx.ptr() : nullptr, dst.ptr(), src.layout[0], mat.layout[0], C, IH, IW, OH, OW, static_cast(bval), bmode, async_error_info(handle()), m_error_tracker, @@ -399,7 +399,7 @@ void WarpPerspectiveForwardImpl::exec( warp_perspective::forward_proxy_nhwc_bit4< dt_quint4, pack_c>( src.ptr(), mat.ptr(), - mat_idx.raw_ptr ? mat_idx.ptr() : nullptr, + mat_idx.raw_ptr() ? 
mat_idx.ptr() : nullptr, dst.ptr(), src.layout[0], mat.layout[0], C, IH, IW, OH, OW, static_cast(bval), bmode, async_error_info(handle()), m_error_tracker, @@ -433,7 +433,7 @@ void WarpPerspectiveForwardImpl::exec( dt_quint8, dt_uint8, dt_int8>( is_nhwc_ic_small, src.compatible_ptr(), mat.ptr(), - mat_idx.raw_ptr ? mat_idx.ptr() : nullptr, + mat_idx.raw_ptr() ? mat_idx.ptr() : nullptr, dst.compatible_ptr(), src.layout[0], mat.layout[0], C, IH, IW, OH, OW, bval, src_dtype_param, bmode, async_error_info(handle()), m_error_tracker, stream); @@ -448,7 +448,7 @@ void WarpPerspectiveForwardImpl::exec( dt_quint8, dt_uint8, dt_float32>( is_nhwc, src.compatible_ptr(), mat.ptr(), - mat_idx.raw_ptr ? mat_idx.ptr() : nullptr, + mat_idx.raw_ptr() ? mat_idx.ptr() : nullptr, dst.compatible_ptr(), src.layout[0], mat.layout[0], C, IH, IW, OH, OW, bval, src_dtype_param, bmode, async_error_info(handle()), m_error_tracker, stream); diff --git a/dnn/src/fallback/add_update/opr_impl.cpp b/dnn/src/fallback/add_update/opr_impl.cpp index 99909c2b..5f80dc1a 100644 --- a/dnn/src/fallback/add_update/opr_impl.cpp +++ b/dnn/src/fallback/add_update/opr_impl.cpp @@ -46,12 +46,12 @@ void AddUpdateImpl::exec(_megdnn_tensor_inout dest, _megdnn_tensor_in delta) { !dest.layout.eq_shape(delta.layout)) { return naive::AddUpdateForwardImpl::exec(dest, delta); } - -#define cb(DType) \ - if (dest.layout.dtype == DType()) { \ - using ctype = typename DTypeTrait::ctype; \ - MEGDNN_DISPATCH_CPU_KERN_OPR(forward(dest, delta, m_param)); \ - return; \ + auto param = m_param; +#define cb(DType) \ + if (dest.layout.dtype == DType()) { \ + using ctype = typename DTypeTrait::ctype; \ + MEGDNN_DISPATCH_CPU_KERN_OPR(forward(dest, delta, param)); \ + return; \ } MEGDNN_FOREACH_COMPUTING_DTYPE(cb) #undef cb diff --git a/dnn/src/fallback/batched_matrix_mul/algos.cpp b/dnn/src/fallback/batched_matrix_mul/algos.cpp index 49f5d03e..ee48683f 100644 --- a/dnn/src/fallback/batched_matrix_mul/algos.cpp +++ b/dnn/src/fallback/batched_matrix_mul/algos.cpp @@ -76,11 +76,11 @@ void BatchedMatrixMulForwardImpl::AlgoDefault::exec(const ExecArgs& args) const auto kern = [args, param]() { auto N = args.layout_a.shape[0]; TensorND A_, B_, C_; - A_.raw_ptr = args.tensor_a.raw_ptr; + A_.reset_ptr(args.tensor_a.raw_ptr()); A_.layout = args.layout_a.remove_axis(0); - B_.raw_ptr = args.tensor_b.raw_ptr; + B_.reset_ptr(args.tensor_b.raw_ptr()); B_.layout = args.layout_b.remove_axis(0); - C_.raw_ptr = args.tensor_c.raw_ptr; + C_.reset_ptr(args.tensor_c.raw_ptr()); C_.layout = args.layout_c.remove_axis(0); auto Astrd = args.layout_a.dtype.size() * args.layout_a.stride[0], @@ -88,7 +88,8 @@ void BatchedMatrixMulForwardImpl::AlgoDefault::exec(const ExecArgs& args) const Cstrd = args.layout_c.dtype.size() * args.layout_c.stride[0]; auto advance_ptr = [](TensorND& dest, ptrdiff_t d) { - dest.raw_ptr = static_cast(static_cast(dest.raw_ptr) + d); + dest.reset_ptr( + static_cast(static_cast(dest.raw_ptr()) + d)); }; auto opr = inplace_cpu_handle()->create_operator(); diff --git a/dnn/src/fallback/conv_bias/algos.cpp b/dnn/src/fallback/conv_bias/algos.cpp index 134f5480..12f6fa06 100644 --- a/dnn/src/fallback/conv_bias/algos.cpp +++ b/dnn/src/fallback/conv_bias/algos.cpp @@ -77,11 +77,15 @@ void kern_default(const ConvBiasImpl::NCBKernParam& p) { auto filter_meta = *filter_meta_ptr; auto layouts = get_layouts(p); - TensorND src{reinterpret_cast(const_cast(p.src_ptr)), layouts[0]}; - TensorND filter{const_cast(p.filter_ptr), layouts[1]}; - auto bias_ptr = 
reinterpret_cast(const_cast(p.bias_ptr)); + TensorND src{ + reinterpret_cast(const_cast(p.src_ptr.get_ptr())), + layouts[0]}; + TensorND filter{const_cast(p.filter_ptr.get_ptr()), layouts[1]}; + auto bias_ptr = reinterpret_cast(const_cast(p.bias_ptr.get_ptr())); TensorND bias{bias_ptr, layouts[2]}; - TensorND dst{reinterpret_cast(const_cast(p.dst_ptr)), layouts[3]}; + TensorND dst{ + reinterpret_cast(const_cast(p.dst_ptr.get_ptr())), + layouts[3]}; auto sfb = dst; if (bias.layout.dtype.enumv() != dst.layout.dtype.enumv()) { @@ -153,13 +157,13 @@ void kern_default(const ConvBiasImpl::NCBKernParam& p) { auto nonlinear = inplace_cpu_handle()->create_operator(); nonlinear->param().mode = Elemwise::Param::Mode::SIGMOID; nonlinear->exec({res}, res); - if (res.raw_ptr != dst.raw_ptr) { + if (res.raw_ptr() != dst.raw_ptr()) { inplace_cpu_handle()->create_operator()->exec(res, dst); } break; } case NonlineMode::IDENTITY: { - if (res.raw_ptr != dst.raw_ptr) { + if (res.raw_ptr() != dst.raw_ptr()) { inplace_cpu_handle()->create_operator()->exec(res, dst); } break; @@ -224,10 +228,12 @@ SmallVector ConvBiasImpl::AlgoNaive::dispatch_kerns( thread_param.workspace_ptr = reinterpret_cast( reinterpret_cast(param.workspace_ptr) + thread_id * workspace_per_thread); - thread_param.filter_ptr = param.filter(group_id); - thread_param.dst_ptr = param.dst(batch_id, group_id); - thread_param.src_ptr = param.src(batch_id, group_id); - thread_param.bias_ptr = param.bias(batch_id, group_id); + + thread_param.filter_ptr += param.filter_offset(group_id); + thread_param.dst_ptr += param.dst_offset(batch_id, group_id); + thread_param.src_ptr += param.src_offset(batch_id, group_id); + thread_param.bias_ptr += param.bias_offset(batch_id, group_id); + kern_default(thread_param); } MIDOUT_END(); diff --git a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h index c6eb03c1..b3a6983f 100644 --- a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h +++ b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h @@ -102,10 +102,11 @@ public: oc_tile_id_in_group * packa_bytes_per_oc_tile; size_t numbers_offset_of_filter = oc_tile_size * IC * oc_tile_id_in_group; - int8_t* tmp_ptr = is_enable_filter_preprocess(param) - ? static_cast( - param.preprocessed_filter->tensors[0].raw_ptr) - : static_cast(whole_bundle.get(0)); + int8_t* tmp_ptr = + is_enable_filter_preprocess(param) + ? static_cast( + param.preprocessed_filter->tensors[0].raw_ptr()) + : static_cast(whole_bundle.get(0)); src_ctype* a_panel = reinterpret_cast(tmp_ptr + bytes_offset_of_a_panel); @@ -199,10 +200,11 @@ public: size_t bytes_offset_of_a_panel = group_id * packa_bytes_per_group + oc_tile_id_in_group * packa_bytes_per_oc_tile; - int8_t* tmp_ptr = is_enable_filter_preprocess(param) - ? static_cast( - param.preprocessed_filter->tensors[0].raw_ptr) - : static_cast(whole_bundle.get(0)); + int8_t* tmp_ptr = + is_enable_filter_preprocess(param) + ? 
static_cast( + param.preprocessed_filter->tensors[0].raw_ptr()) + : static_cast(whole_bundle.get(0)); int8_t* a_panel = tmp_ptr + bytes_offset_of_a_panel; diff --git a/dnn/src/fallback/conv_bias/im2col/strategy_default.cpp b/dnn/src/fallback/conv_bias/im2col/strategy_default.cpp index 13ef856c..20397f9e 100644 --- a/dnn/src/fallback/conv_bias/im2col/strategy_default.cpp +++ b/dnn/src/fallback/conv_bias/im2col/strategy_default.cpp @@ -37,10 +37,10 @@ void Strategy< size_t a_panel_offset = ncb_index.ndrange_id[1] * packed_per_oc_block_size; int8_t* tmp_ptr = sparam.enable_filter_preprocess ? static_cast( - param.preprocessed_filter->tensors[0].raw_ptr) + param.preprocessed_filter->tensors[0].raw_ptr()) : static_cast(bundle.get(BUNDLE_PACKA_INDEX)); int8_t* a_panel = tmp_ptr + group_id * sparam.packA_group_size + a_panel_offset; - matmul_param.A_ptr = const_cast(param.filter(group_id)); + matmul_param.A_ptr.reset(const_cast(param.filter(group_id))); matmul_algo->pack_A( matmul_param, a_panel, ncb_index.ndrange_id[1], matmul_desc.innerblocksize.m); @@ -162,7 +162,7 @@ void Strategy< int8_t* tmp_ptr = sparam.enable_filter_preprocess ? static_cast( - param.preprocessed_filter->tensors[0].raw_ptr) + param.preprocessed_filter->tensors[0].raw_ptr()) : static_cast(bundle.get(BUNDLE_PACKA_INDEX)); src_ctype* a_panel = reinterpret_cast(tmp_ptr + a_panel_offset); diff --git a/dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp b/dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp index 68ae1fd7..c08084e0 100644 --- a/dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp +++ b/dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp @@ -87,9 +87,9 @@ void Strategy< matmul_param.N = sparam.output_block_size; matmul_param.LDB = sparam.output_block_size; matmul_param.LDC = sparam.output_block_size; - matmul_param.A_ptr = filter; - matmul_param.B_ptr = im2col_dst; - matmul_param.C_ptr = matmul_dst; + matmul_param.A_ptr.reset(filter); + matmul_param.B_ptr.reset(im2col_dst); + matmul_param.C_ptr.reset(matmul_dst); auto matmul_kern = matmul_algo->get_kern(matmul_param); matmul_kern(matmul_param); } diff --git a/dnn/src/fallback/conv_bias/im2col/strategy_onlypacka.cpp b/dnn/src/fallback/conv_bias/im2col/strategy_onlypacka.cpp index 9686c558..187a37ce 100644 --- a/dnn/src/fallback/conv_bias/im2col/strategy_onlypacka.cpp +++ b/dnn/src/fallback/conv_bias/im2col/strategy_onlypacka.cpp @@ -42,7 +42,7 @@ void Strategy< int8_t* tmp_ptr = sparam.enable_filter_preprocess ? static_cast( - param.preprocessed_filter->tensors[0].raw_ptr) + param.preprocessed_filter->tensors[0].raw_ptr()) : static_cast(bundle.get(BUNDLE_PACKA_INDEX)); int8_t* a_panel = tmp_ptr + group_id * sparam.packA_group_size + a_panel_offset; @@ -75,7 +75,7 @@ void Strategy< int8_t* tmp_ptr = sparam.enable_filter_preprocess ? 
static_cast( - param.preprocessed_filter->tensors[0].raw_ptr) + param.preprocessed_filter->tensors[0].raw_ptr()) : static_cast(bundle.get(BUNDLE_PACKA_INDEX)); src_ctype* a_panel = reinterpret_cast(tmp_ptr + a_panel_offset); diff --git a/dnn/src/fallback/conv_bias/opr_impl.cpp b/dnn/src/fallback/conv_bias/opr_impl.cpp index d5950f0a..f7b9d012 100644 --- a/dnn/src/fallback/conv_bias/opr_impl.cpp +++ b/dnn/src/fallback/conv_bias/opr_impl.cpp @@ -402,10 +402,10 @@ ConvBiasImpl::NCBKernParam ConvBiasImpl::make_ncb_kern_param( NCBKernParam ret; static_cast(ret) = make_ncb_kern_size_param( src.layout, filter.layout, bias.layout, dst.layout, preprocessed_filter); - ret.src_ptr = src.raw_ptr; - ret.filter_ptr = filter.raw_ptr; - ret.bias_ptr = bias.raw_ptr; - ret.dst_ptr = dst.raw_ptr; + ret.src_ptr = src.get_ref_ptr(); + ret.filter_ptr = filter.get_ref_ptr(); + ret.bias_ptr = bias.get_ref_ptr(); + ret.dst_ptr = dst.get_ref_ptr(); ret.workspace_ptr = workspace.raw_ptr; ret.workspace_size = workspace.size; return ret; @@ -543,8 +543,7 @@ const char* ConvBiasImpl::get_algorithm_set_name() const { namespace megdnn { namespace fallback { -template -const T* ConvBiasImpl::NCBKernParam::src( +size_t ConvBiasImpl::NCBKernParam::src_offset( size_t batch_id, size_t group_pack_id, size_t channel_pack_id, size_t group_pack_size, size_t channel_pack_size) const { size_t batch_offset = batch_id * inp_bs * src_type.size(); @@ -552,13 +551,21 @@ const T* ConvBiasImpl::NCBKernParam::src( isz[1] * src_type.size(); size_t channel_offset = channel_pack_size * channel_pack_id * isz[0] * isz[1] * src_type.size(); - return reinterpret_cast( - reinterpret_cast(src_ptr) + batch_offset + group_offset + - channel_offset); + return (batch_offset + group_offset + channel_offset); } template -const T* ConvBiasImpl::NCBKernParam::filter( +const T* ConvBiasImpl::NCBKernParam::src( + size_t batch_id, size_t group_pack_id, size_t channel_pack_id, + size_t group_pack_size, size_t channel_pack_size) const { + return reinterpret_cast( + reinterpret_cast(src_ptr.get_ptr()) + + src_offset( + batch_id, group_pack_id, channel_pack_id, group_pack_size, + channel_pack_size)); +} + +size_t ConvBiasImpl::NCBKernParam::filter_offset( size_t group_pack_id, size_t pack_group_size) const { size_t group_offset = 0_z; switch (filter_meta.format) { @@ -613,11 +620,18 @@ const T* ConvBiasImpl::NCBKernParam::filter( default: megdnn_assert(0, "other filter format is not support yet"); } - return reinterpret_cast(reinterpret_cast(filter_ptr) + group_offset); + return group_offset; } template -const T* ConvBiasImpl::NCBKernParam::bias( +const T* ConvBiasImpl::NCBKernParam::filter( + size_t group_pack_id, size_t pack_group_size) const { + size_t group_offset = filter_offset(group_pack_id, pack_group_size); + return reinterpret_cast( + reinterpret_cast(filter_ptr.get_ptr()) + group_offset); +} + +size_t ConvBiasImpl::NCBKernParam::bias_offset( size_t batch_id, size_t group_pack_id, size_t channel_pack_id, size_t group_pack_size, size_t channel_pack_size) const { size_t batch_offset = 0_z; @@ -634,13 +648,21 @@ const T* ConvBiasImpl::NCBKernParam::bias( group_pack_size * group_pack_id * filter_meta.ocpg * bias_type.size(); channel_offset = channel_pack_size * channel_pack_id * bias_type.size(); } - return reinterpret_cast( - reinterpret_cast(bias_ptr) + batch_offset + group_offset + - channel_offset); + return (batch_offset + group_offset + channel_offset); } template -T* ConvBiasImpl::NCBKernParam::dst( +const T* ConvBiasImpl::NCBKernParam::bias( + 
size_t batch_id, size_t group_pack_id, size_t channel_pack_id, + size_t group_pack_size, size_t channel_pack_size) const { + return reinterpret_cast( + reinterpret_cast(bias_ptr.get_ptr()) + + bias_offset( + batch_id, group_pack_id, channel_pack_id, group_pack_size, + channel_pack_size)); +} + +size_t ConvBiasImpl::NCBKernParam::dst_offset( size_t batch_id, size_t group_pack_id, size_t channel_pack_id, size_t group_pack_size, size_t channel_pack_size) const { size_t batch_offset = batch_id * out_bs * dst_type.size(); @@ -648,9 +670,18 @@ T* ConvBiasImpl::NCBKernParam::dst( osz[1] * dst_type.size(); size_t channel_offset = channel_pack_size * channel_pack_id * osz[0] * osz[1] * dst_type.size(); + return (batch_offset + group_offset + channel_offset); +} + +template +T* ConvBiasImpl::NCBKernParam::dst( + size_t batch_id, size_t group_pack_id, size_t channel_pack_id, + size_t group_pack_size, size_t channel_pack_size) const { return reinterpret_cast( - reinterpret_cast(dst_ptr) + batch_offset + group_offset + - channel_offset); + reinterpret_cast(dst_ptr.get_ptr()) + + dst_offset( + batch_id, group_pack_id, channel_pack_id, group_pack_size, + channel_pack_size)); } #define INST(T) \ diff --git a/dnn/src/fallback/conv_bias/opr_impl.h b/dnn/src/fallback/conv_bias/opr_impl.h index 35f94f0d..cf887d59 100644 --- a/dnn/src/fallback/conv_bias/opr_impl.h +++ b/dnn/src/fallback/conv_bias/opr_impl.h @@ -115,23 +115,35 @@ public: //! memory param for kernels with non-contiguous batch struct NCBKernParam : public NCBKernSizeParam { NCBKernParam() = default; - const void* src_ptr; - const void* filter_ptr; - const void* bias_ptr; - void* dst_ptr; + RefPtr src_ptr; + RefPtr filter_ptr; + RefPtr bias_ptr; + RefPtr dst_ptr; void* workspace_ptr; size_t workspace_size; template const T* src() const { src_type.assert_is_compatible_ctype(); - return static_cast(src_ptr); + return static_cast(src_ptr.get_ptr()); } //! when format is nchwxx, multi channel will pack into one //! chnannel_pack_id. pack_channel_size is the number of packed channel //! when format is nchwxx and channel wise, multi group will pack into //! one group_pack_id. group_pack_size is the number of packed group //! together, like weight shape is {g/8, 1, 1, Fh, Fw, 8} + size_t src_offset( + size_t batch_id, size_t group_pack_id, size_t channel_pack_id = 0, + size_t group_pack_size = 1, size_t channel_pack_size = 1) const; + + size_t bias_offset( + size_t batch_id, size_t group_pack_id, size_t channel_pack_id = 0, + size_t group_pack_size = 1, size_t channel_pack_size = 1) const; + + size_t dst_offset( + size_t batch_id, size_t group_pack_id, size_t channel_pack_id = 0, + size_t group_pack_size = 1, size_t channel_pack_size = 1) const; + template const T* src( size_t batch_id, size_t group_pack_id, size_t channel_pack_id = 0, @@ -149,25 +161,27 @@ public: //! when format is nchwxx and channel wise, multi group will pack into //! one group_pack_id. group_pack_size is the number of packed group //! 
together, like weight shape is {g/8, 1, 1, Fh, Fw, 8} + size_t filter_offset(size_t group_pack_id, size_t pack_group_size = 1_z) const; + template const T* filter(size_t group_pack_id, size_t pack_group_size = 1_z) const; template const T* filter() const { filter_type.assert_is_compatible_ctype(); - return static_cast(filter_ptr); + return static_cast(filter_ptr.get_ptr()); } template const T* bias() const { bias_type.assert_is_compatible_ctype(); - return static_cast(bias_ptr); + return static_cast(bias_ptr.get_ptr()); } template T* dst() const { dst_type.assert_is_compatible_ctype(); - return static_cast(dst_ptr); + return static_cast(dst_ptr.get_ptr()); } template diff --git a/dnn/src/fallback/conv_bias/winograd/winograd.h b/dnn/src/fallback/conv_bias/winograd/winograd.h index 257c9643..5f27b55f 100644 --- a/dnn/src/fallback/conv_bias/winograd/winograd.h +++ b/dnn/src/fallback/conv_bias/winograd/winograd.h @@ -291,7 +291,7 @@ public: //! Filter trans dst ptr input_filter_compute_type* filter_transform_buf = reinterpret_cast( - reinterpret_cast(preprocessed_tensor.raw_ptr) + + reinterpret_cast(preprocessed_tensor.raw_ptr()) + group_id * filter_group_size); //! Filter trans src ptr input_filter_compute_type* transform_mid_buf = @@ -367,7 +367,8 @@ public: //! NCHW88_WINOGRAD and NCHW_WINOGRAD is the same offset const input_filter_compute_type* filter_transform_buf = nullptr; if (nullptr != ncb_param.preprocessed_filter) { - auto preprocess_raw_ptr = ncb_param.preprocessed_filter->tensors[0].raw_ptr; + auto preprocess_raw_ptr = + ncb_param.preprocessed_filter->tensors[0].raw_ptr(); filter_transform_buf = reinterpret_cast( reinterpret_cast(preprocess_raw_ptr) + group_id * filter_group_size); @@ -411,30 +412,37 @@ public: rep(i, Strategy::ALPHA) rep(j, Strategy::ALPHA) { if (format == param::MatrixMul::Format::DEFAULT) { - matmul_param.A_ptr = input_transform_buf + - (i * Strategy::ALPHA + j) * nr_tiles_in_unit * IC; - matmul_param.B_ptr = filter_transform_buf + - (i * Strategy::ALPHA + j) * OC * IC + oc_start_idx; + matmul_param.A_ptr = RefPtr( + (void*)(input_transform_buf + + (i * Strategy::ALPHA + j) * nr_tiles_in_unit * IC)); - matmul_param.C_ptr = output_transform_buf + (i * Strategy::ALPHA + j) * - nr_tiles_in_unit * - nr_oc_in_unit; + matmul_param.B_ptr = RefPtr( + (void*)(filter_transform_buf + + (i * Strategy::ALPHA + j) * OC * IC + oc_start_idx)); + + matmul_param.C_ptr = RefPtr( + (void*)(output_transform_buf + (i * Strategy::ALPHA + j) * + nr_tiles_in_unit * + nr_oc_in_unit)); matmul_param.M = nr_tiles_in_unit; matmul_param.N = nr_oc_in_unit; matmul_param.LDB = OC; matmul_param.LDC = nr_oc_in_unit; } else { - matmul_param.A_ptr = filter_transform_buf + - (i * Strategy::ALPHA + j) * OC * IC + - oc_start_idx * IC; - - matmul_param.B_ptr = input_transform_buf + - (i * Strategy::ALPHA + j) * nr_tiles_in_unit * IC; - - matmul_param.C_ptr = output_transform_buf + (i * Strategy::ALPHA + j) * - nr_tiles_in_unit * - nr_oc_in_unit; + matmul_param.A_ptr = RefPtr( + (void*)(filter_transform_buf + + (i * Strategy::ALPHA + j) * OC * IC + + oc_start_idx * IC)); + + matmul_param.B_ptr = RefPtr( + (void*)(input_transform_buf + + (i * Strategy::ALPHA + j) * nr_tiles_in_unit * IC)); + + matmul_param.C_ptr = RefPtr( + (void*)(output_transform_buf + (i * Strategy::ALPHA + j) * + nr_tiles_in_unit * + nr_oc_in_unit)); matmul_param.N = nr_tiles_in_unit; matmul_param.M = nr_oc_in_unit; matmul_param.LDB = matmul_param.N * Strategy::IC_BLOCK_SIZE; diff --git a/dnn/src/fallback/convolution/algos.cpp 
b/dnn/src/fallback/convolution/algos.cpp index 61adae44..d5c22525 100644 --- a/dnn/src/fallback/convolution/algos.cpp +++ b/dnn/src/fallback/convolution/algos.cpp @@ -109,12 +109,13 @@ void kern_matmul(const NCBKernParam& param) { {static_cast(1), static_cast(IC * FH * FW)}, param.filter_type); - A_src.raw_ptr = static_cast(filter); + A_src.reset_ptr(static_cast(filter)); A_dst.layout = TensorLayout({IC * FH * FW, OC}, param.filter_type); - A_dst.raw_ptr = static_cast(bundle.get(2)); + A_dst.reset_ptr(static_cast(bundle.get(2))); // TODO Should be removed once armv8 convolution support transpose. get_relayout_opr()->exec(A_src, A_dst, inplace_cpu_handle().get()); } + TensorND B_, C_; for (size_t n = 0; n < N; ++n) { gtype *C_src, *C_dst; dtype* diff = const_cast(param.diff() + n * param.inp_bs); @@ -125,11 +126,10 @@ void kern_matmul(const NCBKernParam& param) { C_src = static_cast(bundle.get(0)); } { - TensorND B_, C_; B_.layout = TensorLayout({OC, IH * IW}, param.diff_type); - B_.raw_ptr = static_cast(diff); + B_.reset_ptr(static_cast(diff)); C_.layout = TensorLayout({IC * FH * FW, IH * IW}, param.grad_type); - C_.raw_ptr = C_src; + C_.reset_ptr(C_src); Workspace workspace( static_cast(bundle.get(1)), bundle.get_size(1)); get_matmul_opr(param)->exec(A_dst, B_, C_, workspace); @@ -377,22 +377,15 @@ SmallVector ConvolutionImpl::AlgoDefault:: algo->dispatch_preprocess_kerns(conv_bias_param); SmallVector convolution_preprocess_kerns; - //! Set the conv_bias param using convolution param - auto set_param_filter_workspace_ptr = - [](const NCBKernParam& conv_param, - ::ConvBiasImpl::NCBKernParam& conv_bias_param) { - conv_bias_param.filter_ptr = conv_param.filter_ptr; - conv_bias_param.workspace_ptr = conv_param.workspace_ptr; - conv_bias_param.workspace_size = conv_param.workspace_size; - }; for (size_t i = 0; i < conv_bias_preprocess_kerns.size(); i++) { auto kernel = conv_bias_preprocess_kerns[i]; //! If the kerenl batch parallel - auto run = [param = conv_bias_param, kernel, - &set_param_filter_workspace_ptr]( - const NCBKernParam& p, - const NCBKernIndex& ncb_index) mutable { - set_param_filter_workspace_ptr(p, param); + auto run = [conv_bias_param, kernel]( + const NCBKernParam& p, const NCBKernIndex& ncb_index) { + auto param = conv_bias_param; + param.filter_ptr = p.filter_ptr; + param.workspace_ptr = p.workspace_ptr; + param.workspace_size = p.workspace_size; kernel.kern(param, {ncb_index.thread_id, ncb_index.ndrange_id}); }; convolution_preprocess_kerns.push_back({run, kernel.global_size}); @@ -413,24 +406,17 @@ SmallVector ConvolutionImpl::AlgoDefault::get_kimpl( auto&& conv_bias_kerns = algo->dispatch_kerns(conv_bias_param); SmallVector convolution_kerns; - //! Set the conv_bias param using convolution param - auto set_copy_param_compute_address = - [](const NCBKernParam& conv_param, - ::ConvBiasImpl::NCBKernParam& conv_bias_param) { - conv_bias_param.src_ptr = conv_param.src_ptr; - conv_bias_param.filter_ptr = conv_param.filter_ptr; - conv_bias_param.dst_ptr = conv_param.dst_ptr; - conv_bias_param.workspace_ptr = conv_param.workspace_ptr; - conv_bias_param.workspace_size = conv_param.workspace_size; - }; for (size_t i = 0; i < conv_bias_kerns.size(); i++) { auto&& kernel = conv_bias_kerns[i]; //! 
If the kerenl batch parallel - auto run = [param = conv_bias_param, kernel, - &set_copy_param_compute_address]( - const NCBKernParam& p, - const NCBKernIndex& ncb_index) mutable { - set_copy_param_compute_address(p, param); + auto run = [conv_bias_param, kernel]( + const NCBKernParam& p, const NCBKernIndex& ncb_index) { + auto param = conv_bias_param; + param.src_ptr = p.src_ptr; + param.filter_ptr = p.filter_ptr; + param.dst_ptr = p.dst_ptr; + param.workspace_ptr = p.workspace_ptr; + param.workspace_size = p.workspace_size; kernel.kern(param, {ncb_index.thread_id, ncb_index.ndrange_id}); }; convolution_kerns.push_back({run, kernel.global_size}); diff --git a/dnn/src/fallback/convolution/algos.h b/dnn/src/fallback/convolution/algos.h index 60c9f390..73ea97ff 100644 --- a/dnn/src/fallback/convolution/algos.h +++ b/dnn/src/fallback/convolution/algos.h @@ -33,32 +33,34 @@ void kern_naive_forward( p.filter_type.size(); ptrdiff_t istrd = p.filter_meta.icpg * p.src_type.size(); ptrdiff_t ostrd = p.filter_meta.ocpg * p.dst_type.size(); - TensorND src, dst; - - src.layout.dtype = p.src_type; - dst.layout.dtype = p.dst_type; + TensorLayout src_layout, dst_layout; + src_layout.dtype = p.src_type; + dst_layout.dtype = p.dst_type; if (p.filter_meta.format == param::Convolution::Format::NCHW) { istrd *= p.isz[0] * p.isz[1]; ostrd *= p.osz[0] * p.osz[1]; - src.layout.init_contiguous_stride({1, IC, IH, IW}); - dst.layout.init_contiguous_stride({1, OC, OH, OW}); + src_layout.init_contiguous_stride({1, IC, IH, IW}); + dst_layout.init_contiguous_stride({1, OC, OH, OW}); } else { // Must be NHWC megdnn_assert( p.filter_meta.format == param::Convolution::Format::NHWC, "AlgoNaive only support NCHW and NHWC, not support format %d", static_cast(p.filter_meta.format)); - src.layout.init_contiguous_stride({1, IH, IW, IC}); - dst.layout.init_contiguous_stride({1, OH, OW, OC}); + src_layout.init_contiguous_stride({1, IH, IW, IC}); + dst_layout.init_contiguous_stride({1, OH, OW, OC}); } - src.raw_ptr = reinterpret_cast( - reinterpret_cast(p.src_ptr) + - batch_id * p.inp_bs * p.src_type.size() + group_id * istrd); - dst.raw_ptr = reinterpret_cast( - reinterpret_cast(p.dst_ptr) + - batch_id * p.out_bs * p.dst_type.size() + group_id * ostrd); + + RefPtr src_refp = p.src_ptr; + RefPtr dst_refp = p.dst_ptr; + + src_refp += (batch_id * p.inp_bs * p.src_type.size() + group_id * istrd); + dst_refp += (batch_id * p.out_bs * p.dst_type.size() + group_id * ostrd); + + TensorND src{src_layout, src_refp}, dst{dst_layout, dst_refp}; + ST* filter = reinterpret_cast( - reinterpret_cast(p.filter_ptr) + group_id * fstrd); + reinterpret_cast(p.filter_ptr.get_ptr()) + group_id * fstrd); std::copy(p.inp_s, p.inp_s + 4, src.layout.stride); std::copy(p.out_s, p.out_s + 4, dst.layout.stride); naive::convolution::forward(src, filter, dst, p.filter_meta); @@ -66,9 +68,9 @@ void kern_naive_forward( template void kern_naive(const ConvolutionBackwardDataImpl::NCBKernParam& p) { - TensorND diff(const_cast(p.diff_ptr), p.diff_layout), - filter(const_cast(p.filter_ptr), p.filter_layout), - grad(p.grad_ptr, p.grad_layout); + TensorND diff(const_cast(p.diff_ptr.get_ptr()), p.diff_layout), + filter(const_cast(p.filter_ptr.get_ptr()), p.filter_layout), + grad(p.grad_ptr.get_ptr(), p.grad_layout); naive::convolution::backward_data( filter, diff, grad, p.filter_meta); } diff --git a/dnn/src/fallback/convolution/opr_impl.cpp b/dnn/src/fallback/convolution/opr_impl.cpp index cd891818..1e1622db 100644 --- a/dnn/src/fallback/convolution/opr_impl.cpp +++ 
b/dnn/src/fallback/convolution/opr_impl.cpp @@ -263,9 +263,9 @@ ConvolutionImpl::NCBKernParam ConvolutionImpl::make_ncb_kern_param( NCBKernParam ret; static_cast(ret) = make_ncb_kern_size_param( src.layout, filter.layout, dst.layout, preprocessed_filter); - ret.src_ptr = src.raw_ptr; - ret.filter_ptr = filter.raw_ptr; - ret.dst_ptr = dst.raw_ptr; + ret.src_ptr = src.get_ref_ptr(); + ret.filter_ptr = filter.get_ref_ptr(); + ret.dst_ptr = dst.get_ref_ptr(); ret.workspace_ptr = workspace.raw_ptr; ret.workspace_size = workspace.size; return ret; @@ -607,9 +607,9 @@ ConvolutionBackwardDataImpl::NCBKernParam ConvolutionBackwardDataImpl:: workspace.size >= required_workspace_in_bytes, "required workspace: %zu; provided workspace: %zu", required_workspace_in_bytes, workspace.size); - ret.filter_ptr = filter.raw_ptr; - ret.diff_ptr = diff.raw_ptr; - ret.grad_ptr = grad.raw_ptr; + ret.filter_ptr = filter.get_ref_ptr(); + ret.diff_ptr = diff.get_ref_ptr(); + ret.grad_ptr = grad.get_ref_ptr(); ret.workspace_ptr = workspace.raw_ptr; ret.workspace_size = workspace.size; return ret; @@ -655,9 +655,9 @@ void ConvolutionBackwardDataImpl::exec_with_ncb_kern(const NCBKernParam& param) } for (size_t i = 0; i < group; ++i) { kptr(p1g); - incr_ptr(p1g.diff_ptr, istrd); - incr_ptr(p1g.filter_ptr, fstrd); - incr_ptr(p1g.grad_ptr, ostrd); + p1g.diff_ptr += istrd; + p1g.filter_ptr += fstrd; + p1g.grad_ptr += ostrd; p1g.diff_extra_mem_size -= istrd; p1g.filter_extra_mem_size -= fstrd; p1g.grad_extra_mem_size -= ostrd; diff --git a/dnn/src/fallback/convolution/opr_impl.h b/dnn/src/fallback/convolution/opr_impl.h index 145c4943..ebe5c548 100644 --- a/dnn/src/fallback/convolution/opr_impl.h +++ b/dnn/src/fallback/convolution/opr_impl.h @@ -115,28 +115,28 @@ public: //! memory param for kernels with non-contiguous batch struct NCBKernParam : public NCBKernSizeParam { - const void* src_ptr; - const void* filter_ptr; - void* dst_ptr; + RefPtr src_ptr; + RefPtr filter_ptr; + RefPtr dst_ptr; void* workspace_ptr; size_t workspace_size; template const T* src() const { src_type.assert_is_compatible_ctype(); - return static_cast(src_ptr); + return static_cast(src_ptr.get_ptr()); } template const T* filter() const { filter_type.assert_is_compatible_ctype(); - return static_cast(filter_ptr); + return static_cast(filter_ptr.get_ptr()); } template T* dst() const { dst_type.assert_is_compatible_ctype(); - return static_cast(dst_ptr); + return static_cast(dst_ptr.get_ptr()); } template @@ -154,7 +154,8 @@ public: size_t group_offset = group_pack_size * group_pack_id * filter_meta.ocpg * osz[0] * osz[1] * dst_type.size(); return reinterpret_cast( - reinterpret_cast(dst_ptr) + batch_offset + group_offset); + reinterpret_cast(dst_ptr.get_ptr()) + batch_offset + + group_offset); } template @@ -165,7 +166,8 @@ public: size_t group_offset = group_pack_size * group_pack_id * filter_meta.icpg * isz[0] * isz[1] * src_type.size(); return reinterpret_cast( - reinterpret_cast(src_ptr) + batch_offset + group_offset); + reinterpret_cast(src_ptr.get_ptr()) + batch_offset + + group_offset); } template @@ -174,7 +176,7 @@ public: filter_meta.ocpg * filter_meta.spatial[0] * filter_meta.spatial[1] * filter_type.size(); return reinterpret_cast( - reinterpret_cast(filter_ptr) + group_offset); + reinterpret_cast(filter_ptr.get_ptr()) + group_offset); } }; @@ -356,28 +358,28 @@ public: //! 
memory param for kernels with non-contiguous batch struct NCBKernParam : public NCBKernSizeParam { - const void* filter_ptr; - const void* diff_ptr; - void* grad_ptr; + RefPtr filter_ptr; + RefPtr diff_ptr; + RefPtr grad_ptr; void* workspace_ptr; size_t workspace_size; template const T* diff() const { diff_type.assert_is_compatible_ctype(); - return static_cast(diff_ptr); + return static_cast(diff_ptr.get_ptr()); } template const T* filter() const { filter_type.assert_is_compatible_ctype(); - return static_cast(filter_ptr); + return static_cast(filter_ptr.get_ptr()); } template T* grad() const { grad_type.assert_is_compatible_ctype(); - return static_cast(grad_ptr); + return static_cast(grad_ptr.get_ptr()); } template diff --git a/dnn/src/fallback/elemwise/opr_impl.cpp b/dnn/src/fallback/elemwise/opr_impl.cpp index 5832e57e..a9fd7815 100644 --- a/dnn/src/fallback/elemwise/opr_impl.cpp +++ b/dnn/src/fallback/elemwise/opr_impl.cpp @@ -31,17 +31,18 @@ void ElemwiseImpl::unary_kern(const ElemwiseOpParamN<1>& param) { using ctype = typename DTypeTrait::ctype; using Kern = ElemwiseKern; MIDOUT_BEGIN(megdnn_fallback_elemwise_unary, ctype, midout_iv(mode)) { - ctype* __restrict src = param[0].ptr(); - ctype* __restrict dst = m_dst->ptr(); - // only specialize for the most common 1-dim case + auto tot = param.size; + auto stride = param[0].layout.stride[0]; + auto src0 = param[0]; + auto dst_tensor = *m_dst; if (param.max_ndim == 1) { MIDOUT_BEGIN( megdnn_fallback_elemwise_unary, ctype, midout_iv(mode), midout_iv(1)) { - auto tot = param.size; - auto stride = param[0].layout.stride[0]; MEGDNN_DISPATCH_CPU_KERN_OPR({ + ctype* __restrict src = static_cast(src0.raw_ptr()); + ctype* __restrict dst = static_cast(dst_tensor.raw_ptr()); for (size_t i = 0; i < tot; ++i) { dst[i] = Kern::apply(src[i * stride]); } @@ -61,17 +62,20 @@ void ElemwiseImpl::binary_kern(const ElemwiseOpParamN<2>& param) { using Kern = ElemwiseKern; MIDOUT_BEGIN(megdnn_fallback_elemwise_binary, ctype, midout_iv(mode)) { - ctype* __restrict a = param[0].ptr(); - ctype* __restrict b = param[1].ptr(); - ctype* __restrict dst = m_dst->ptr(); - if (param.max_ndim == 1) { MIDOUT_BEGIN( megdnn_fallback_elemwise_binary, ctype, midout_iv(mode), midout_iv(1)) { auto tot = param.size; auto as = param[0].layout.stride[0], bs = param[1].layout.stride[0]; + auto src0 = param[0]; + auto src1 = param[1]; + auto dst_tensor = *m_dst; + MEGDNN_DISPATCH_CPU_KERN_OPR({ + ctype* __restrict a = static_cast(src0.raw_ptr()); + ctype* __restrict b = static_cast(src1.raw_ptr()); + ctype* __restrict dst = static_cast(dst_tensor.raw_ptr()); for (size_t i = 0; i < tot; ++i) { dst[i] = Kern::apply(a[i * as], b[i * bs]); } @@ -94,7 +98,15 @@ void ElemwiseImpl::binary_kern(const ElemwiseOpParamN<2>& param) { bs0 = param[1].layout.stride[0], bs1 = param[1].layout.stride[1]; auto n0 = param[1].layout.shape[0], n1 = param[1].layout.shape[1]; + auto src0 = param[0]; + auto src1 = param[1]; + auto dst_tensor = *m_dst; + MEGDNN_DISPATCH_CPU_KERN_OPR({ + ctype* __restrict a = static_cast(src0.raw_ptr()); + ctype* __restrict b = static_cast(src1.raw_ptr()); + ctype* __restrict dst = + static_cast(dst_tensor.raw_ptr()); ptrdiff_t toff = 0; for (size_t i = 0; i < n0; ++i) { for (size_t j = 0; j < n1; ++j) { @@ -116,8 +128,14 @@ void ElemwiseImpl::binary_kern(const ElemwiseOpParamN<2>& param) { auto bs = param[1].layout.stride[0], as0 = param[0].layout.stride[0], as1 = param[0].layout.stride[1]; auto n0 = param[0].layout.shape[0], n1 = param[0].layout.shape[1]; + auto src0 = 
param[0]; + auto src1 = param[1]; + auto dst_tensor = *m_dst; MEGDNN_DISPATCH_CPU_KERN_OPR({ + ctype* __restrict a = static_cast(src0.raw_ptr()); + ctype* __restrict b = static_cast(src1.raw_ptr()); + ctype* __restrict dst = static_cast(dst_tensor.raw_ptr()); ptrdiff_t toff = 0; for (size_t i = 0; i < n0; ++i) { for (size_t j = 0; j < n1; ++j) { @@ -143,7 +161,15 @@ void ElemwiseImpl::binary_kern(const ElemwiseOpParamN<2>& param) { auto as = param[0].layout.stride[0], bs = param[1].layout.stride[1]; auto n0 = param[1].layout.shape[0], n1 = param[1].layout.shape[1], n2 = param[1].layout.shape[2]; + auto src0 = param[0]; + auto src1 = param[1]; + auto dst_tensor = *m_dst; + MEGDNN_DISPATCH_CPU_KERN_OPR({ + ctype* __restrict a = static_cast(src0.raw_ptr()); + ctype* __restrict b = static_cast(src1.raw_ptr()); + ctype* __restrict dst = + static_cast(dst_tensor.raw_ptr()); size_t toff = 0; for (size_t i = 0; i < n0; ++i) { for (size_t j = 0; j < n1; ++j) { @@ -165,7 +191,14 @@ void ElemwiseImpl::binary_kern(const ElemwiseOpParamN<2>& param) { auto as = param[0].layout.stride[1], bs = param[1].layout.stride[0]; auto n0 = param[0].layout.shape[0], n1 = param[0].layout.shape[1], n2 = param[0].layout.shape[2]; + auto src0 = param[0]; + auto src1 = param[1]; + auto dst_tensor = *m_dst; MEGDNN_DISPATCH_CPU_KERN_OPR({ + ctype* __restrict a = static_cast(src0.raw_ptr()); + ctype* __restrict b = static_cast(src1.raw_ptr()); + ctype* __restrict dst = + static_cast(dst_tensor.raw_ptr()); size_t toff = 0; for (size_t i = 0; i < n0; ++i) { for (size_t j = 0; j < n1; ++j) { @@ -188,8 +221,9 @@ void ElemwiseImpl::binary_kern(const ElemwiseOpParamN<2>& param) { } void ElemwiseImpl::exec(const TensorNDArray& srcs, _megdnn_tensor_out dst) { - if (!dst.layout.is_contiguous()) + if (!dst.layout.is_contiguous()) { return naive::ElemwiseForwardImpl::exec(srcs, dst); + } m_src = &srcs; m_dst = &dst; diff --git a/dnn/src/fallback/elemwise_multi_type/opr_impl.cpp b/dnn/src/fallback/elemwise_multi_type/opr_impl.cpp index 956390df..95dbbc22 100644 --- a/dnn/src/fallback/elemwise_multi_type/opr_impl.cpp +++ b/dnn/src/fallback/elemwise_multi_type/opr_impl.cpp @@ -18,20 +18,20 @@ using namespace megdnn; using namespace fallback; void ElemwiseMultiTypeImpl::on_fuse_mul_add3_int16x32x32x32( - const ElemwiseOpParamN<3>& param, dt_int32* dst) { + const ElemwiseOpParamN<3>& param, const TensorND& dst) { BroadcastChannelInfo binfo0, binfo1; if (is_vector(param[0].layout) && is_broadcasted_channel_like(param[1].layout, binfo0) && is_broadcasted_channel_like(param[2].layout, binfo1) && binfo0 == binfo1) { - auto pa = param[0].ptr(); - auto pb = param[1].ptr(); - auto pc = param[2].ptr(); auto x = binfo0.x, y = binfo0.y, z = binfo0.z; - auto work = [pa, pb, pc, dst, x, y, z]() { - const dt_int16* __restrict__ a = pa; - const dt_int32* __restrict__ b = pb; - const dt_int32* __restrict__ c = pc; - dt_int32* __restrict__ d = dst; + auto src0 = param[0]; + auto src1 = param[1]; + auto src2 = param[2]; + auto work = [=]() { + const dt_int16* __restrict__ a = static_cast(src0.raw_ptr()); + const dt_int32* __restrict__ b = static_cast(src1.raw_ptr()); + const dt_int32* __restrict__ c = static_cast(src2.raw_ptr()); + dt_int32* __restrict__ d = dst.ptr(); for (size_t j = 0; j < y; ++j) { auto bv = b[j], cv = c[j]; for (size_t i = 0; i < x; ++i) { @@ -58,17 +58,18 @@ void ElemwiseMultiTypeImpl::on_fuse_mul_add3_int16x32x32x32( template void ElemwiseMultiTypeImpl::dispatch_fma3_iXxf32xf32xi8_bcast_1x( - const ElemwiseOpParamN<3>& param, const 
Broadcast1xInfo& binfo, dt_int8* dst) { - auto pa = param[0].ptr(); - auto pb = param[1].ptr(); - auto pc = param[2].ptr(); + const ElemwiseOpParamN<3>& param, const Broadcast1xInfo& binfo, + const TensorND& dst) { size_t x = binfo.x, y = binfo.y; - auto work = [pa, pb, pc, dst, x, y]() { + auto src0 = param[0]; + auto src1 = param[1]; + auto src2 = param[2]; + auto work = [=]() { elemwise_multi_type::Fma3iXxf32xf32xiYOp op; - const ctype* __restrict__ a = pa; - const dt_float32* __restrict__ b = pb; - const dt_float32* __restrict__ c = pc; - dt_int8* __restrict__ d = dst; + const ctype* __restrict__ a = src0.ptr(); + const dt_float32* __restrict__ b = static_cast(src1.raw_ptr()); + const dt_float32* __restrict__ c = static_cast(src2.raw_ptr()); + dt_int8* __restrict__ d = dst.ptr(); for (size_t i = 0; i < x; ++i) { size_t j = 0; for (; j + 4 <= y; j += 4) { @@ -90,7 +91,7 @@ void ElemwiseMultiTypeImpl::dispatch_fma3_iXxf32xf32xi8_bcast_1x( } void ElemwiseMultiTypeImpl::on_fuse_mul_add3_iXxf32xf32xi8( - const ElemwiseOpParamN<3>& param, dt_int8* dst) { + const ElemwiseOpParamN<3>& param, const TensorND& dst) { Broadcast1xInfo binfo0, binfo1; if (is_vector(param[0].layout) && is_broadcasted_1x(param[1].layout, binfo0) && is_broadcasted_1x(param[2].layout, binfo1) && binfo0 == binfo1) { @@ -107,27 +108,18 @@ void ElemwiseMultiTypeImpl::on_fuse_mul_add3_iXxf32xf32xi8( } // fallback to naive - switch (param[0].layout.dtype.enumv()) { -#define cb(t) \ - case DTypeTrait::enumv: \ - return naive::ElemwiseMultiTypeImpl::dispatch_fma3_iXxf32xf32xi8< \ - DTypeTrait::ctype>(param, dst); - MEGDNN_FOREACH_COMPUTING_DTYPE_INT(cb) -#undef cb - default: - megdnn_throw("unsupported src dtype"); - } + naive::ElemwiseMultiTypeImpl::on_fuse_mul_add3_iXxf32xf32xi8(param, dst); } template void ElemwiseMultiTypeImpl::dispatch_round_shr_saturate_iXxi8xiX_bcast_scalar( - const ElemwiseOpParamN<2>& param, dst_ctype* dst) { - auto x_ptr = param[0].ptr(); + const ElemwiseOpParamN<2>& param, const TensorND& dst) { + auto src = param[0]; auto k = param[1].ptr()[0]; size_t size = param.size; - auto work = [x_ptr, k, size, dst]() { - const ctype* __restrict__ xp = x_ptr; - dst_ctype* __restrict__ dp = dst; + auto work = [src, k, size, dst]() { + const ctype* __restrict__ xp = src.ptr(); + dst_ctype* __restrict__ dp = dst.ptr(); for (size_t i = 0; i < size; i++) { dp[i] = elemwise_multi_type::round_shr_saturate(xp[i], k); } @@ -137,7 +129,7 @@ void ElemwiseMultiTypeImpl::dispatch_round_shr_saturate_iXxi8xiX_bcast_scalar( } void ElemwiseMultiTypeImpl::on_round_shr_saturate_iXxi8xi8( - const ElemwiseOpParamN<2>& param, megdnn::dt_int8* dst) { + const ElemwiseOpParamN<2>& param, const TensorND& dst) { if (is_vector(param[0].layout) && is_broadcasted_scalar(param[1].layout)) { switch (param[0].layout.dtype.enumv()) { #define cb(t) \ @@ -157,7 +149,7 @@ void ElemwiseMultiTypeImpl::on_round_shr_saturate_iXxi8xi8( } void ElemwiseMultiTypeImpl::on_round_shr_saturate_iXxi8xi16( - const ElemwiseOpParamN<2>& param, megdnn::dt_int16* dst) { + const ElemwiseOpParamN<2>& param, const TensorND& dst) { if (is_vector(param[0].layout) && is_broadcasted_scalar(param[1].layout)) { switch (param[0].layout.dtype.enumv()) { #define cb(t) \ @@ -179,23 +171,25 @@ void ElemwiseMultiTypeImpl::on_round_shr_saturate_iXxi8xi16( template void ElemwiseMultiTypeImpl::dispatch_fuse_add_rmulh_round_shr_saturate_bcast_1c11( - const ElemwiseOpParamN<6>& param, megdnn::dt_int8* dst, + const ElemwiseOpParamN<6>& param, const TensorND& dst, const 
BroadcastChannelInfo& broadcast_info) { - auto work = [param, dst, broadcast_info]() { - auto x_ptr = param[0].ptr(); - auto b_ptr = param[1].ptr(); - auto M = param[2].ptr()[0]; - auto k = param[3].ptr()[0]; - auto minv = param[4].ptr()[0]; - auto maxv = param[5].ptr()[0]; - auto dst_ptr = dst; + auto x = param[0]; + auto b = param[1]; + auto M = param[2].ptr()[0]; + auto k = param[3].ptr()[0]; + auto minv = param[4].ptr()[0]; + auto maxv = param[5].ptr()[0]; + auto work = [=]() { auto batch_stride = broadcast_info.y * broadcast_info.z; auto channel_stride = broadcast_info.z; + auto x_ptr = static_cast(x.raw_ptr()); + auto dst_ptr = static_cast(dst.raw_ptr()); for (size_t n = 0; n < broadcast_info.x; n++) { const ctype* __restrict__ xp = x_ptr; + auto b_ptr = static_cast(b.raw_ptr()); dt_int8* __restrict__ dp = dst_ptr; for (size_t chan = 0; chan < broadcast_info.y; chan++) { - const ctype bias = b_ptr[chan * param[1].layout.stride[1]]; + const ctype bias = b_ptr[chan * b.layout.stride[1]]; for (size_t i = 0; i < broadcast_info.z; i++) { auto res = elemwise_multi_type::round_shr_saturate( round_mulh_saturate(xp[i] + bias, M), k); @@ -215,7 +209,7 @@ void ElemwiseMultiTypeImpl::dispatch_fuse_add_rmulh_round_shr_saturate_bcast_1c1 } void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8( - const ElemwiseOpParamN<6>& param, megdnn::dt_int8* dst) { + const ElemwiseOpParamN<6>& param, const TensorND& dst) { bool all_scalar = true; for (int i = 3; i < 6; i++) { all_scalar &= is_broadcasted_scalar(param[i].layout); @@ -229,7 +223,7 @@ void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8( } void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int32x32x32x8( - const ElemwiseOpParamN<6>& param, megdnn::dt_int8* dst) { + const ElemwiseOpParamN<6>& param, const TensorND& dst) { bool all_scalar = true; for (int i = 3; i < 6; i++) { all_scalar &= is_broadcasted_scalar(param[i].layout); diff --git a/dnn/src/fallback/elemwise_multi_type/opr_impl.h b/dnn/src/fallback/elemwise_multi_type/opr_impl.h index 93cc1e4f..91df43fe 100644 --- a/dnn/src/fallback/elemwise_multi_type/opr_impl.h +++ b/dnn/src/fallback/elemwise_multi_type/opr_impl.h @@ -20,29 +20,29 @@ class ElemwiseMultiTypeImpl : public naive::ElemwiseMultiTypeImpl { template void dispatch_fma3_iXxf32xf32xi8_bcast_1x( const ElemwiseOpParamN<3>& param, const Broadcast1xInfo& binfo, - dt_int8* dst); + const TensorND& dst); template void dispatch_round_shr_saturate_iXxi8xiX_bcast_scalar( - const ElemwiseOpParamN<2>& param, dst_ctype* dst); + const ElemwiseOpParamN<2>& param, const TensorND& dst); template void dispatch_fuse_add_rmulh_round_shr_saturate_bcast_1c11( - const ElemwiseOpParamN<6>& param, megdnn::dt_int8* dst, + const ElemwiseOpParamN<6>& param, const TensorND& dst, const BroadcastChannelInfo& broadcast_info); protected: void on_fuse_mul_add3_int16x32x32x32( - const ElemwiseOpParamN<3>& param, dt_int32* dst) override; + const ElemwiseOpParamN<3>& param, const TensorND& dst) override; void on_fuse_mul_add3_iXxf32xf32xi8( - const ElemwiseOpParamN<3>& param, dt_int8* dst) override; + const ElemwiseOpParamN<3>& param, const TensorND& dst) override; void on_round_shr_saturate_iXxi8xi8( - const ElemwiseOpParamN<2>& param, dt_int8* dst) override; + const ElemwiseOpParamN<2>& param, const TensorND& dst) override; void on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8( - const ElemwiseOpParamN<6>& param, dt_int8* dst) override; + const ElemwiseOpParamN<6>& param, const TensorND& dst) 
override; void on_fuse_add_rmulh_round_shr_saturate_int32x32x32x8( - const ElemwiseOpParamN<6>& param, dt_int8* dst) override; + const ElemwiseOpParamN<6>& param, const TensorND& dst) override; void on_round_shr_saturate_iXxi8xi16( - const ElemwiseOpParamN<2>& param, dt_int16* dst) override; + const ElemwiseOpParamN<2>& param, const TensorND& dst) override; public: using naive::ElemwiseMultiTypeImpl::ElemwiseMultiTypeImpl; diff --git a/dnn/src/fallback/group_local/opr_impl.cpp b/dnn/src/fallback/group_local/opr_impl.cpp index c5c371e5..77ed7a97 100644 --- a/dnn/src/fallback/group_local/opr_impl.cpp +++ b/dnn/src/fallback/group_local/opr_impl.cpp @@ -58,19 +58,18 @@ void GroupLocalImpl::exec( auto kern = [fp, nr_group, kptr, flt_gstride, data_type_size_in_bytes]() { auto cur_fp = fp; + cur_fp.src = RefPtr(); + cur_fp.filter = RefPtr(); + cur_fp.dst = RefPtr(); rep(g, nr_group) { auto ic = g * fp.ic; auto oc = g * fp.oc; - const int8_t* sptr_tmp = reinterpret_cast(fp.src); - const int8_t* fptr_tmp = reinterpret_cast(fp.filter); - int8_t* dptr_tmp = reinterpret_cast(fp.dst); - - sptr_tmp = sptr_tmp + ic * fp.ih * fp.iw * data_type_size_in_bytes; - fptr_tmp = fptr_tmp + g * flt_gstride * data_type_size_in_bytes; - dptr_tmp = dptr_tmp + oc * fp.oh * fp.ow * data_type_size_in_bytes; - cur_fp.src = static_cast(sptr_tmp); - cur_fp.filter = static_cast(fptr_tmp); - cur_fp.dst = static_cast(dptr_tmp); + cur_fp.src = fp.src; + cur_fp.filter = fp.filter; + cur_fp.dst = fp.dst; + cur_fp.src += ic * fp.ih * fp.iw * data_type_size_in_bytes; + cur_fp.filter += g * flt_gstride * data_type_size_in_bytes; + cur_fp.dst += oc * fp.oh * fp.ow * data_type_size_in_bytes; kptr(cur_fp); } }; diff --git a/dnn/src/fallback/matrix_mul/algos.cpp b/dnn/src/fallback/matrix_mul/algos.cpp index b0b995f3..da02af79 100644 --- a/dnn/src/fallback/matrix_mul/algos.cpp +++ b/dnn/src/fallback/matrix_mul/algos.cpp @@ -66,14 +66,14 @@ void kern_naive(const MatrixMulImpl::KernParam& kern_param) { "M and N must time of pack_size M: %zu N: %zu pack_size: %zu", M, N, pack_size); -#define DISPATCH(TA, TB) \ - if (kern_param.trA == TA && kern_param.trB == TB) { \ - naive::dispatch_ta_tb( \ - kern_param.A_ptr, kern_param.B_ptr, kern_param.C_ptr, \ - kern_param.workspace_ptr, M / pack_size, N, K / pack_size, LDA, LDB, \ - LDC, kern_param.A_type, kern_param.B_type, kern_param.C_type, \ - kern_param.format, kern_param.compute_mode); \ - return; \ +#define DISPATCH(TA, TB) \ + if (kern_param.trA == TA && kern_param.trB == TB) { \ + naive::dispatch_ta_tb( \ + kern_param.A_ptr.get_ptr(), kern_param.B_ptr.get_ptr(), \ + kern_param.C_ptr.get_ptr(), kern_param.workspace_ptr, M / pack_size, \ + N, K / pack_size, LDA, LDB, LDC, kern_param.A_type, kern_param.B_type, \ + kern_param.C_type, kern_param.format, kern_param.compute_mode); \ + return; \ } DISPATCH(true, true); DISPATCH(true, false); diff --git a/dnn/src/fallback/matrix_mul/opr_impl.cpp b/dnn/src/fallback/matrix_mul/opr_impl.cpp index 7622c850..3a46d5a0 100644 --- a/dnn/src/fallback/matrix_mul/opr_impl.cpp +++ b/dnn/src/fallback/matrix_mul/opr_impl.cpp @@ -215,9 +215,9 @@ MatrixMulImpl::KernParam MatrixMulImpl::make_kern_param( KernParam kern_param; static_cast(kern_param) = make_kern_size_param(A.layout, B.layout, C.layout); - kern_param.A_ptr = A.raw_ptr; - kern_param.B_ptr = B.raw_ptr; - kern_param.C_ptr = C.raw_ptr; + kern_param.A_ptr = A.get_ref_ptr(); + kern_param.B_ptr = B.get_ref_ptr(); + kern_param.C_ptr = C.get_ref_ptr(); kern_param.workspace_ptr = workspace.raw_ptr; 
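The make_kern_param and KernParam hunks around this point all apply one idea: kern params stop carrying raw `const void*` / `void*` addresses and instead carry RefPtr handles that are resolved to a typed pointer only at use time through get_ptr(). A minimal sketch of that idea follows; RefPtrSketch and KernParamSketch are illustrative stand-ins, not the real MegDNN classes, and the shared-slot behaviour is an assumption made only to keep the sketch self-contained.

#include <cstddef>
#include <memory>

// A shared, re-seatable storage slot plus a byte offset: a param copied into a
// deferred kernel still observes a later reset of the underlying buffer.
class RefPtrSketch {
    std::shared_ptr<void*> m_slot;
    size_t m_offset = 0;

public:
    RefPtrSketch() : m_slot(std::make_shared<void*>(nullptr)) {}
    explicit RefPtrSketch(void* p) : m_slot(std::make_shared<void*>(p)) {}
    void reset(void* p) { *m_slot = p; }
    RefPtrSketch& operator+=(size_t bytes) {
        m_offset += bytes;
        return *this;
    }
    void* get_ptr() const { return static_cast<char*>(*m_slot) + m_offset; }
};

// Typed accessors then just cast the late-resolved address, mirroring the
// A<T>() / src<T>() / filter<T>() helpers in the hunks above and below.
struct KernParamSketch {
    RefPtrSketch A_ptr, B_ptr, C_ptr;
    template <typename T>
    const T* A() const {
        return static_cast<const T*>(A_ptr.get_ptr());
    }
    template <typename T>
    T* C() const {
        return static_cast<T*>(C_ptr.get_ptr());
    }
};

Because every copy of RefPtrSketch shares the slot, re-seating it through the original handle is seen by params that were copied into already-queued kernels, which is the property these hunks appear to rely on.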
kern_param.workspace_size = workspace.size; return kern_param; diff --git a/dnn/src/fallback/matrix_mul/opr_impl.h b/dnn/src/fallback/matrix_mul/opr_impl.h index ec11f719..b74de936 100644 --- a/dnn/src/fallback/matrix_mul/opr_impl.h +++ b/dnn/src/fallback/matrix_mul/opr_impl.h @@ -51,28 +51,28 @@ public: }; struct KernParam : public KernSizeParam { - const void* A_ptr; - const void* B_ptr; - void* C_ptr; - void* workspace_ptr; - size_t workspace_size; + RefPtr A_ptr; + RefPtr B_ptr; + RefPtr C_ptr; + void* workspace_ptr = nullptr; + size_t workspace_size = 0; template inline const T* A() const { // A_type.assert_is_compatible_ctype(); - return static_cast(A_ptr); + return static_cast(A_ptr.get_ptr()); } template inline const T* B() const { // B_type.assert_is_compatible_ctype(); - return static_cast(B_ptr); + return static_cast(B_ptr.get_ptr()); } template inline T* C() const { // C_type.assert_is_compatible_ctype(); - return static_cast(C_ptr); + return static_cast(C_ptr.get_ptr()); } template inline T* workspace() const { diff --git a/dnn/src/fallback/pooling/opr_impl.cpp b/dnn/src/fallback/pooling/opr_impl.cpp index 17855ce1..3e9f8524 100644 --- a/dnn/src/fallback/pooling/opr_impl.cpp +++ b/dnn/src/fallback/pooling/opr_impl.cpp @@ -143,15 +143,16 @@ void w2x2_s2x2_avg_int8( namespace megdnn { namespace fallback { -void PoolingImpl::exec_w3x3_s1x1(_megdnn_tensor_in src, _megdnn_tensor_out dst) { +void PoolingImpl::exec_w3x3_s1x1( + _megdnn_tensor_in src, _megdnn_tensor_out dst, const Param& param) { auto N = src.layout.shape[0], C = src.layout.shape[1]; auto IH = src.layout.shape[2], IW = src.layout.shape[3]; auto OH = dst.layout.shape[2], OW = dst.layout.shape[3]; for (size_t nc = 0; nc < N * C; ++nc) { pooling::w3x3_s1x1( src.ptr() + nc * IH * IW, - dst.ptr() + nc * OH * OW, IH, IW, OH, OW, param().pad_h, - param().pad_w); + dst.ptr() + nc * OH * OW, IH, IW, OH, OW, param.pad_h, + param.pad_w); } } @@ -180,22 +181,23 @@ void PoolingImpl::exec_w2x2_s2x2_avg_int8( void PoolingImpl::exec( _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) { + Param param = this->param(); check_exec(src.layout, dst.layout, workspace.size); - if (src.layout.dtype == dtype::Float32() && param().format == Param::Format::NCHW && - param().mode == Mode::MAX && param().window_h == 3 && param().window_w == 3 && - param().stride_h == 1 && param().stride_w == 1 && param().pad_h <= 2 && - param().pad_w <= 2) { + if (src.layout.dtype == dtype::Float32() && param.format == Param::Format::NCHW && + param.mode == Mode::MAX && param.window_h == 3 && param.window_w == 3 && + param.stride_h == 1 && param.stride_w == 1 && param.pad_h <= 2 && + param.pad_w <= 2) { MIDOUT_BEGIN(megdnn_fallback_pooling, midout_iv(0)) { - MEGDNN_DISPATCH_CPU_KERN_OPR(exec_w3x3_s1x1(src, dst)); + MEGDNN_DISPATCH_CPU_KERN_OPR(exec_w3x3_s1x1(src, dst, param)); } MIDOUT_END(); return; } // regular int conv case - if (src.layout.dtype == dtype::Int8() && param().mode == Mode::MAX && - param().format == Param::Format::NCHW && param().window_h == 2 && - param().window_w == 2 && param().stride_h == 2 && param().stride_w == 2 && - param().pad_h == 0 && param().pad_w == 0) { + if (src.layout.dtype == dtype::Int8() && param.mode == Mode::MAX && + param.format == Param::Format::NCHW && param.window_h == 2 && + param.window_w == 2 && param.stride_h == 2 && param.stride_w == 2 && + param.pad_h == 0 && param.pad_w == 0) { MIDOUT_BEGIN(megdnn_fallback_pooling, midout_iv(1)) { MEGDNN_DISPATCH_CPU_KERN_OPR(exec_w2x2_s2x2_int8(src, dst)); 
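The pooling rewrite above shows the second recurring rule in this patch: whatever MEGDNN_DISPATCH_CPU_KERN_OPR wraps is handed to the handle's dispatcher and may run after exec() has returned, so the kernel should capture a copy of the operator's param instead of calling param() again on the worker. A small sketch of that discipline under a toy dispatcher; DispatchQueueSketch, PoolParamSketch and Pooling1DSketch are made-up names, not MegDNN APIs.

#include <algorithm>
#include <cstddef>
#include <functional>
#include <vector>

struct DispatchQueueSketch {
    std::vector<std::function<void()>> pending;  // drained later by a worker
    void dispatch(std::function<void()> kern) { pending.push_back(std::move(kern)); }
};

struct PoolParamSketch {
    size_t window = 3;
    size_t stride = 1;
};

struct Pooling1DSketch {
    PoolParamSketch m_param;
    void exec(DispatchQueueSketch& q, const float* src, float* dst, size_t n) {
        PoolParamSketch param = m_param;  // snapshot taken on the caller thread
        q.dispatch([param, src, dst, n]() {
            // the deferred kernel touches only the captured copy
            size_t out = 0;
            for (size_t i = 0; i + param.window <= n; i += param.stride, ++out) {
                float cur = src[i];
                for (size_t k = 1; k < param.window; ++k)
                    cur = std::max(cur, src[i + k]);
                dst[out] = cur;
            }
        });
    }
};

The rotate and add_update hunks elsewhere in this patch hoist param().clockwise and m_param into locals for the same reason.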
} @@ -203,10 +205,10 @@ void PoolingImpl::exec( return; } // int8 2x2 AVERAGE case - if (src.layout.dtype == dtype::Int8() && param().mode == Mode::AVERAGE && - param().format == Param::Format::NCHW && param().window_h == 2 && - param().window_w == 2 && param().stride_h == 2 && param().stride_w == 2 && - param().pad_h == 0 && param().pad_w == 0) { + if (src.layout.dtype == dtype::Int8() && param.mode == Mode::AVERAGE && + param.format == Param::Format::NCHW && param.window_h == 2 && + param.window_w == 2 && param.stride_h == 2 && param.stride_w == 2 && + param.pad_h == 0 && param.pad_w == 0) { MIDOUT_BEGIN(megdnn_fallback_pooling, midout_iv(2)) { MEGDNN_DISPATCH_CPU_KERN_OPR(exec_w2x2_s2x2_avg_int8(src, dst)); } diff --git a/dnn/src/fallback/pooling/opr_impl.h b/dnn/src/fallback/pooling/opr_impl.h index 92641dbb..a44277d8 100644 --- a/dnn/src/fallback/pooling/opr_impl.h +++ b/dnn/src/fallback/pooling/opr_impl.h @@ -19,12 +19,14 @@ namespace fallback { class PoolingImpl : public naive::PoolingForwardImpl { public: using naive::PoolingForwardImpl::PoolingForwardImpl; + using Param = param::Pooling; void exec( _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) override; private: - void exec_w3x3_s1x1(_megdnn_tensor_in src, _megdnn_tensor_out dst); + void exec_w3x3_s1x1( + _megdnn_tensor_in src, _megdnn_tensor_out dst, const Param& param); void exec_w2x2_s2x2_int8(_megdnn_tensor_in src, _megdnn_tensor_out dst); void exec_w2x2_s2x2_avg_int8(_megdnn_tensor_in src, _megdnn_tensor_out dst); }; diff --git a/dnn/src/fallback/powc/opr_impl.cpp b/dnn/src/fallback/powc/opr_impl.cpp index 6e68ddf3..623316bb 100644 --- a/dnn/src/fallback/powc/opr_impl.cpp +++ b/dnn/src/fallback/powc/opr_impl.cpp @@ -189,17 +189,17 @@ void PowCImpl::do_exec_ct( _megdnn_tensor_in src, _megdnn_tensor_out dst, const float* exp_f, const int* exp_i) { auto handle = static_cast(this->handle()); - auto sptr = reinterpret_cast(src.raw_ptr); - auto dptr = reinterpret_cast(dst.raw_ptr); auto size = src.layout.total_nr_elems(); -#define CALL(_expfunc) \ - do { \ - auto kern = [sptr, dptr, size, expfunc = _expfunc]() { \ - pow_invoke(sptr, dptr, size, expfunc); \ - }; \ - handle->dispatch_kern(kern); \ - return; \ +#define CALL(_expfunc) \ + do { \ + auto kern = [src, dst, size, expfunc = _expfunc]() { \ + auto sptr = reinterpret_cast(src.raw_ptr()); \ + auto dptr = reinterpret_cast(dst.raw_ptr()); \ + pow_invoke(sptr, dptr, size, expfunc); \ + }; \ + handle->dispatch_kern(kern); \ + return; \ } while (0) if (exp_f) { float fv = *exp_f; diff --git a/dnn/src/fallback/reduce/opr_impl.cpp b/dnn/src/fallback/reduce/opr_impl.cpp index 0c420385..78192424 100644 --- a/dnn/src/fallback/reduce/opr_impl.cpp +++ b/dnn/src/fallback/reduce/opr_impl.cpp @@ -82,15 +82,14 @@ void ReduceImpl::exec( check_exec(src.layout, dst.layout, workspace.size); size_t A, B, C; get_ABC(src.layout, A, B, C, param().axis); -#define cb_by_op(src_type, dst_type, _wtype, mode_, Op_, kern_func) \ - if (param().mode == mode_) { \ - typedef DTypeTrait::ctype src_ctype; \ - typedef DTypeTrait::ctype dst_ctype; \ - typedef DTypeTrait<_wtype>::ctype wtype; \ - Op_ op( \ - src.ptr(), dst.ptr(), B); \ - MEGDNN_DISPATCH_CPU_KERN_OPR(kern_func); \ - return; \ +#define cb_by_op(src_type, dst_type, _wtype, mode_, Op_, kern_func) \ + if (param().mode == mode_) { \ + typedef DTypeTrait::ctype src_ctype; \ + typedef DTypeTrait::ctype dst_ctype; \ + typedef DTypeTrait<_wtype>::ctype wtype; \ + Op_ op(src.get_ref_ptr(), dst.get_ref_ptr(), B); \ + 
MEGDNN_DISPATCH_CPU_KERN_OPR({ kern_func; }); \ + return; \ } #define cb_by_dtype(dtype_, kern_func, type_tuple) \ if (dtype_() == src.layout.dtype) { \ diff --git a/dnn/src/fallback/relayout/opr_impl.cpp b/dnn/src/fallback/relayout/opr_impl.cpp index d301acbc..b2b878e8 100644 --- a/dnn/src/fallback/relayout/opr_impl.cpp +++ b/dnn/src/fallback/relayout/opr_impl.cpp @@ -61,8 +61,6 @@ template void dispatch_on_dtype_cont( Handle* handle, const TensorND& cont, const TensorND& nonc, memcpy_policy_t mcp_pol) { - auto ctptr = static_cast(cont.raw_ptr), - ncptr = static_cast(nonc.raw_ptr); thin_function kern; switch (nonc.layout.ndim) { case 2: { @@ -70,8 +68,8 @@ void dispatch_on_dtype_cont( auto strd0_n = nonc.layout.stride[0] * sizeof(ctype); auto strd0_c = shp1 * sizeof(ctype); kern = [=]() { - auto cur_ctptr = ctptr; - auto cur_ncptr = ncptr; + auto cur_ctptr = static_cast(cont.raw_ptr()); + auto cur_ncptr = static_cast(nonc.raw_ptr()); for (size_t i = 0; i < shp0; ++i) { mcp_pol(cur_ctptr, cur_ncptr, strd0_c); cur_ctptr += strd0_c; @@ -87,8 +85,8 @@ void dispatch_on_dtype_cont( strd1_n = nonc.layout.stride[1] * sizeof(ctype); auto strd1_c = shp2 * sizeof(ctype); kern = [=]() { - auto cur_ctptr = ctptr; - auto ncptr_row = ncptr; + auto cur_ctptr = static_cast(cont.raw_ptr()); + auto ncptr_row = static_cast(nonc.raw_ptr()); for (size_t i = 0; i < shp0; ++i) { auto cur_ncptr = ncptr_row; for (size_t j = 0; j < shp1; ++j) { @@ -249,71 +247,73 @@ void RelayoutForwardImpl::exec( void RelayoutForwardImpl::exec_after_preprocess( const TensorND& src, const TensorND& dst, relayout::TransposeParam* transpose) { if (transpose) { - auto dsize = src.layout.dtype.size() * transpose->c; - void (*kptr)(size_t, size_t, size_t, size_t, void*, void*, size_t) = nullptr; - auto src_addr = reinterpret_cast(src.raw_ptr), - dst_addr = reinterpret_cast(dst.raw_ptr); - if (dsize == 1) { - megdnn_assert(transpose->c == 1); - kptr = call_transpose; - } else if (dsize == 2) { - transpose->c = 1; - if (!((src_addr | dst_addr) & (alignof(uint16_t) - 1))) { - kptr = call_transpose; - } else { - kptr = call_transpose>; - megdnn_log_error("unaligned addr in relayout"); - } - } else if (dsize == 3) { - transpose->c = 1; - kptr = call_transpose>; - } else if (dsize == 4) { - transpose->c = 1; - if (!((src_addr | dst_addr) & (alignof(uint32_t) - 1))) { - kptr = call_transpose; - } else { - kptr = call_transpose>; - megdnn_log_error("unaligned addr in relayout"); - } - } else if (dsize == 12) { - transpose->c = 1; - if (!((src_addr | dst_addr) & (alignof(uint32_t) - 1))) { - kptr = call_transpose>; - } else { - kptr = call_transpose>; - megdnn_log_error("unaligned addr in relayout"); - } - } else if (dsize <= TRANSPOSE_CV_MAX_C) { - switch (dst.layout.dtype.enumv()) { + auto kernel = [tparam = *transpose, src, dst]() { + auto t = tparam; + auto dsize = src.layout.dtype.size() * t.c; + void (*kptr)(size_t, size_t, size_t, size_t, void*, void*, size_t) = + nullptr; + auto src_addr = reinterpret_cast(src.raw_ptr()), + dst_addr = reinterpret_cast(dst.raw_ptr()); + if (dsize == 1) { + megdnn_assert(t.c == 1); + kptr = call_transpose; + } else if (dsize == 2) { + t.c = 1; + if (!((src_addr | dst_addr) & (alignof(uint16_t) - 1))) { + kptr = call_transpose; + } else { + kptr = call_transpose>; + megdnn_log_error("unaligned addr in relayout"); + } + } else if (dsize == 3) { + t.c = 1; + kptr = call_transpose>; + } else if (dsize == 4) { + t.c = 1; + if (!((src_addr | dst_addr) & (alignof(uint32_t) - 1))) { + kptr = call_transpose; + } 
else { + kptr = call_transpose>; + megdnn_log_error("unaligned addr in relayout"); + } + } else if (dsize == 12) { + t.c = 1; + if (!((src_addr | dst_addr) & (alignof(uint32_t) - 1))) { + kptr = call_transpose>; + } else { + kptr = call_transpose>; + megdnn_log_error("unaligned addr in relayout"); + } + } else if (dsize <= TRANSPOSE_CV_MAX_C) { + switch (dst.layout.dtype.enumv()) { #define cb(_dt) \ case DTypeTrait::enumv: \ kptr = transpose_cv::type>; \ break; - MEGDNN_FOREACH_DTYPE_NAME(cb) - MEGDNN_FOREACH_PARAMETERIZED_DTYPE(cb) + MEGDNN_FOREACH_DTYPE_NAME(cb) + MEGDNN_FOREACH_PARAMETERIZED_DTYPE(cb) #undef cb + } + megdnn_assert(kptr); } - megdnn_assert(kptr); - } - if (kptr) { - auto kern = [t = *transpose, sptr = src.raw_ptr, dptr = dst.raw_ptr, - kptr]() { + if (kptr) { + auto sptr = src.raw_ptr(); + auto dptr = dst.raw_ptr(); kptr(t.batch, t.m, t.n, t.c, sptr, dptr, t.stride_m); - }; - static_cast(handle())->dispatch_kern(kern); - return; - } else { - megdnn_assert(transpose->c != 1, "unsupported dtype size"); - } + return; + } else { + megdnn_assert(t.c != 1, "unsupported dtype size"); + } + }; + MEGDNN_DISPATCH_CPU_KERN_OPR(kernel()); } using relayout::is_contig; if (is_contig(dst.layout) && is_contig(src.layout)) { - auto sptr = src.raw_ptr, dptr = dst.raw_ptr; auto sz = src.layout.span().dist_byte(); - MEGDNN_DISPATCH_CPU_KERN_OPR(memcpy(dptr, sptr, sz)); + MEGDNN_DISPATCH_CPU_KERN_OPR(memcpy(dst.raw_ptr(), src.raw_ptr(), sz)); return; } diff --git a/dnn/src/fallback/repeat/opr_impl.cpp b/dnn/src/fallback/repeat/opr_impl.cpp index ec37251e..51913573 100644 --- a/dnn/src/fallback/repeat/opr_impl.cpp +++ b/dnn/src/fallback/repeat/opr_impl.cpp @@ -33,7 +33,7 @@ void RepeatImpl::exec( auto nr_reduces = count_not_ones_in_shape(times); if (nr_reduces == 0) { MEGDNN_DISPATCH_CPU_KERN_OPR(std::memcpy( - dst_.raw_ptr, src_.raw_ptr, sizeof(float) * dst.total_nr_elems())); + dst_.raw_ptr(), src_.raw_ptr(), sizeof(float) * dst.total_nr_elems())); return; } diff --git a/dnn/src/fallback/roi_copy/opr_impl.cpp b/dnn/src/fallback/roi_copy/opr_impl.cpp index 89b66272..76a1725c 100644 --- a/dnn/src/fallback/roi_copy/opr_impl.cpp +++ b/dnn/src/fallback/roi_copy/opr_impl.cpp @@ -13,6 +13,7 @@ #include "src/fallback/handle.h" #include "src/common/cv/common.h" +#include "src/common/opr_delegate.h" #include "src/common/utils.h" #include @@ -27,16 +28,23 @@ void ROICopyImpl::exec( OC = dst.layout.shape[3]; ptrdiff_t istride0 = src.layout.stride[0], istride1 = src.layout.stride[1], istride2 = src.layout.stride[2], istride3 = src.layout.stride[3]; - - TensorLayout relayout_src_layout( - {N, OH, OW, OC}, {istride0, istride1, istride2, istride3}, - src.layout.dtype); - TensorND relayout_src( - static_cast(src.raw_ptr) + - (param().row_from * istride1 + param().col_from * istride2) * - src.layout.dtype.size(), - relayout_src_layout); - static_cast(handle())->relayout_opr()->exec(relayout_src, dst); + auto row_from = param().row_from; + auto col_from = param().col_from; + + auto kern = [=]() { + TensorLayout relayout_src_layout( + {N, OH, OW, OC}, {istride0, istride1, istride2, istride3}, + src.layout.dtype); + TensorND relayout_src( + static_cast(src.raw_ptr()) + + (row_from * istride1 + col_from * istride2) * + src.layout.dtype.size(), + relayout_src_layout); + + auto relayout = inplace_cpu_handle(0)->create_operator(); + relayout->exec(relayout_src, dst); + }; + MEGDNN_DISPATCH_CPU_KERN_OPR(kern()); } } // namespace fallback diff --git a/dnn/src/fallback/rotate/opr_impl.cpp 
b/dnn/src/fallback/rotate/opr_impl.cpp index 8c7cc8e3..8c56b196 100644 --- a/dnn/src/fallback/rotate/opr_impl.cpp +++ b/dnn/src/fallback/rotate/opr_impl.cpp @@ -109,27 +109,31 @@ void RotateImpl::exec( _megdnn_tensor_in src, _megdnn_tensor_in dst, _megdnn_workspace workspace) { using namespace megcv; check_exec(src.layout, dst.layout, workspace.size); - - MEGDNN_DISPATCH_CPU_KERN_OPR( - if (dst.layout.dtype == dtype::Float32()) { - for (size_t i = 0; i < src.layout.shape[0]; ++i) { - Mat src_mat = TensorND2Mat(src, i); - Mat dst_mat = TensorND2Mat(dst, i); - rotate_intl::rotate(src_mat, dst_mat, param().clockwise); - } - } else if (dst.layout.dtype == dtype::Int32()) { - for (size_t i = 0; i < src.layout.shape[0]; ++i) { - Mat src_mat = TensorND2Mat(src, i); - Mat dst_mat = TensorND2Mat(dst, i); - rotate_intl::rotate(src_mat, dst_mat, param().clockwise); - } - } else if (dst.layout.dtype == dtype::Uint8()) { - for (size_t i = 0; i < src.layout.shape[0]; ++i) { - Mat src_mat = TensorND2Mat(src, i); - Mat dst_mat = TensorND2Mat(dst, i); - rotate_intl::rotate(src_mat, dst_mat, param().clockwise); - } - } else { megdnn_throw("Unsupported datatype of Rotate optr."); }); + auto clockwise = param().clockwise; + auto run = [src, dst, clockwise]() { + if (dst.layout.dtype == dtype::Float32()) { + for (size_t i = 0; i < src.layout.shape[0]; ++i) { + Mat src_mat = TensorND2Mat(src, i); + Mat dst_mat = TensorND2Mat(dst, i); + rotate_intl::rotate(src_mat, dst_mat, clockwise); + } + } else if (dst.layout.dtype == dtype::Int32()) { + for (size_t i = 0; i < src.layout.shape[0]; ++i) { + Mat src_mat = TensorND2Mat(src, i); + Mat dst_mat = TensorND2Mat(dst, i); + rotate_intl::rotate(src_mat, dst_mat, clockwise); + } + } else if (dst.layout.dtype == dtype::Uint8()) { + for (size_t i = 0; i < src.layout.shape[0]; ++i) { + Mat src_mat = TensorND2Mat(src, i); + Mat dst_mat = TensorND2Mat(dst, i); + rotate_intl::rotate(src_mat, dst_mat, clockwise); + } + } else { + megdnn_throw("Unsupported datatype of Rotate optr."); + } + }; + MEGDNN_DISPATCH_CPU_KERN_OPR(run()); } // vim: syntax=cpp.doxygen diff --git a/dnn/src/fallback/tile/opr_impl.cpp b/dnn/src/fallback/tile/opr_impl.cpp index ac09572f..7db9a177 100644 --- a/dnn/src/fallback/tile/opr_impl.cpp +++ b/dnn/src/fallback/tile/opr_impl.cpp @@ -33,7 +33,7 @@ void TileImpl::exec( auto nr_reduces = count_not_ones_in_shape(times); if (nr_reduces == 0) { MEGDNN_DISPATCH_CPU_KERN_OPR(std::memcpy( - dst_.raw_ptr, src_.raw_ptr, sizeof(float) * dst.total_nr_elems())); + dst_.raw_ptr(), src_.raw_ptr(), sizeof(float) * dst.total_nr_elems())); return; } diff --git a/dnn/src/fallback/type_cvt/opr_impl.cpp b/dnn/src/fallback/type_cvt/opr_impl.cpp index 5cae3fbf..090aeac3 100644 --- a/dnn/src/fallback/type_cvt/opr_impl.cpp +++ b/dnn/src/fallback/type_cvt/opr_impl.cpp @@ -51,7 +51,7 @@ struct TypeCvt { using sctype = typename DTypeTrait::ctype; auto n = src.layout.total_nr_elems(); const sctype* __restrict sptr = src.ptr(); - FLOAT16* __restrict dptr = static_cast(dst.raw_ptr); + FLOAT16* __restrict dptr = static_cast(dst.raw_ptr()); for (size_t i = 0; i < n; ++i) { dptr[i] = static_cast(sptr[i]); } @@ -63,7 +63,7 @@ struct TypeCvt { static void do_cvt(_megdnn_tensor_in src, _megdnn_tensor_out dst) { auto n = src.layout.total_nr_elems(); using dctype = typename DTypeTrait::ctype; - const FLOAT16* __restrict sptr = static_cast(src.raw_ptr); + const FLOAT16* __restrict sptr = static_cast(src.raw_ptr()); dctype* __restrict dptr = dst.ptr(); for (size_t i = 0; i < n; ++i) { dptr[i] 
= static_cast(sptr[i]); @@ -75,8 +75,8 @@ template <> struct TypeCvt { static void do_cvt(_megdnn_tensor_in src, _megdnn_tensor_out dst) { auto n = src.layout.total_nr_elems(); - const FLOAT16* __restrict sptr = static_cast(src.raw_ptr); - FLOAT16* __restrict dptr = static_cast(dst.raw_ptr); + const FLOAT16* __restrict sptr = static_cast(src.raw_ptr()); + FLOAT16* __restrict dptr = static_cast(dst.raw_ptr()); for (size_t i = 0; i < n; ++i) { dptr[i] = static_cast(sptr[i]); } @@ -92,7 +92,7 @@ void do_cvt_normal_s8(_megdnn_tensor_in src, _megdnn_tensor_out dst) { using sctype = typename DTypeTrait::ctype; auto n = src.layout.total_nr_elems(); const sctype* __restrict sptr = src.ptr(); - int8_t* __restrict dptr = static_cast(dst.raw_ptr); + int8_t* __restrict dptr = static_cast(dst.raw_ptr()); float scale = dst.layout.dtype.param().scale; float dscale = 1.f / scale; for (size_t i = 0; i < n; ++i) { @@ -105,7 +105,7 @@ void do_cvt_normal_s32(_megdnn_tensor_in src, _megdnn_tensor_out dst) { using sctype = typename DTypeTrait::ctype; auto n = src.layout.total_nr_elems(); const sctype* __restrict sptr = src.ptr(); - int32_t* __restrict dptr = static_cast(dst.raw_ptr); + int32_t* __restrict dptr = static_cast(dst.raw_ptr()); float scale = dst.layout.dtype.param().scale; float dscale = 1.f / scale; for (size_t i = 0; i < n; ++i) { @@ -121,7 +121,7 @@ void do_cvt_normal_asymm8(_megdnn_tensor_in src, _megdnn_tensor_out dst) { using sctype = typename DTypeTrait::ctype; auto n = src.layout.total_nr_elems(); const sctype* __restrict sptr = src.ptr(); - uint8_t* __restrict dptr = static_cast(dst.raw_ptr); + uint8_t* __restrict dptr = static_cast(dst.raw_ptr()); float scale = dst.layout.dtype.param().scale; uint8_t zp = dst.layout.dtype.param().zero_point; float dscale = 1.f / scale; @@ -134,7 +134,7 @@ template void do_cvt_s8_normal(_megdnn_tensor_in src, _megdnn_tensor_out dst) { using dctype = typename DTypeTrait::ctype; auto n = src.layout.total_nr_elems(); - const int8_t* __restrict sptr = static_cast(src.raw_ptr); + const int8_t* __restrict sptr = static_cast(src.raw_ptr()); dctype* __restrict dptr = dst.ptr(); float scale = src.layout.dtype.param().scale; for (size_t i = 0; i < n; ++i) { @@ -147,7 +147,7 @@ template void do_cvt_s32_normal(_megdnn_tensor_in src, _megdnn_tensor_out dst) { using dctype = typename DTypeTrait::ctype; auto n = src.layout.total_nr_elems(); - const int32_t* __restrict sptr = static_cast(src.raw_ptr); + const int32_t* __restrict sptr = static_cast(src.raw_ptr()); dctype* __restrict dptr = dst.ptr(); float scale = src.layout.dtype.param().scale; for (size_t i = 0; i < n; ++i) { @@ -160,7 +160,7 @@ template void do_cvt_asymm8_normal(_megdnn_tensor_in src, _megdnn_tensor_out dst) { using dctype = typename DTypeTrait::ctype; auto n = src.layout.total_nr_elems(); - const uint8_t* __restrict sptr = static_cast(src.raw_ptr); + const uint8_t* __restrict sptr = static_cast(src.raw_ptr()); dctype* __restrict dptr = dst.ptr(); float scale = src.layout.dtype.param().scale; uint8_t zp = src.layout.dtype.param().zero_point; @@ -172,8 +172,8 @@ void do_cvt_asymm8_normal(_megdnn_tensor_in src, _megdnn_tensor_out dst) { void do_cvt_s8_s8(_megdnn_tensor_in src, _megdnn_tensor_out dst) { auto n = src.layout.total_nr_elems(); - const int8_t* __restrict sptr = static_cast(src.raw_ptr); - int8_t* __restrict dptr = static_cast(dst.raw_ptr); + const int8_t* __restrict sptr = static_cast(src.raw_ptr()); + int8_t* __restrict dptr = static_cast(dst.raw_ptr()); float src_scale = 
src.layout.dtype.param().scale; float dst_scale = dst.layout.dtype.param().scale; float scale = src_scale / dst_scale; @@ -184,8 +184,8 @@ void do_cvt_s8_s8(_megdnn_tensor_in src, _megdnn_tensor_out dst) { void do_cvt_s32_s8(_megdnn_tensor_in src, _megdnn_tensor_out dst) { auto n = src.layout.total_nr_elems(); - const int32_t* __restrict sptr = static_cast(src.raw_ptr); - int8_t* __restrict dptr = static_cast(dst.raw_ptr); + const int32_t* __restrict sptr = static_cast(src.raw_ptr()); + int8_t* __restrict dptr = static_cast(dst.raw_ptr()); float src_scale = src.layout.dtype.param().scale; float dst_scale = dst.layout.dtype.param().scale; float scale = src_scale / dst_scale; @@ -196,8 +196,8 @@ void do_cvt_s32_s8(_megdnn_tensor_in src, _megdnn_tensor_out dst) { void do_cvt_asymm8_s8(_megdnn_tensor_in src, _megdnn_tensor_out dst) { auto n = src.layout.total_nr_elems(); - const uint8_t* __restrict sptr = static_cast(src.raw_ptr); - int8_t* __restrict dptr = static_cast(dst.raw_ptr); + const uint8_t* __restrict sptr = static_cast(src.raw_ptr()); + int8_t* __restrict dptr = static_cast(dst.raw_ptr()); float src_scale = src.layout.dtype.param().scale; uint8_t src_zp = src.layout.dtype.param().zero_point; float dst_scale = dst.layout.dtype.param().scale; @@ -210,8 +210,8 @@ void do_cvt_asymm8_s8(_megdnn_tensor_in src, _megdnn_tensor_out dst) { void do_cvt_s8_s32(_megdnn_tensor_in src, _megdnn_tensor_out dst) { auto n = src.layout.total_nr_elems(); - const int8_t* __restrict sptr = static_cast(src.raw_ptr); - int32_t* __restrict dptr = static_cast(dst.raw_ptr); + const int8_t* __restrict sptr = static_cast(src.raw_ptr()); + int32_t* __restrict dptr = static_cast(dst.raw_ptr()); float src_scale = src.layout.dtype.param().scale; float dst_scale = dst.layout.dtype.param().scale; float scale = src_scale / dst_scale; @@ -225,8 +225,8 @@ void do_cvt_s8_s32(_megdnn_tensor_in src, _megdnn_tensor_out dst) { void do_cvt_s32_s32(_megdnn_tensor_in src, _megdnn_tensor_out dst) { auto n = src.layout.total_nr_elems(); - const int32_t* __restrict sptr = static_cast(src.raw_ptr); - int32_t* __restrict dptr = static_cast(dst.raw_ptr); + const int32_t* __restrict sptr = static_cast(src.raw_ptr()); + int32_t* __restrict dptr = static_cast(dst.raw_ptr()); float src_scale = src.layout.dtype.param().scale; float dst_scale = dst.layout.dtype.param().scale; float scale = src_scale / dst_scale; @@ -240,8 +240,8 @@ void do_cvt_s32_s32(_megdnn_tensor_in src, _megdnn_tensor_out dst) { void do_cvt_asymm8_s32(_megdnn_tensor_in src, _megdnn_tensor_out dst) { auto n = src.layout.total_nr_elems(); - const uint8_t* __restrict sptr = static_cast(src.raw_ptr); - int32_t* __restrict dptr = static_cast(dst.raw_ptr); + const uint8_t* __restrict sptr = static_cast(src.raw_ptr()); + int32_t* __restrict dptr = static_cast(dst.raw_ptr()); float src_scale = src.layout.dtype.param().scale; uint8_t src_zp = src.layout.dtype.param().zero_point; float dst_scale = dst.layout.dtype.param().scale; @@ -256,8 +256,8 @@ void do_cvt_asymm8_s32(_megdnn_tensor_in src, _megdnn_tensor_out dst) { void do_cvt_s8_asymm8(_megdnn_tensor_in src, _megdnn_tensor_out dst) { auto n = src.layout.total_nr_elems(); - const int8_t* __restrict sptr = static_cast(src.raw_ptr); - uint8_t* __restrict dptr = static_cast(dst.raw_ptr); + const int8_t* __restrict sptr = static_cast(src.raw_ptr()); + uint8_t* __restrict dptr = static_cast(dst.raw_ptr()); float src_scale = src.layout.dtype.param().scale; float dst_scale = dst.layout.dtype.param().scale; uint8_t dst_zp = 
dst.layout.dtype.param().zero_point; @@ -270,8 +270,8 @@ void do_cvt_s8_asymm8(_megdnn_tensor_in src, _megdnn_tensor_out dst) { void do_cvt_s32_asymm8(_megdnn_tensor_in src, _megdnn_tensor_out dst) { auto n = src.layout.total_nr_elems(); - const int32_t* __restrict sptr = static_cast(src.raw_ptr); - uint8_t* __restrict dptr = static_cast(dst.raw_ptr); + const int32_t* __restrict sptr = static_cast(src.raw_ptr()); + uint8_t* __restrict dptr = static_cast(dst.raw_ptr()); float src_scale = src.layout.dtype.param().scale; float dst_scale = dst.layout.dtype.param().scale; uint8_t dst_zp = dst.layout.dtype.param().zero_point; @@ -284,8 +284,8 @@ void do_cvt_s32_asymm8(_megdnn_tensor_in src, _megdnn_tensor_out dst) { void do_cvt_asymm8_asymm8(_megdnn_tensor_in src, _megdnn_tensor_out dst) { auto n = src.layout.total_nr_elems(); - const uint8_t* __restrict sptr = static_cast(src.raw_ptr); - int8_t* __restrict dptr = static_cast(dst.raw_ptr); + const uint8_t* __restrict sptr = static_cast(src.raw_ptr()); + int8_t* __restrict dptr = static_cast(dst.raw_ptr()); float src_scale = src.layout.dtype.param().scale; uint8_t src_zp = src.layout.dtype.param().zero_point; float dst_scale = dst.layout.dtype.param().scale; diff --git a/dnn/src/fallback/warp_perspective/opr_impl.cpp b/dnn/src/fallback/warp_perspective/opr_impl.cpp index 25b896b9..d0651970 100644 --- a/dnn/src/fallback/warp_perspective/opr_impl.cpp +++ b/dnn/src/fallback/warp_perspective/opr_impl.cpp @@ -121,18 +121,25 @@ void WarpPerspectiveImpl::kern_fallback(const KernParam& kern_para KernParam sub_param = kern_param; sub_param.n_src = 1; sub_param.n_mat = 1; - sub_param.midx_ptr = nullptr; + sub_param.midx_ptr = RefPtr(); + sub_param.src_ptr = RefPtr(kern_param.src_ptr.get_ptr()); + sub_param.mat_ptr = RefPtr(kern_param.mat_ptr.get_ptr()); + sub_param.dst_ptr = RefPtr(kern_param.dst_ptr.get_ptr()); rep(n, N_MAT) { if (midx_ptr) { size_t idx = midx_ptr[n]; megdnn_assert( idx < N_SRC, "mat_idx out of bound: mat_idx[%zu]=%zu src_batch=%zu", n, idx, N_SRC); - sub_param.sptr = kern_param.sptr + idx * (C * IH * IW); + sub_param.src_ptr.reset( + static_cast(kern_param.src_ptr.get_ptr()) + + idx * (C * IH * IW)); } else if (n) { - sub_param.sptr += C * IH * IW; + sub_param.src_ptr.reset( + static_cast(kern_param.src_ptr.get_ptr()) + + n * C * IH * IW); } - if (is_resize_optimizable(sub_param.mptr)) { + if (is_resize_optimizable(static_cast(sub_param.mat_ptr.get_ptr()))) { if (bmode == BorderMode::CONSTANT) { MIDOUT_BEGIN( megdnn_fallback_warpperspective, midout_iv(1), midout_iv(true), @@ -154,8 +161,8 @@ void WarpPerspectiveImpl::kern_fallback(const KernParam& kern_para } MIDOUT_END(); } - sub_param.mptr += 3 * 3; - sub_param.dptr += C * OH * OW; + sub_param.mat_ptr += 3 * 3 * sizeof(mtype); + sub_param.dst_ptr += C * OH * OW * sizeof(ctype); } } diff --git a/dnn/src/naive/add_update/opr_impl.cpp b/dnn/src/naive/add_update/opr_impl.cpp index b96f20d5..3153c871 100644 --- a/dnn/src/naive/add_update/opr_impl.cpp +++ b/dnn/src/naive/add_update/opr_impl.cpp @@ -20,7 +20,7 @@ namespace { using namespace megdnn; template -void forward(const ElemwiseOpParamN<2> src, const AddUpdate::Param& param) { +void forward(const ElemwiseOpParamN<2>& src, const AddUpdate::Param& param) { T alpha(param.alpha), beta(param.beta), bias(param.bias); auto iter0 = tensor_iter_valonly(src[0]).begin(); @@ -40,11 +40,12 @@ namespace naive { void AddUpdateForwardImpl::exec(_megdnn_tensor_inout dest, _megdnn_tensor_in delta) { check_exec(dest.layout, delta.layout); 
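The elemwise and powc hunks earlier in this diff, and the argsort macros just below, are the pointer-side counterpart of the param-snapshot rule: tensors are captured by value and raw_ptr() / ptr<T>() is evaluated inside the deferred kernel rather than hoisted at dispatch time, so the kernel reads whatever address is valid when it actually runs. A sketch of that shape, with TensorViewSketch and dispatch_kern as illustrative stand-ins (in the real TensorND the indirection comes from the RefPtr it holds):

#include <cstddef>
#include <functional>

struct TensorViewSketch {
    void** storage_slot = nullptr;  // indirection: re-binding the slot stays visible
    size_t nr_elems = 0;
    void* raw_ptr() const { return *storage_slot; }
};

template <typename ctype>
void dispatch_unary_copy(
        const TensorViewSketch& src, const TensorViewSketch& dst,
        const std::function<void(std::function<void()>)>& dispatch_kern) {
    // capture the views by value; do not hoist src.raw_ptr() out here
    dispatch_kern([src, dst]() {
        auto* s = static_cast<const ctype*>(src.raw_ptr());
        auto* d = static_cast<ctype*>(dst.raw_ptr());
        for (size_t i = 0; i < src.nr_elems; ++i)
            d[i] = s[i];
    });
}

Hoisting the pointers before the dispatch call, the pattern these hunks remove, would bake in whatever address the tensor happened to have at dispatch time.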
ElemwiseOpParamN<2> src = make_param(dest, delta); -#define cb(DType) \ - if (dest.layout.dtype == DType()) { \ - using ctype = typename DTypeTrait::ctype; \ - MEGDNN_DISPATCH_CPU_KERN_OPR(forward(src, m_param)); \ - return; \ + auto param = m_param; +#define cb(DType) \ + if (dest.layout.dtype == DType()) { \ + using ctype = typename DTypeTrait::ctype; \ + MEGDNN_DISPATCH_CPU_KERN_OPR(forward(src, param)); \ + return; \ } MEGDNN_FOREACH_COMPUTING_DTYPE(cb) #undef cb diff --git a/dnn/src/naive/argsort/opr_impl.cpp b/dnn/src/naive/argsort/opr_impl.cpp index 6e6155c5..b1e0df0c 100644 --- a/dnn/src/naive/argsort/opr_impl.cpp +++ b/dnn/src/naive/argsort/opr_impl.cpp @@ -65,16 +65,14 @@ void ArgsortForwardImpl::exec( _megdnn_workspace workspace) { check_exec(src.layout, dst.layout, indices.layout, workspace.size); auto M = src.layout.shape[0], N = src.layout.shape[1]; - auto iptr = indices.ptr(); switch (src.layout.dtype.enumv()) { -#define cb(dt) \ - case DTypeTrait
<dt>::enumv: { \ - using ctype = DTypeTrait
<dt>::ctype; \ - auto sptr = src.ptr<ctype>(); \ - auto dptr = dst.ptr<ctype>(); \ - MEGDNN_DISPATCH_CPU_KERN_OPR(forward_impl( \ - M, N, sptr, dptr, iptr, param().order == Order::ASCENDING)); \ - return; \ +#define cb(dt) \ + case DTypeTrait
<dt>::enumv: { \ + using ctype = DTypeTrait
<dt>::ctype; \ + MEGDNN_DISPATCH_CPU_KERN_OPR(forward_impl( \ + M, N, src.ptr<ctype>(), dst.ptr<ctype>(), indices.ptr<dt_int32>(), \ + param().order == Order::ASCENDING)); \ + return; \ } MEGDNN_FOREACH_COMPUTING_DTYPE(cb) #undef cb @@ -89,15 +87,14 @@ void ArgsortBackwardImpl::exec( check_exec(diff.layout, indices.layout, grad.layout, workspace.size); size_t M = grad.layout.shape[0], N = grad.layout.shape[1], SRC_W = indices.layout[1]; - auto iptr = indices.ptr<dt_int32>(); switch (diff.layout.dtype.enumv()) { -#define cb(dt) \ - case DTypeTrait
<dt>::enumv: { \ - using ctype = DTypeTrait
<dt>::ctype; \ - auto hptr = diff.ptr<ctype>(); \ - auto gptr = grad.ptr<ctype>(); \ - MEGDNN_DISPATCH_CPU_KERN_OPR(backward_impl(M, N, SRC_W, gptr, hptr, iptr)); \ - return; \ +#define cb(dt) \ + case DTypeTrait
<dt>::enumv: { \ + using ctype = DTypeTrait<dt>
::ctype; \ + MEGDNN_DISPATCH_CPU_KERN_OPR(backward_impl( \ + M, N, SRC_W, grad.ptr(), diff.ptr(), \ + indices.ptr())); \ + return; \ } MEGDNN_FOREACH_COMPUTING_DTYPE(cb) #undef cb diff --git a/dnn/src/naive/batch_normalization/opr_impl.cpp b/dnn/src/naive/batch_normalization/opr_impl.cpp index d503342a..f84031b6 100644 --- a/dnn/src/naive/batch_normalization/opr_impl.cpp +++ b/dnn/src/naive/batch_normalization/opr_impl.cpp @@ -77,8 +77,7 @@ void bn_forward_exec( if (param.fwd_mode == param::BN::FwdMode::TRAINING) { // Calculate the means of this batch (Mu) - memset(batch_mean.raw_ptr, 0, - batch_mean.layout.total_nr_elems() * sizeof(float)); + memset(batch_mean_p, 0, batch_mean.layout.total_nr_elems() * sizeof(float)); rep_4d(src_shape, dim_offset) batch_mean_p[param_pos] += src_p[src_pos]; rep_4d_end @@ -91,7 +90,7 @@ void bn_forward_exec( } // Calculate the variances of this batch (Sigma) - memset(batch_inv_variance.raw_ptr, 0, + memset(batch_inv_variance_p, 0, batch_inv_variance.layout.total_nr_elems() * sizeof(float)); rep_4d(src_shape, dim_offset) sigma_p = src_p[src_pos] - batch_mean_p[param_pos]; diff --git a/dnn/src/naive/batched_matrix_mul/opr_impl.cpp b/dnn/src/naive/batched_matrix_mul/opr_impl.cpp index 72eadfa8..597154ab 100644 --- a/dnn/src/naive/batched_matrix_mul/opr_impl.cpp +++ b/dnn/src/naive/batched_matrix_mul/opr_impl.cpp @@ -34,35 +34,37 @@ void BatchedMatrixMulForwardImpl::exec( m_opr->param() = this->param(); auto N = A.layout.shape[0]; - TensorND A_, B_, C_; - A_.raw_ptr = A.raw_ptr; - A_.layout = A.layout.remove_axis(0); - B_.raw_ptr = B.raw_ptr; - B_.layout = B.layout.remove_axis(0); - C_.raw_ptr = C.raw_ptr; - C_.layout = C.layout.remove_axis(0); auto Astrd = A.layout.dtype.size() * A.layout.stride[0], Bstrd = B.layout.dtype.size() * B.layout.stride[0], Cstrd = C.layout.dtype.size() * C.layout.stride[0]; - auto advance_ptr = [](TensorND& dest, ptrdiff_t d) { - dest.raw_ptr = static_cast(static_cast(dest.raw_ptr) + d); - }; + auto Aref = A.get_ref_ptr(); + auto Bref = B.get_ref_ptr(); + auto Cref = C.get_ref_ptr(); rep(n, N) { + //! 
all tensors should share the same RefPtr + auto A_ref = Aref; + A_ref += n * Astrd; + auto B_ref = Bref; + B_ref += n * Bstrd; + auto C_ref = Cref; + C_ref += n * Cstrd; + TensorND A_{A.layout.remove_axis(0), A_ref}; + TensorND B_{B.layout.remove_axis(0), B_ref}; + TensorND C_{C.layout.remove_axis(0), C_ref}; m_opr->exec(A_, B_, C_, workspace); - advance_ptr(A_, Astrd); - advance_ptr(B_, Bstrd); - advance_ptr(C_, Cstrd); } } + std::vector BatchedMatrixMulForwardImpl:: get_all_algorithms( const TensorLayout& /*A*/, const TensorLayout& /*B*/, const TensorLayout& /*C*/) { return {static_cast(handle())->default_batched_matmul_fwd_algo()}; } + std::vector BatchedMatrixMulForwardImpl:: get_all_algorithms_safe( const TensorLayout& /*A*/, const TensorLayout& /*B*/, diff --git a/dnn/src/naive/checksum/opr_impl.cpp b/dnn/src/naive/checksum/opr_impl.cpp index 6dcb9ffa..bba801ed 100644 --- a/dnn/src/naive/checksum/opr_impl.cpp +++ b/dnn/src/naive/checksum/opr_impl.cpp @@ -30,13 +30,13 @@ ChecksumForward::Result ChecksumForwardImpl::exec( Result result; bool finished = false; auto run = [&]() { - auto ptr = static_cast(data.raw_ptr); + auto ptr = static_cast(data.raw_ptr()); size_t size_all = data.layout.shape[0], size_ints = size_all / sizeof(uint32_t); result.last_val.iv = 0; auto last_val_size = std::min(size_all, 4); memcpy(&result.last_val, ptr + size_all - last_val_size, last_val_size); result.checksum = 0; - auto iptr = static_cast(data.raw_ptr); + auto iptr = static_cast(data.raw_ptr()); for (size_t i = 0; i < size_ints; ++i) result.checksum += iptr[i] * (i + 1); diff --git a/dnn/src/naive/concat/concat.cpp b/dnn/src/naive/concat/opr_impl.cpp similarity index 98% rename from dnn/src/naive/concat/concat.cpp rename to dnn/src/naive/concat/opr_impl.cpp index 3c170bd9..468d9ec5 100644 --- a/dnn/src/naive/concat/concat.cpp +++ b/dnn/src/naive/concat/opr_impl.cpp @@ -1,5 +1,5 @@ /** - * \file dnn/src/naive/concat/concat.cpp + * \file dnn/src/naive/concat/opr_impl.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
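One note on the batched_matrix_mul hunk above: each iteration now copies the tensor's RefPtr and advances the copy by n whole-batch strides in bytes, instead of mutating a single TensorND in place, so offsets cannot accumulate across iterations. The byte arithmetic it relies on reduces to the helper below (an illustrative function, not a MegDNN API):

#include <cstddef>

// Byte offset of batch n for a tensor whose outermost axis is the batch axis.
// This mirrors the Astrd/Bstrd/Cstrd computation above: stride[0] is counted in
// elements, so it is scaled by the dtype size to get bytes.
inline size_t batch_byte_offset(size_t n, size_t dtype_size, size_t batch_stride_elems) {
    return n * dtype_size * batch_stride_elems;
}

// Fresh copies per iteration keep the offsets independent, roughly:
//   for (size_t n = 0; n < N; ++n) {
//       auto ref = base_ref;                              // copy of the tensor's RefPtr
//       ref += batch_byte_offset(n, dtype_size, stride0);
//       // build the per-batch TensorND from {layout.remove_axis(0), ref}
//   }
// For a contiguous float32 tensor of shape (N, M, K), batch n starts
// n * M * K * 4 bytes past the base address.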
diff --git a/dnn/src/naive/cond_take/opr_impl.cpp b/dnn/src/naive/cond_take/opr_impl.cpp index e12c2f82..db67fd0d 100644 --- a/dnn/src/naive/cond_take/opr_impl.cpp +++ b/dnn/src/naive/cond_take/opr_impl.cpp @@ -24,9 +24,10 @@ namespace { template void gen_index( - size_t sz, dt_int32* dest, const ctype* inp, + size_t sz, dt_int32* dest, const TensorND& mask, cond_take::Pred pred) { int didx = 0; + auto inp = mask.ptr(); for (size_t i = 0; i < sz; ++i) { if (pred(inp[i])) { dest[didx++] = i; @@ -59,11 +60,11 @@ CondTakeImpl::Output CondTakeImpl::exec( auto idx_tmp = workspace.ptr(); switch (mask.layout.dtype.enumv()) { -#define cb(_dt) \ - case DTypeTrait<_dt>::enumv: { \ - using ctype = DTypeTrait<_dt>::ctype; \ - dispatch_genidx(size, idx_tmp, mask.ptr()); \ - break; \ +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: { \ + using ctype = DTypeTrait<_dt>::ctype; \ + dispatch_genidx(size, idx_tmp, mask); \ + break; \ } MEGDNN_FOREACH_COMPUTING_DTYPE(cb) cb(::megdnn::dtype::Bool) @@ -75,17 +76,15 @@ CondTakeImpl::Output CondTakeImpl::exec( size_t out_size = idx_tmp[size]; auto out_data = malloc_policy.alloc_output(0, data.layout.dtype, {out_size}); auto out_idx = malloc_policy.alloc_output(1, dtype::Int32(), {out_size}); - auto out_idx_ptr = out_idx.ptr(); switch (data.layout.dtype.enumv()) { -#define cb(_dt) \ - case DTypeTrait<_dt>::enumv: { \ - using ctype = DTypeTrait<_dt>::ctype; \ - auto out_data_ptr = out_data.ptr(); \ - auto data_ptr = data.ptr(); \ - MEGDNN_DISPATCH_CPU_KERN_OPR(copy_data( \ - out_size, out_idx_ptr, out_data_ptr, idx_tmp, data_ptr)); \ - break; \ +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: { \ + using ctype = DTypeTrait<_dt>::ctype; \ + MEGDNN_DISPATCH_CPU_KERN_OPR(copy_data( \ + out_size, out_idx.ptr(), out_data.ptr(), idx_tmp, \ + data.ptr())); \ + break; \ } MEGDNN_FOREACH_COMPUTING_DTYPE(cb) cb(::megdnn::dtype::Bool) @@ -97,14 +96,14 @@ CondTakeImpl::Output CondTakeImpl::exec( } template -void CondTakeImpl::dispatch_genidx(size_t size, dt_int32* dest, const ctype* inp) { +void CondTakeImpl::dispatch_genidx(size_t size, dt_int32* dest, const TensorND& mask) { KParam kparam(m_param); switch (m_param.mode) { -#define cb(_m) \ - case Param::Mode::_m: { \ - Pred pred(kparam); \ - MEGDNN_DISPATCH_CPU_KERN_OPR(gen_index(size, dest, inp, pred)); \ - return; \ +#define cb(_m) \ + case Param::Mode::_m: { \ + Pred pred(kparam); \ + MEGDNN_DISPATCH_CPU_KERN_OPR(gen_index(size, dest, mask, pred)); \ + return; \ } MEGDNN_FOREACH_COND_TAKE_MODE(cb) #undef cb diff --git a/dnn/src/naive/cond_take/opr_impl.h b/dnn/src/naive/cond_take/opr_impl.h index b31c6e74..937be1ab 100644 --- a/dnn/src/naive/cond_take/opr_impl.h +++ b/dnn/src/naive/cond_take/opr_impl.h @@ -16,7 +16,7 @@ namespace naive { class CondTakeImpl : public CondTake { template - void dispatch_genidx(size_t size, dt_int32* dest, const ctype* inp); + void dispatch_genidx(size_t size, dt_int32* dest, const TensorND& mask); public: using CondTake::CondTake; diff --git a/dnn/src/naive/conv_bias/opr_impl.cpp b/dnn/src/naive/conv_bias/opr_impl.cpp index dd189620..25050c8f 100644 --- a/dnn/src/naive/conv_bias/opr_impl.cpp +++ b/dnn/src/naive/conv_bias/opr_impl.cpp @@ -92,13 +92,13 @@ void handle_z_inp_and_activation_naive( auto nonlinear = handle->create_operator(); nonlinear->param().mode = Elemwise::Param::Mode::SIGMOID; nonlinear->exec({res}, res); - if (res.raw_ptr != dst_tensor.raw_ptr) { + if (res.raw_ptr() != dst_tensor.raw_ptr()) { handle->create_operator()->exec(res, dst_tensor); } break; } case 
NonlineMode::IDENTITY: { - if (res.raw_ptr != dst_tensor.raw_ptr) { + if (res.raw_ptr() != dst_tensor.raw_ptr()) { handle->create_operator()->exec(res, dst_tensor); } break; diff --git a/dnn/src/naive/convolution/helper.h b/dnn/src/naive/convolution/helper.h index c190e5cb..6693110f 100644 --- a/dnn/src/naive/convolution/helper.h +++ b/dnn/src/naive/convolution/helper.h @@ -672,7 +672,7 @@ template void backward_data( _megdnn_tensor_in filter, _megdnn_tensor_in diff, _megdnn_tensor_out grad, const Convolution::CanonizedFilterMeta& filter_meta) { - memset(grad.raw_ptr, 0, grad.layout.span().dist_byte()); + memset(grad.raw_ptr(), 0, grad.layout.span().dist_byte()); megdnn_assert(filter_meta.spatial_ndim == 2); if (filter_meta.format == param::Convolution::Format::NHWCD4) { return compute2d_hwcd4( @@ -686,7 +686,7 @@ template void backward_filter( _megdnn_tensor_in src, _megdnn_tensor_in diff, _megdnn_tensor_out grad, const Convolution::CanonizedFilterMeta& filter_meta) { - memset(grad.raw_ptr, 0, grad.layout.span().dist_byte()); + memset(grad.raw_ptr(), 0, grad.layout.span().dist_byte()); megdnn_assert(filter_meta.spatial_ndim == 2); compute2d( src, grad.compatible_ptr(), diff, filter_meta); diff --git a/dnn/src/naive/convolution/convolution.cpp b/dnn/src/naive/convolution/opr_impl.cpp similarity index 95% rename from dnn/src/naive/convolution/convolution.cpp rename to dnn/src/naive/convolution/opr_impl.cpp index 05ea8992..97efdb67 100644 --- a/dnn/src/naive/convolution/convolution.cpp +++ b/dnn/src/naive/convolution/opr_impl.cpp @@ -1,5 +1,5 @@ /** - * \file dnn/src/naive/convolution/convolution.cpp + * \file dnn/src/naive/convolution/opr_impl.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -8,8 +8,8 @@ * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
*/ -#include "./helper.h" #include "./opr_impl.h" +#include "./helper.h" #include "megdnn/dtype.h" #include "megdnn/heuristic_cache.h" @@ -125,10 +125,8 @@ void ConvolutionBackwardDataImpl::exec( #undef cb #if !MEGDNN_DISABLE_FLOAT16 if (filter.layout.dtype == dtype::Float16() && cmode == ComputeMode::FLOAT32) { - TensorND grad_fp32; - grad_fp32.layout = grad.layout; - grad_fp32.layout.dtype = dtype::Float32(); - grad_fp32.raw_ptr = workspace.raw_ptr; + TensorND grad_fp32{ + workspace.raw_ptr, TensorLayout{grad.layout, dtype::Float32()}}; auto&& type_cvt = handle()->create_operator(); type_cvt->exec(grad, grad_fp32); MEGDNN_DISPATCH_CPU_KERN_OPR( @@ -138,10 +136,8 @@ void ConvolutionBackwardDataImpl::exec( return; } if (filter.layout.dtype == dtype::BFloat16() && cmode == ComputeMode::FLOAT32) { - TensorND grad_fp32; - grad_fp32.layout = grad.layout; - grad_fp32.layout.dtype = dtype::Float32(); - grad_fp32.raw_ptr = workspace.raw_ptr; + TensorND grad_fp32{ + workspace.raw_ptr, TensorLayout{grad.layout, dtype::Float32()}}; auto&& type_cvt = handle()->create_operator(); type_cvt->exec(grad, grad_fp32); MEGDNN_DISPATCH_CPU_KERN_OPR( @@ -235,10 +231,8 @@ void ConvolutionBackwardFilterImpl::exec( #undef cb #if !MEGDNN_DISABLE_FLOAT16 if (src.layout.dtype == dtype::Float16() && cmode == ComputeMode::FLOAT32) { - TensorND grad_fp32; - grad_fp32.layout = grad.layout; - grad_fp32.layout.dtype = dtype::Float32(); - grad_fp32.raw_ptr = workspace.raw_ptr; + TensorND grad_fp32{ + workspace.raw_ptr, TensorLayout{grad.layout, dtype::Float32()}}; auto&& type_cvt = handle()->create_operator(); type_cvt->exec(grad, grad_fp32); MEGDNN_DISPATCH_CPU_KERN_OPR( @@ -248,10 +242,8 @@ void ConvolutionBackwardFilterImpl::exec( return; } if (src.layout.dtype == dtype::BFloat16() && cmode == ComputeMode::FLOAT32) { - TensorND grad_fp32; - grad_fp32.layout = grad.layout; - grad_fp32.layout.dtype = dtype::Float32(); - grad_fp32.raw_ptr = workspace.raw_ptr; + TensorND grad_fp32{ + workspace.raw_ptr, TensorLayout{grad.layout, dtype::Float32()}}; auto&& type_cvt = handle()->create_operator(); type_cvt->exec(grad, grad_fp32); MEGDNN_DISPATCH_CPU_KERN_OPR( diff --git a/dnn/src/naive/convolution3d/helper.h b/dnn/src/naive/convolution3d/helper.h index b770ae3e..63b43c96 100644 --- a/dnn/src/naive/convolution3d/helper.h +++ b/dnn/src/naive/convolution3d/helper.h @@ -192,7 +192,7 @@ template void backward_data( _megdnn_tensor_in filter, _megdnn_tensor_in diff, _megdnn_tensor_out grad, const Convolution3D::CanonizedFilterMeta& filter_meta) { - memset(grad.raw_ptr, 0, grad.layout.span().dist_byte()); + memset(grad.raw_ptr(), 0, grad.layout.span().dist_byte()); megdnn_assert(filter_meta.spatial_ndim == 3); compute3d( grad, filter.ptr(), diff, filter_meta); @@ -202,7 +202,7 @@ template void backward_filter( _megdnn_tensor_in src, _megdnn_tensor_in diff, _megdnn_tensor_out grad, const Convolution3D::CanonizedFilterMeta& filter_meta) { - memset(grad.raw_ptr, 0, grad.layout.span().dist_byte()); + memset(grad.raw_ptr(), 0, grad.layout.span().dist_byte()); megdnn_assert(filter_meta.spatial_ndim == 3); compute3d( src, grad.ptr(), diff, filter_meta); diff --git a/dnn/src/naive/convolution3d/convolution3d.cpp b/dnn/src/naive/convolution3d/opr_impl.cpp similarity index 99% rename from dnn/src/naive/convolution3d/convolution3d.cpp rename to dnn/src/naive/convolution3d/opr_impl.cpp index af1b104f..5b4c84db 100644 --- a/dnn/src/naive/convolution3d/convolution3d.cpp +++ b/dnn/src/naive/convolution3d/opr_impl.cpp @@ -1,5 +1,5 @@ /** - * \file 
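The ConvolutionBackwardData/Filter hunks above replace field-by-field mutation of a temporary TensorND with a single construction from the workspace pointer and a dtype-overridden layout. A sketch of the idiom as I read it; the `TypeCvt` template argument and the narrowing conversion after the kern are not visible in the hunk and are assumptions.

```cpp
// Sketch: stage the gradient as float32 in workspace memory.
TensorND grad_fp32{
        workspace.raw_ptr, TensorLayout{grad.layout, dtype::Float32()}};
auto&& type_cvt = handle()->create_operator<TypeCvt>();
type_cvt->exec(grad, grad_fp32);      // widen current contents into the staging copy
// ... the float32 backward kernel is dispatched against grad_fp32 ...
type_cvt->exec(grad_fp32, grad);      // assumption: result narrowed back afterwards
```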
dnn/src/naive/convolution3d/convolution3d.cpp + * \file dnn/src/naive/convolution3d/opr_impl.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -9,8 +9,8 @@ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. */ -#include "./helper.h" #include "./opr_impl.h" +#include "./helper.h" #include "megdnn/dtype.h" #include "src/common/utils.h" diff --git a/dnn/src/naive/convpooling/conv_pooling.cpp b/dnn/src/naive/convpooling/opr_impl.cpp similarity index 97% rename from dnn/src/naive/convpooling/conv_pooling.cpp rename to dnn/src/naive/convpooling/opr_impl.cpp index ec9b70a3..15bfb6ad 100644 --- a/dnn/src/naive/convpooling/conv_pooling.cpp +++ b/dnn/src/naive/convpooling/opr_impl.cpp @@ -1,5 +1,5 @@ /** - * \file dnn/src/naive/convpooling/conv_pooling.cpp + * \file dnn/src/naive/convpooling/opr_impl.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -85,7 +85,7 @@ void ConvPoolingForwardImpl::exec( const _megdnn_in TensorND bias, _megdnn_out TensorND dst, _megdnn_out Workspace workspace) { Workspace empty_wsp; - TensorND conv_dst((float*)(workspace.raw_ptr), conv_dst_layout); + TensorND conv_dst{workspace.raw_ptr, conv_dst_layout}; // convFwd->check_layout(src.layout, filter.layout, workspace.layout, // empty_wsp.layout); check_layout(src.layout, filter.layout, bias.layout, dst.layout, workspace.size); diff --git a/dnn/src/naive/cumsum/opr_impl.cpp b/dnn/src/naive/cumsum/opr_impl.cpp index 16329314..bec628f6 100644 --- a/dnn/src/naive/cumsum/opr_impl.cpp +++ b/dnn/src/naive/cumsum/opr_impl.cpp @@ -63,13 +63,13 @@ void CumsumForwardImpl::exec( size_t A, B, C; reduce::get_ABC(src.layout, A, B, C, param().axis); -#define cb(DType) \ - if (src.layout.dtype == DType()) { \ - using ctype = DTypeTrait::ctype; \ - ctype *sptr = src.ptr(), *dptr = dst.ptr(); \ - MEGDNN_DISPATCH_CPU_KERN_OPR(exec_internal( \ - sptr, dptr, A, B, C, param().exclusive, param().reverse)); \ - return; \ +#define cb(DType) \ + if (src.layout.dtype == DType()) { \ + using ctype = DTypeTrait::ctype; \ + MEGDNN_DISPATCH_CPU_KERN_OPR(exec_internal( \ + src.ptr(), dst.ptr(), A, B, C, param().exclusive, \ + param().reverse)); \ + return; \ } MEGDNN_FOREACH_COMPUTING_DTYPE(cb) megdnn_assert_internal(0); diff --git a/dnn/src/naive/dct/opr_impl.cpp b/dnn/src/naive/dct/opr_impl.cpp index 60110413..962ef5ab 100644 --- a/dnn/src/naive/dct/opr_impl.cpp +++ b/dnn/src/naive/dct/opr_impl.cpp @@ -206,7 +206,7 @@ void DctChannelSelectForwardImpl::exec( int ic = src.layout.shape[1]; int ih = src.layout.shape[2]; int iw = src.layout.shape[3]; - megdnn_assert(dst.raw_ptr, "dst can not be nullptr"); + megdnn_assert(dst.raw_ptr(), "dst can not be nullptr"); const int block = param().dct_block_size; auto mask = mask_offset_to_2dmask(mask_offset, mask_val); if (dst.layout.dtype.enumv() == DTypeEnum::Float32) { @@ -226,7 +226,7 @@ void DctChannelSelectForwardImpl::exec( param().format == Param::Format::NCHW4, "dst must be nchw4"); MEGDNN_DISPATCH_CPU_KERN_OPR(naive_dct( - src.ptr(), static_cast(dst.raw_ptr), in, ic, ih, + src.ptr(), static_cast(dst.raw_ptr()), in, ic, ih, iw, block, mask, dst.layout.dtype)); } } diff --git a/dnn/src/naive/deformable_conv/opr_impl.cpp b/dnn/src/naive/deformable_conv/opr_impl.cpp index c76884d7..fa48979a 100644 --- a/dnn/src/naive/deformable_conv/opr_impl.cpp +++ 
b/dnn/src/naive/deformable_conv/opr_impl.cpp @@ -127,15 +127,10 @@ void Fwd::exec( FW = filter_meta.spatial[1], OC = filter_meta.group * filter_meta.ocpg, OH = out.layout[2], OW = out.layout[3]; - const float* __restrict im_ptr = im.ptr(); - const float* __restrict filter_ptr = filter.ptr(); - const float* __restrict offset_ptr = offset.ptr(); - const float* __restrict mask_ptr = mask.ptr(); - float* __restrict dst_ptr = dst.ptr(); - MEGDNN_DISPATCH_CPU_KERN_OPR(deformable_conv_forward( - im_ptr, filter_ptr, offset_ptr, mask_ptr, dst_ptr, OC, IC, N, FH, FW, IH, - IW, PH, PW, DH, DW, SH, SW, OH, OW, group, deformable_group)); + im.ptr(), filter.ptr(), offset.ptr(), + mask.ptr(), dst.ptr(), OC, IC, N, FH, FW, IH, IW, PH, PW, DH, + DW, SH, SW, OH, OW, group, deformable_group)); return; } @@ -398,15 +393,11 @@ void BwdFlt::exec( FH = fm.spatial[0], FW = fm.spatial[1], OC = fm.group * fm.ocpg, OH = out.layout[2], OW = out.layout[3]; - const float* __restrict im_ptr = im.ptr(); - const float* __restrict offset_ptr = offset.ptr(); - const float* __restrict mask_ptr = mask.ptr(); - const float* __restrict out_grad_ptr = out_grad.ptr(); - float* __restrict filter_grad_ptr = filter_grad.ptr(); // backward filter MEGDNN_DISPATCH_CPU_KERN_OPR(deformable_conv_backward_weight( - im_ptr, offset_ptr, mask_ptr, out_grad_ptr, filter_grad_ptr, OC, IC, N, FH, - FW, IH, IW, PH, PW, DH, DW, SH, SW, OH, OW, group, deformable_group)); + im.ptr(), offset.ptr(), mask.ptr(), + out_grad.ptr(), filter_grad.ptr(), OC, IC, N, FH, FW, IH, IW, + PH, PW, DH, DW, SH, SW, OH, OW, group, deformable_group)); } size_t BwdData::get_workspace_in_bytes( const TensorLayout& /* im */, const TensorLayout& /* filter */, @@ -429,21 +420,12 @@ void BwdData::exec( FH = fm.spatial[0], FW = fm.spatial[1], OC = fm.group * fm.ocpg, OH = out_grad.layout[2], OW = out_grad.layout[3]; - const float* __restrict im_ptr = im.ptr(); - const float* __restrict filter_ptr = filter.ptr(); - const float* __restrict offset_ptr = offset.ptr(); - const float* __restrict mask_ptr = mask.ptr(); - const float* __restrict out_grad_ptr = out_grad.ptr(); - - float* __restrict im_grad_ptr = im_grad.ptr(); - float* __restrict offset_grad_ptr = offset_grad.ptr(); - float* __restrict mask_grad_ptr = mask_grad.ptr(); - // backward coordinate data MEGDNN_DISPATCH_CPU_KERN_OPR(deformable_conv_backward_data( - im_ptr, filter_ptr, offset_ptr, mask_ptr, out_grad_ptr, im_grad_ptr, - offset_grad_ptr, mask_grad_ptr, OC, IC, N, FH, FW, IH, IW, PH, PW, SH, SW, - DH, DW, OH, OW, group, deformable_group)); + im.ptr(), filter.ptr(), offset.ptr(), + mask.ptr(), out_grad.ptr(), im_grad.ptr(), + offset_grad.ptr(), mask_grad.ptr(), OC, IC, N, FH, FW, IH, IW, + PH, PW, SH, SW, DH, DW, OH, OW, group, deformable_group)); } // vim: syntax=cpp.doxygen diff --git a/dnn/src/naive/elemwise/opr_impl.cpp b/dnn/src/naive/elemwise/opr_impl.cpp index 392f275c..42b63203 100644 --- a/dnn/src/naive/elemwise/opr_impl.cpp +++ b/dnn/src/naive/elemwise/opr_impl.cpp @@ -53,13 +53,12 @@ void fuse_mul_add4(ctype* dest, const ElemwiseOpParamN<4>& param) { } // anonymous namespace -#define on_arity_dispatched_cb_dtype(_dt) \ - if (m_dst->layout.dtype == _dt()) { \ - using dtrait = DTypeTrait<_dt>; \ - using ctype = dtrait::ctype; \ - return ModeDispatcher::run( \ - static_cast(handle()), src, m_param.mode, \ - m_dst->ptr()); \ +#define on_arity_dispatched_cb_dtype(_dt) \ + if (m_dst->layout.dtype == _dt()) { \ + using dtrait = DTypeTrait<_dt>; \ + using ctype = dtrait::ctype; \ + return ModeDispatcher::run( \ 
+ static_cast(handle()), src, m_param.mode, *m_dst); \ } #define _cb_dispatch_mode(_m) \ @@ -70,9 +69,10 @@ void fuse_mul_add4(ctype* dest, const ElemwiseOpParamN<4>& param) { MIDOUT_BEGIN( \ megdnn_naive_elemwise, \ midout_iv(param_enumv::Elemwise::Mode::_m)) { \ + auto params = src; \ MEGDNN_DISPATCH_CPU_KERN( \ handle, ElemArithKernCaller::run( \ - dst, src)); \ + dst.ptr(), params)); \ return; \ } \ MIDOUT_END(); \ @@ -84,7 +84,7 @@ void fuse_mul_add4(ctype* dest, const ElemwiseOpParamN<4>& param) { static constexpr int arity = _arity; \ static void run( \ HandleImpl* handle, const ElemwiseOpParamN& src, Mode mode, \ - ctype* dst) { \ + const TensorND dst) { \ switch (mode) { \ FOREACH(_cb_dispatch_mode) \ default: \ @@ -97,14 +97,16 @@ void fuse_mul_add4(ctype* dest, const ElemwiseOpParamN<4>& param) { template void ElemwiseForwardImpl::impl_fuse_mul_add3(const ElemwiseOpParamN<3>& params) { - auto dptr = m_dst->ptr(); - MEGDNN_DISPATCH_CPU_KERN_OPR(fuse_mul_add3(dptr, params)); + auto dst = *m_dst; + auto elparam = params; + MEGDNN_DISPATCH_CPU_KERN_OPR(fuse_mul_add3(dst.ptr(), elparam)); } template void ElemwiseForwardImpl::impl_fuse_mul_add4(const ElemwiseOpParamN<4>& params) { - auto dptr = m_dst->ptr(); - MEGDNN_DISPATCH_CPU_KERN_OPR(fuse_mul_add4(dptr, params)); + auto dst = *m_dst; + auto elparam = params; + MEGDNN_DISPATCH_CPU_KERN_OPR(fuse_mul_add4(dst.ptr(), elparam)); } } // namespace naive } // namespace megdnn diff --git a/dnn/src/naive/elemwise_multi_type/opr_impl.cpp b/dnn/src/naive/elemwise_multi_type/opr_impl.cpp index 71582bcb..18769e0a 100644 --- a/dnn/src/naive/elemwise_multi_type/opr_impl.cpp +++ b/dnn/src/naive/elemwise_multi_type/opr_impl.cpp @@ -19,18 +19,18 @@ using namespace megdnn; using namespace naive; void ElemwiseMultiTypeImpl::on_fuse_mul_add3_int16x32x32x32( - const ElemwiseOpParamN<3>& param, dt_int32* dst) { - auto iter0 = tensor_iter_valonly(param[0]).begin(); - auto iter1 = tensor_iter_valonly(param[1]).begin(); - auto iter2 = tensor_iter_valonly(param[2]).begin(); - + const ElemwiseOpParamN<3>& param, const TensorND& dst) { auto size = param.size; - auto work = [iter0, iter1, iter2, size, dst]() { - auto i0 = iter0; - auto i1 = iter1; - auto i2 = iter2; + auto src0 = param[0]; + auto src1 = param[1]; + auto src2 = param[2]; + auto work = [src0, src1, src2, size, dst]() { + auto i0 = tensor_iter_valonly(src0).begin(); + auto i1 = tensor_iter_valonly(src1).begin(); + auto i2 = tensor_iter_valonly(src2).begin(); + auto dst_ptr = dst.ptr(); for (size_t i = 0; i < size; ++i) { - dst[i] = (*i0) * (*i1) + (*i2); + dst_ptr[i] = (*i0) * (*i1) + (*i2); ++i0; ++i1; ++i2; @@ -40,7 +40,7 @@ void ElemwiseMultiTypeImpl::on_fuse_mul_add3_int16x32x32x32( } void ElemwiseMultiTypeImpl::on_fuse_mul_add3_iXxf32xf32xi8( - const ElemwiseOpParamN<3>& param, dt_int8* dst) { + const ElemwiseOpParamN<3>& param, const TensorND& dst) { switch (param[0].layout.dtype.enumv()) { #define cb(t) \ case DTypeTrait::enumv: \ @@ -54,19 +54,19 @@ void ElemwiseMultiTypeImpl::on_fuse_mul_add3_iXxf32xf32xi8( template void ElemwiseMultiTypeImpl::dispatch_fma3_iXxf32xf32xi8( - const ElemwiseOpParamN<3>& param, dt_int8* dst) { - auto iter0 = tensor_iter_valonly(param[0]).begin(); - auto iter1 = tensor_iter_valonly(param[1]).begin(); - auto iter2 = tensor_iter_valonly(param[2]).begin(); - + const ElemwiseOpParamN<3>& param, const TensorND& dst) { auto size = param.size; - auto work = [iter0, iter1, iter2, size, dst]() { + auto src0 = param[0]; + auto src1 = param[1]; + auto src2 = param[2]; + 
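The elemwise hunks above (like the rotate and warp_perspective ones earlier in the patch) all copy what the kern needs into locals before handing it to the dispatcher, because the CPU kern may run after `exec()` has returned. A sketch of the rewritten `impl_fuse_mul_add3`, restating the hunk above:

```cpp
// Sketch: copy by value everything the deferred kern touches; never capture
// `this` or a stack reference, and resolve the destination pointer inside
// the dispatched expression.
template <typename ctype>
void ElemwiseForwardImpl::impl_fuse_mul_add3(const ElemwiseOpParamN<3>& params) {
    auto dst = *m_dst;        // TensorND copied by value (RefPtr-backed)
    auto elparam = params;    // operand views copied by value
    MEGDNN_DISPATCH_CPU_KERN_OPR(
            fuse_mul_add3<ctype>(dst.ptr<ctype>(), elparam));
}
```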
auto work = [src0, src1, src2, size, dst]() { elemwise_multi_type::Fma3iXxf32xf32xiYOp op; - auto i0 = iter0; - auto i1 = iter1; - auto i2 = iter2; + auto i0 = tensor_iter_valonly(src0).begin(); + auto i1 = tensor_iter_valonly(src1).begin(); + auto i2 = tensor_iter_valonly(src2).begin(); + auto dst_ptr = dst.ptr(); for (size_t i = 0; i < size; ++i) { - dst[i] = op(*i0, *i1, *i2); + dst_ptr[i] = op(*i0, *i1, *i2); ++i0; ++i1; ++i2; @@ -76,7 +76,7 @@ void ElemwiseMultiTypeImpl::dispatch_fma3_iXxf32xf32xi8( } void ElemwiseMultiTypeImpl::on_round_shr_saturate_iXxi8xi8( - const ElemwiseOpParamN<2>& param, dt_int8* dst) { + const ElemwiseOpParamN<2>& param, const TensorND& dst) { switch (param[0].layout.dtype.enumv()) { #define cb(t) \ case DTypeTrait::enumv: \ @@ -91,16 +91,15 @@ void ElemwiseMultiTypeImpl::on_round_shr_saturate_iXxi8xi8( template void ElemwiseMultiTypeImpl::dispatch_round_shr_saturate_iXxi8xiX( - const ElemwiseOpParamN<2>& param, dst_ctype* dst) { - auto iter_a = tensor_iter_valonly(param[0]).begin(); - auto iter_b = tensor_iter_valonly(param[1]).begin(); - + const ElemwiseOpParamN<2>& param, const TensorND& dst) { + auto src0 = param[0]; + auto src1 = param[1]; auto size = param.size; - auto work = [size, iter_a, iter_b, dst]() { + auto work = [src0, src1, size, dst]() { // This is needed as these iterators are captured as const value. - auto iA = iter_a; - auto iB = iter_b; - auto pD = dst; + auto iA = tensor_iter_valonly(src0).begin(); + auto iB = tensor_iter_valonly(src1).begin(); + auto pD = dst.ptr(); for (size_t i = 0; i < size; i++) { *pD = elemwise_multi_type::round_shr_saturate(*iA, *iB); ++iA; @@ -113,28 +112,28 @@ void ElemwiseMultiTypeImpl::dispatch_round_shr_saturate_iXxi8xiX( template void ElemwiseMultiTypeImpl::dispatch_fuse_add_rmulh_round_shr_saturate( - const ElemwiseOpParamN<6>& param, megdnn::dt_int8* dst) { - auto iter0 = tensor_iter_valonly(param[0]).begin(); - auto iter1 = tensor_iter_valonly(param[1]).begin(); - auto iter2 = tensor_iter_valonly(param[2]).begin(); - auto iter3 = tensor_iter_valonly(param[3]).begin(); - auto iter4 = tensor_iter_valonly(param[4]).begin(); - auto iter5 = tensor_iter_valonly(param[5]).begin(); - + const ElemwiseOpParamN<6>& param, const TensorND& dst) { auto size = param.size; - auto work = [iter0, iter1, iter2, iter3, iter4, iter5, size, dst]() { - auto i0 = iter0; - auto i1 = iter1; - auto i2 = iter2; - auto ioff = iter3; - auto imin = iter4; - auto imax = iter5; + auto src0 = param[0]; + auto src1 = param[1]; + auto src2 = param[2]; + auto src3 = param[3]; + auto src4 = param[4]; + auto src5 = param[5]; + auto work = [size, src0, src1, src2, src3, src4, src5, dst]() { + auto i0 = tensor_iter_valonly(src0).begin(); + auto i1 = tensor_iter_valonly(src1).begin(); + auto i2 = tensor_iter_valonly(src2).begin(); + auto ioff = tensor_iter_valonly(src3).begin(); + auto imin = tensor_iter_valonly(src4).begin(); + auto imax = tensor_iter_valonly(src5).begin(); + auto dst_ptr = dst.ptr(); for (size_t i = 0; i < size; ++i) { auto res = elemwise_multi_type::round_shr_saturate( round_mulh_saturate(*i0 + *i1, *i2), *ioff); res = std::min(res, *imax); res = std::max(res, *imin); - dst[i] = res; + dst_ptr[i] = res; ++i0; ++i1; ++i2; @@ -147,17 +146,17 @@ void ElemwiseMultiTypeImpl::dispatch_fuse_add_rmulh_round_shr_saturate( } void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8( - const ElemwiseOpParamN<6>& param, megdnn::dt_int8* dst) { + const ElemwiseOpParamN<6>& param, const TensorND& dst) { 
dispatch_fuse_add_rmulh_round_shr_saturate(param, dst); } void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int32x32x32x8( - const ElemwiseOpParamN<6>& param, megdnn::dt_int8* dst) { + const ElemwiseOpParamN<6>& param, const TensorND& dst) { dispatch_fuse_add_rmulh_round_shr_saturate(param, dst); } void ElemwiseMultiTypeImpl::on_round_shr_saturate_iXxi8xi16( - const ElemwiseOpParamN<2>& param, dt_int16* dst) { + const ElemwiseOpParamN<2>& param, const TensorND& dst) { switch (param[0].layout.dtype.enumv()) { #define cb(t) \ case DTypeTrait::enumv: \ @@ -174,16 +173,15 @@ void ElemwiseMultiTypeImpl::on_round_shr_saturate_iXxi8xi16( template void ElemwiseMultiTypeImpl::dispatch_add_qint_op( const ElemwiseOpParamN<1>& param, const TensorND& dst_tensor) { - auto iter_a = tensor_iter_valonly(param[0]).begin(); + auto src = param[0]; auto size = param.size; - auto param0 = param[0].layout.dtype.param::dtype>(); - auto dst = tensor_iter_valonly(dst_tensor).begin(); - auto dst_param = - dst_tensor.layout.dtype.param::dtype>(); + auto work = [src, size, dst_tensor]() { + auto iA = tensor_iter_valonly(src).begin(); + auto pD = tensor_iter_valonly(dst_tensor).begin(); - auto work = [size, iter_a, dst, param0, dst_param]() { - auto iA = iter_a; - auto pD = dst; + auto param0 = src.layout.dtype.param::dtype>(); + auto dst_param = + dst_tensor.layout.dtype.param::dtype>(); for (size_t i = 0; i < size; i++) { src_ctype a = *iA; *pD = dst_param.quantize(KernImpl::apply(param0.dequantize(a))); @@ -197,20 +195,18 @@ void ElemwiseMultiTypeImpl::dispatch_add_qint_op( template void ElemwiseMultiTypeImpl::dispatch_add_qint_op( const ElemwiseOpParamN<2>& param, const TensorND& dst_tensor) { - auto iter_a = tensor_iter_valonly(param[0]).begin(); - auto iter_b = tensor_iter_valonly(param[1]).begin(); auto size = param.size; - auto param0 = param[0].layout.dtype.param::dtype>(); - auto param1 = param[1].layout.dtype.param::dtype>(); - auto dst = tensor_iter_valonly(dst_tensor).begin(); - auto dst_param = - dst_tensor.layout.dtype.param::dtype>(); - - auto work = [size, iter_a, iter_b, dst, param0, param1, dst_param]() { + auto src0 = param[0]; + auto src1 = param[1]; + auto work = [src0, src1, size, dst_tensor]() { // This is needed as these iterators are captured as const value. 
- auto iA = iter_a; - auto iB = iter_b; - auto pD = dst; + auto iA = tensor_iter_valonly(src0).begin(); + auto iB = tensor_iter_valonly(src1).begin(); + auto pD = tensor_iter_valonly(dst_tensor).begin(); + auto param0 = src0.layout.dtype.param::dtype>(); + auto param1 = src1.layout.dtype.param::dtype>(); + auto dst_param = + dst_tensor.layout.dtype.param::dtype>(); for (size_t i = 0; i < size; i++) { src_ctype a = *iA; src_ctype b = *iB; @@ -227,24 +223,21 @@ void ElemwiseMultiTypeImpl::dispatch_add_qint_op( template void ElemwiseMultiTypeImpl::dispatch_add_qint_op( const ElemwiseOpParamN<3>& param, const TensorND& dst_tensor) { - auto iter_a = tensor_iter_valonly(param[0]).begin(); - auto iter_b = tensor_iter_valonly(param[1]).begin(); - auto iter_c = tensor_iter_valonly(param[2]).begin(); auto size = param.size; - auto param0 = param[0].layout.dtype.param::dtype>(); - auto param1 = param[1].layout.dtype.param::dtype>(); - auto param2 = param[2].layout.dtype.param::dtype>(); - auto dst = tensor_iter_valonly(dst_tensor).begin(); - auto dst_param = - dst_tensor.layout.dtype.param::dtype>(); - - auto work = [size, iter_a, iter_b, iter_c, dst, param0, param1, param2, - dst_param]() { + auto src0 = param[0]; + auto src1 = param[1]; + auto src2 = param[2]; + auto work = [src0, src1, src2, size, dst_tensor]() { // This is needed as these iterators are captured as const value. - auto iA = iter_a; - auto iB = iter_b; - auto iC = iter_c; - auto pD = dst; + auto iA = tensor_iter_valonly(src0).begin(); + auto iB = tensor_iter_valonly(src1).begin(); + auto iC = tensor_iter_valonly(src2).begin(); + auto pD = tensor_iter_valonly(dst_tensor).begin(); + auto param0 = src0.layout.dtype.param::dtype>(); + auto param1 = src1.layout.dtype.param::dtype>(); + auto param2 = src2.layout.dtype.param::dtype>(); + auto dst_param = + dst_tensor.layout.dtype.param::dtype>(); for (size_t i = 0; i < size; i++) { src_ctype a = *iA; src_ctype b = *iB; diff --git a/dnn/src/naive/elemwise_multi_type/opr_impl.h b/dnn/src/naive/elemwise_multi_type/opr_impl.h index 96f89b0a..60dcaed1 100644 --- a/dnn/src/naive/elemwise_multi_type/opr_impl.h +++ b/dnn/src/naive/elemwise_multi_type/opr_impl.h @@ -37,28 +37,29 @@ class ElemwiseMultiTypeImpl : public ElemwiseMultiTypeImplHelper { protected: template - void dispatch_fma3_iXxf32xf32xi8(const ElemwiseOpParamN<3>& param, dt_int8* dst); + void dispatch_fma3_iXxf32xf32xi8( + const ElemwiseOpParamN<3>& param, const TensorND& dst); template void dispatch_round_shr_saturate_iXxi8xiX( - const ElemwiseOpParamN<2>& param, dst_ctype* dst); + const ElemwiseOpParamN<2>& param, const TensorND& dst); template void dispatch_fuse_add_rmulh_round_shr_saturate( - const ElemwiseOpParamN<6>& param, megdnn::dt_int8* dst); + const ElemwiseOpParamN<6>& param, const TensorND& dst); void on_fuse_mul_add3_int16x32x32x32( - const ElemwiseOpParamN<3>& param, dt_int32* dst) override; + const ElemwiseOpParamN<3>& param, const TensorND& dst) override; void on_fuse_mul_add3_iXxf32xf32xi8( - const ElemwiseOpParamN<3>& param, dt_int8* dst) override; + const ElemwiseOpParamN<3>& param, const TensorND& dst) override; void on_round_shr_saturate_iXxi8xi8( - const ElemwiseOpParamN<2>& param, dt_int8* dst) override; + const ElemwiseOpParamN<2>& param, const TensorND& dst) override; void on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8( - const ElemwiseOpParamN<6>& param, dt_int8* dst) override; + const ElemwiseOpParamN<6>& param, const TensorND& dst) override; void on_fuse_add_rmulh_round_shr_saturate_int32x32x32x8( - 
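The `dispatch_add_qint_op` rewrites above move both the value iterators and the DTypeParam lookups into the work lambda, so only TensorND copies are captured. A sketch of the two-operand case with names from the hunks; the two-argument `KernImpl::apply` is inferred from the one-operand case shown earlier and the stripped template arguments are reconstructed.

```cpp
// Sketch: everything derived from the tensors is re-created inside the
// lambda that the dispatcher may run later.
auto src0 = param[0];
auto src1 = param[1];
auto size = param.size;
auto work = [src0, src1, size, dst_tensor]() {
    auto iA = tensor_iter_valonly<src_ctype>(src0).begin();
    auto iB = tensor_iter_valonly<src_ctype>(src1).begin();
    auto pD = tensor_iter_valonly<dst_ctype>(dst_tensor).begin();
    auto param0 = src0.layout.dtype.param<typename DTypeTrait<src_ctype>::dtype>();
    auto param1 = src1.layout.dtype.param<typename DTypeTrait<src_ctype>::dtype>();
    auto dst_param =
            dst_tensor.layout.dtype.param<typename DTypeTrait<dst_ctype>::dtype>();
    for (size_t i = 0; i < size; i++) {
        src_ctype a = *iA;
        src_ctype b = *iB;
        *pD = dst_param.quantize(
                KernImpl::apply(param0.dequantize(a), param1.dequantize(b)));
        ++iA; ++iB; ++pD;
    }
};
MEGDNN_DISPATCH_CPU_KERN_OPR(work());
```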
const ElemwiseOpParamN<6>& param, dt_int8* dst) override; + const ElemwiseOpParamN<6>& param, const TensorND& dst) override; void on_round_shr_saturate_iXxi8xi16( - const ElemwiseOpParamN<2>& param, dt_int16* dst) override; + const ElemwiseOpParamN<2>& param, const TensorND& dst) override; void on_quantized_mode( const ElemwiseOpParamN<1>& param, const TensorND& dst, diff --git a/dnn/src/naive/eye/opr_impl.cpp b/dnn/src/naive/eye/opr_impl.cpp index 5e571d26..3e6dd1a9 100644 --- a/dnn/src/naive/eye/opr_impl.cpp +++ b/dnn/src/naive/eye/opr_impl.cpp @@ -25,11 +25,10 @@ void EyeImpl::exec(_megdnn_tensor_out dst, _megdnn_workspace workspace) { std::max(dst.layout.shape[0], dst.layout.shape[1]) < static_cast(std::numeric_limits::max())); int m = dst.layout.shape[0], n = dst.layout.shape[1]; -#define cb(DType) \ - if (dst.layout.dtype.enumv() == DTypeTrait::enumv) { \ - using ctype = typename DTypeTrait::ctype; \ - ctype* ptr = dst.ptr(); \ - MEGDNN_DISPATCH_CPU_KERN_OPR(exec_internal(ptr, m, n)); \ +#define cb(DType) \ + if (dst.layout.dtype.enumv() == DTypeTrait::enumv) { \ + using ctype = typename DTypeTrait::ctype; \ + MEGDNN_DISPATCH_CPU_KERN_OPR(exec_internal(dst.ptr(), m, n)); \ } MEGDNN_FOREACH_COMPUTING_DTYPE(cb) #undef cb diff --git a/dnn/src/naive/fill/opr_impl.cpp b/dnn/src/naive/fill/opr_impl.cpp index a8955b94..b04c49de 100644 --- a/dnn/src/naive/fill/opr_impl.cpp +++ b/dnn/src/naive/fill/opr_impl.cpp @@ -22,11 +22,10 @@ namespace naive { void FillImpl::exec(_megdnn_tensor_out dst, _megdnn_workspace workspace) { check_exec(dst.layout, workspace.size); size_t size = dst.layout.total_nr_elems(); -#define cb(DType) \ - if (dst.layout.dtype.enumv() == DTypeTrait::enumv) { \ - using ctype = typename DTypeTrait::ctype; \ - ctype* ptr = dst.ptr(); \ - MEGDNN_DISPATCH_CPU_KERN_OPR(exec_internal(ptr, size)); \ +#define cb(DType) \ + if (dst.layout.dtype.enumv() == DTypeTrait::enumv) { \ + using ctype = typename DTypeTrait::ctype; \ + MEGDNN_DISPATCH_CPU_KERN_OPR(exec_internal(dst.ptr(), size)); \ } MEGDNN_FOREACH_COMPUTING_DTYPE(cb) #undef cb diff --git a/dnn/src/naive/group_local/opr_impl.cpp b/dnn/src/naive/group_local/opr_impl.cpp index 7793925e..7810ba38 100644 --- a/dnn/src/naive/group_local/opr_impl.cpp +++ b/dnn/src/naive/group_local/opr_impl.cpp @@ -142,13 +142,17 @@ void GroupLocalForwardImpl::exec( auto group = filter.layout.shape[0]; auto FH = filter.layout.shape[4], FW = filter.layout.shape[5]; auto OC = dst.layout.shape[1], OH = dst.layout.shape[2], OW = dst.layout.shape[3]; + auto pad_h = param().pad_h; + auto pad_w = param().pad_w; + auto stride_h = param().stride_h; + auto stride_w = param().stride_w; if (src.layout.dtype == dtype::Float32() && filter.layout.dtype == dtype::Float32() && dst.layout.dtype == dtype::Float32()) { - MEGDNN_DISPATCH_CPU_KERN_OPR(forward( - src.ptr(), filter.ptr(), dst.ptr(), - N, IC, IH, IW, FH, FW, OC, OH, OW, group, param().pad_h, param().pad_w, - param().stride_h, param().stride_w)); + MEGDNN_DISPATCH_CPU_KERN_OPR( + forward(src.ptr(), filter.ptr(), + dst.ptr(), N, IC, IH, IW, FH, FW, OC, OH, OW, group, + pad_h, pad_w, stride_h, stride_w)); } else if (DNN_FLOAT16_SELECT( src.layout.dtype == dtype::Float16() && filter.layout.dtype == dtype::Float16() && @@ -156,8 +160,8 @@ void GroupLocalForwardImpl::exec( false)) { DNN_INC_FLOAT16(MEGDNN_DISPATCH_CPU_KERN_OPR(forward( src.ptr(), filter.ptr(), dst.ptr(), - N, IC, IH, IW, FH, FW, OC, OH, OW, group, param().pad_h, param().pad_w, - param().stride_h, param().stride_w));); + N, IC, IH, IW, FH, FW, OC, 
OH, OW, group, pad_h, pad_w, stride_h, + stride_w));); } else { megdnn_assert_internal(false); @@ -174,10 +178,13 @@ void GroupLocalBackwardDataImpl::exec( auto FH = filter.layout.shape[4], FW = filter.layout.shape[5]; auto OC = diff.layout.shape[1], OH = diff.layout.shape[2], OW = diff.layout.shape[3]; + auto pad_h = param().pad_h; + auto pad_w = param().pad_w; + auto stride_h = param().stride_h; + auto stride_w = param().stride_w; MEGDNN_DISPATCH_CPU_KERN_OPR(backward_data( filter.ptr(), diff.ptr(), grad.ptr(), N, - IC, IH, IW, FH, FW, OC, OH, OW, group, param().pad_h, param().pad_w, - param().stride_h, param().stride_w)); + IC, IH, IW, FH, FW, OC, OH, OW, group, pad_h, pad_w, stride_h, stride_w)); } void GroupLocalBackwardFilterImpl::exec( @@ -190,10 +197,13 @@ void GroupLocalBackwardFilterImpl::exec( auto FH = grad.layout.shape[4], FW = grad.layout.shape[5]; auto OC = diff.layout.shape[1], OH = diff.layout.shape[2], OW = diff.layout.shape[3]; + auto pad_h = param().pad_h; + auto pad_w = param().pad_w; + auto stride_h = param().stride_h; + auto stride_w = param().stride_w; MEGDNN_DISPATCH_CPU_KERN_OPR(backward_filter( src.ptr(), diff.ptr(), grad.ptr(), N, - IC, IH, IW, FH, FW, OC, OH, OW, group, param().pad_h, param().pad_w, - param().stride_h, param().stride_w)); + IC, IH, IW, FH, FW, OC, OH, OW, group, pad_h, pad_w, stride_h, stride_w)); } } // namespace naive diff --git a/dnn/src/naive/linspace/opr_impl.cpp b/dnn/src/naive/linspace/opr_impl.cpp index af8ee2c2..1d9ebd69 100644 --- a/dnn/src/naive/linspace/opr_impl.cpp +++ b/dnn/src/naive/linspace/opr_impl.cpp @@ -19,12 +19,11 @@ namespace naive { void LinspaceImpl::exec(_megdnn_tensor_out dst, _megdnn_workspace workspace) { check_exec(dst.layout, workspace.size); size_t n = dst.layout.total_nr_elems(); -#define cb(DType) \ - if (dst.layout.dtype == DType()) { \ - using ctype = typename DTypeTrait::ctype; \ - auto ptr = dst.ptr(); \ - MEGDNN_DISPATCH_CPU_KERN_OPR(exec_internal(ptr, n)); \ - return; \ +#define cb(DType) \ + if (dst.layout.dtype == DType()) { \ + using ctype = typename DTypeTrait::ctype; \ + MEGDNN_DISPATCH_CPU_KERN_OPR(exec_internal(dst.ptr(), n)); \ + return; \ } MEGDNN_FOREACH_COMPUTING_DTYPE(cb) megdnn_assert_internal(0); diff --git a/dnn/src/naive/local/local.cpp b/dnn/src/naive/local/opr_impl.cpp similarity index 95% rename from dnn/src/naive/local/local.cpp rename to dnn/src/naive/local/opr_impl.cpp index efe6bf76..21c8da1c 100644 --- a/dnn/src/naive/local/local.cpp +++ b/dnn/src/naive/local/opr_impl.cpp @@ -1,5 +1,5 @@ /** - * \file dnn/src/naive/local/local.cpp + * \file dnn/src/naive/local/opr_impl.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
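The GroupLocal hunks above apply the same capture rule to plain parameters: `pad_h`, `pad_w`, `stride_h` and `stride_w` are snapshotted into locals before the kern is built, so the dispatched expression never calls `param()` through `this`. A short sketch of the forward path, restating the hunk (the explicit `dt_float32` template arguments are reconstructed from the stripped text):

```cpp
// Sketch: snapshot the POD parameters first, then dispatch.
auto pad_h = param().pad_h;
auto pad_w = param().pad_w;
auto stride_h = param().stride_h;
auto stride_w = param().stride_w;
MEGDNN_DISPATCH_CPU_KERN_OPR(forward(
        src.ptr<dt_float32>(), filter.ptr<dt_float32>(), dst.ptr<dt_float32>(),
        N, IC, IH, IW, FH, FW, OC, OH, OW, group,
        pad_h, pad_w, stride_h, stride_w));
```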
@@ -82,7 +82,7 @@ void LocalForwardImpl::exec( LocalForwardImpl::FloatNoncontigBatchKernParam LocalForwardImpl::make_float_kern_param( _megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_out dst, _megdnn_workspace workspace) const { - return {src.raw_ptr, filter.raw_ptr, dst.raw_ptr, + return {src.get_ref_ptr(), filter.get_ref_ptr(), dst.get_ref_ptr(), // n src.layout.shape[0], // ic, ih, iw, oc, oh, ow, fh, fw @@ -116,10 +116,10 @@ void LocalBackwardDataImpl::exec( OW = diff.layout.shape[3]; size_t ph = param().pad_h, pw = param().pad_w; size_t sh = param().stride_h, sw = param().stride_w; - auto gptr = grad.ptr(), fptr = filter.ptr(), - hptr = diff.ptr(); auto mode = param().mode; auto kern = [=]() { + auto gptr = grad.ptr(), fptr = filter.ptr(), + hptr = diff.ptr(); memset(gptr, 0, sizeof(float_t) * N * IC * IH * IW); rep(n, N) rep(oc, OC) rep(oh, OH) rep(ow, OW) { // auto &hval = hptr[n*OC*OH*OW + oc*OH*OW + oh*OW + ow]; @@ -164,10 +164,10 @@ void LocalBackwardFilterImpl::exec( OW = diff.layout.shape[3]; size_t ph = param().pad_h, pw = param().pad_w; size_t sh = param().stride_h, sw = param().stride_w; - auto gptr = grad.ptr(), sptr = src.ptr(), - hptr = diff.ptr(); auto mode = param().mode; auto kern = [=]() { + auto gptr = grad.ptr(), sptr = src.ptr(), + hptr = diff.ptr(); memset(gptr, 0, sizeof(float_t) * OH * OW * IC * FH * FW * OC); rep(n, N) rep(oc, OC) rep(oh, OH) rep(ow, OW) { // auto &hval = hptr[n*OC*OH*OW + oc*OH*OW + oh*OW + ow]; diff --git a/dnn/src/naive/local/opr_impl.h b/dnn/src/naive/local/opr_impl.h index 60e560ab..bbf22975 100644 --- a/dnn/src/naive/local/opr_impl.h +++ b/dnn/src/naive/local/opr_impl.h @@ -26,9 +26,9 @@ public: } struct FloatNoncontigBatchKernParam { - const void* src; - const void* filter; - void* dst; + RefPtr src; + RefPtr filter; + RefPtr dst; size_t n, ic, ih, iw, oc, oh, ow, fh, fw; uint32_t ph, pw, sh, sw; ptrdiff_t inp_bs, out_bs; //!< stride for batch of input, output @@ -94,9 +94,9 @@ public: //! 
define local variables for fields in LocalImpl::FloatNoncontigBatchKernParam #define UNPACK_LOCAL_FLOAT_NONCONTIG_BATCH_KERN_PARAM(_p, _dtype) \ - const _dtype* src = static_cast(_p.src); \ - const _dtype* filter = static_cast(_p.filter); \ - _dtype* dst = static_cast<_dtype*>(_p.dst); \ + const _dtype* src = static_cast(_p.src.get_ptr()); \ + const _dtype* filter = static_cast(_p.filter.get_ptr()); \ + _dtype* dst = static_cast<_dtype*>(_p.dst.get_ptr()); \ _dtype* workspace = static_cast<_dtype*>(_p.workspace); \ const int N = _p.n, IC = _p.ic, IH = _p.ih, IW = _p.iw, OC = _p.oc, OH = _p.oh, \ OW = _p.ow, FH = _p.fh, FW = _p.fw; \ diff --git a/dnn/src/naive/lowbit_utils.cpp b/dnn/src/naive/lowbit_utils.cpp index 5e5b965b..c5ac166b 100644 --- a/dnn/src/naive/lowbit_utils.cpp +++ b/dnn/src/naive/lowbit_utils.cpp @@ -14,7 +14,7 @@ // =================================quint4====================================== void megdnn::naive::uint4_to_uint8(const TensorND& in, const TensorND& out) { - auto in_ptr = static_cast(in.raw_ptr) + in.layout.span().low_byte; + auto in_ptr = static_cast(in.raw_ptr()) + in.layout.span().low_byte; auto out_ptr = out.compatible_ptr() + out.layout.span().low_byte; const auto& ly = in.layout; auto dim_in = ly.shape[ly.ndim - 1]; @@ -34,8 +34,8 @@ void megdnn::naive::uint4_to_uint8(const TensorND& in, const TensorND& out) { } void megdnn::naive::uint8_to_uint4(const TensorND& in, const TensorND& out) { - auto in_ptr = static_cast(in.raw_ptr) + in.layout.span().low_byte; - auto out_ptr = static_cast(out.raw_ptr) + out.layout.span().low_byte; + auto in_ptr = static_cast(in.raw_ptr()) + in.layout.span().low_byte; + auto out_ptr = static_cast(out.raw_ptr()) + out.layout.span().low_byte; const auto& ly = in.layout; auto dim_in = ly.shape[ly.ndim - 1]; auto elems = ly.total_nr_elems(); @@ -57,7 +57,7 @@ void megdnn::naive::uint8_to_uint4(const TensorND& in, const TensorND& out) { } void megdnn::naive::uint4_to_int8(const TensorND& in, const TensorND& out) { - auto in_ptr = static_cast(in.raw_ptr) + in.layout.span().low_byte; + auto in_ptr = static_cast(in.raw_ptr()) + in.layout.span().low_byte; auto out_ptr = out.compatible_ptr() + out.layout.span().low_byte; const auto& ly = in.layout; int8_t zero_point = (int8_t)ly.dtype.param().zero_point; @@ -78,8 +78,8 @@ void megdnn::naive::uint4_to_int8(const TensorND& in, const TensorND& out) { } void megdnn::naive::int8_to_uint4(const TensorND& in, const TensorND& out) { - auto in_ptr = static_cast(in.raw_ptr) + in.layout.span().low_byte; - auto out_ptr = static_cast(out.raw_ptr) + out.layout.span().low_byte; + auto in_ptr = static_cast(in.raw_ptr()) + in.layout.span().low_byte; + auto out_ptr = static_cast(out.raw_ptr()) + out.layout.span().low_byte; auto zero_point = out.layout.dtype.param().zero_point; const auto& ly = in.layout; auto dim_in = ly.shape[ly.ndim - 1]; @@ -103,8 +103,8 @@ void megdnn::naive::int8_to_uint4(const TensorND& in, const TensorND& out) { // ==================================qint4====================================== void megdnn::naive::int4_to_int8(const TensorND& in, const TensorND& out) { - auto in_ptr = static_cast(in.raw_ptr) + in.layout.span().low_byte; - auto out_ptr = static_cast(out.raw_ptr) + out.layout.span().low_byte; + auto in_ptr = static_cast(in.raw_ptr()) + in.layout.span().low_byte; + auto out_ptr = static_cast(out.raw_ptr()) + out.layout.span().low_byte; const auto& ly = in.layout; auto dim_in = ly.shape[ly.ndim - 1]; auto elems = ly.total_nr_elems(); @@ -124,8 +124,8 @@ void 
megdnn::naive::int4_to_int8(const TensorND& in, const TensorND& out) { } void megdnn::naive::int8_to_int4(const TensorND& in, const TensorND& out) { - auto in_ptr = static_cast(in.raw_ptr) + in.layout.span().low_byte; - auto out_ptr = static_cast(out.raw_ptr) + out.layout.span().low_byte; + auto in_ptr = static_cast(in.raw_ptr()) + in.layout.span().low_byte; + auto out_ptr = static_cast(out.raw_ptr()) + out.layout.span().low_byte; const auto& ly = in.layout; auto dim_in = ly.shape[ly.ndim - 1]; auto elems = ly.total_nr_elems(); diff --git a/dnn/src/naive/matrix_inverse/opr_impl.cpp b/dnn/src/naive/matrix_inverse/opr_impl.cpp index 47344c15..1ee5f3be 100644 --- a/dnn/src/naive/matrix_inverse/opr_impl.cpp +++ b/dnn/src/naive/matrix_inverse/opr_impl.cpp @@ -23,8 +23,7 @@ size_t MatrixInverseImpl::get_workspace_in_bytes( } template -void MatrixInverseImpl::do_exec( - ctype* dst, const ctype* src, size_t batch, size_t n, void* workspace) { +void do_exec(ctype* dst, const ctype* src, size_t batch, size_t n, void* workspace) { auto row_ptr = static_cast(workspace); auto exmat = reinterpret_cast(row_ptr + n); for (size_t b = 0; b < batch; ++b, src += n * n, dst += n * n) { @@ -81,14 +80,12 @@ void MatrixInverseImpl::exec( _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) { size_t batch, n; check_exec(src.layout, dst.layout, workspace, &batch, &n); -#define cb(DType) \ - if (dst.layout.dtype == DType()) { \ - using ctype = typename DTypeTrait::ctype; \ - auto psrc = src.ptr(); \ - auto pdst = dst.ptr(); \ - void* pwk = workspace.raw_ptr; \ - MEGDNN_DISPATCH_CPU_KERN_OPR(do_exec(pdst, psrc, batch, n, pwk)); \ - return; \ +#define cb(DType) \ + if (dst.layout.dtype == DType()) { \ + using ctype = typename DTypeTrait::ctype; \ + MEGDNN_DISPATCH_CPU_KERN_OPR(do_exec( \ + dst.ptr(), src.ptr(), batch, n, workspace.raw_ptr)); \ + return; \ } MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) #undef cb diff --git a/dnn/src/naive/matrix_inverse/opr_impl.h b/dnn/src/naive/matrix_inverse/opr_impl.h index 18100cf5..51cf5b17 100644 --- a/dnn/src/naive/matrix_inverse/opr_impl.h +++ b/dnn/src/naive/matrix_inverse/opr_impl.h @@ -22,10 +22,6 @@ public: _megdnn_workspace workspace) override; protected: - template - static void do_exec( - ctype* dst, const ctype* src, size_t batch, size_t n, void* workspace); - size_t get_workspace_in_bytes(size_t batch, size_t n, size_t dtype_size) override; }; diff --git a/dnn/src/naive/matrix_mul/matrix_mul_helper.h b/dnn/src/naive/matrix_mul/matrix_mul_helper.h index fbb08e8a..b2588214 100644 --- a/dnn/src/naive/matrix_mul/matrix_mul_helper.h +++ b/dnn/src/naive/matrix_mul/matrix_mul_helper.h @@ -174,7 +174,7 @@ void exec_matrix_mul_quint4x4x32_helper( static_cast(workspace) + nA.layout.span().dist_byte(), convert_layout(B_layout)}; auto convert_4to8 = [](const TensorND& in, const TensorND& out) { - auto ptr = static_cast(in.raw_ptr) + in.layout.span().low_byte; + auto ptr = static_cast(in.raw_ptr()) + in.layout.span().low_byte; auto out_ptr = out.compatible_ptr() + out.layout.span().low_byte; for (size_t i = 0; i < in.layout.span().dist_elem(); i += 2) { uint8_t val = ptr[i / 2]; @@ -225,7 +225,7 @@ void exec_matrix_mul_qint4x4x16_helper( static_cast(workspace) + nA.layout.span().dist_byte(), convert_layout(B_layout)}; auto convert_4to8 = [](const TensorND& in, const TensorND& out) { - auto ptr = static_cast(in.raw_ptr) + in.layout.span().low_byte; + auto ptr = static_cast(in.raw_ptr()) + in.layout.span().low_byte; auto out_ptr = out.compatible_ptr() + 
out.layout.span().low_byte; for (size_t i = 0; i < in.layout.span().dist_elem(); i += 2) { int8_t cur = ptr[i / 2]; diff --git a/dnn/src/naive/matrix_mul/opr_impl.cpp b/dnn/src/naive/matrix_mul/opr_impl.cpp index d1187cd2..822ac26e 100644 --- a/dnn/src/naive/matrix_mul/opr_impl.cpp +++ b/dnn/src/naive/matrix_mul/opr_impl.cpp @@ -20,20 +20,6 @@ MIDOUT_DECL(megdnn_naive_matmul) namespace megdnn { namespace naive { -size_t MatrixMulForwardImpl::get_workspace_in_bytes( - const TensorLayout& A, const TensorLayout& B, const TensorLayout&) { - MIDOUT_BEGIN( - megdnn_naive_matmul, - midout_iv("MatrixMulForwardImpl::get_workspace_in_bytes"_hash)) { - if (A.dtype.enumv() == DTypeEnum::Quantized4Asymm || - A.dtype.enumv() == DTypeEnum::QuantizedS4) { - return (A.span().dist_elem() + B.span().dist_elem()) * sizeof(uint8_t); - } - return 0; - } - MIDOUT_END(); -} - template void dispatch_ta_tb( _megdnn_tensor_in A, _megdnn_tensor_in B, _megdnn_tensor_out C, @@ -43,14 +29,14 @@ void dispatch_ta_tb( auto LDA = A.layout.stride[0], LDB = B.layout.stride[0], LDC = C.layout.stride[0]; dispatch_ta_tb( - A.raw_ptr, B.raw_ptr, C.raw_ptr, workspace.raw_ptr, M, N, K, LDA, LDB, LDC, - A.layout.dtype, B.layout.dtype, C.layout.dtype, param.format, + A.raw_ptr(), B.raw_ptr(), C.raw_ptr(), workspace.raw_ptr, M, N, K, LDA, LDB, + LDC, A.layout.dtype, B.layout.dtype, C.layout.dtype, param.format, param.compute_mode); } -void MatrixMulForwardImpl::exec_internal( +void exec_internal( _megdnn_tensor_in A, _megdnn_tensor_in B, _megdnn_tensor_out C, - _megdnn_workspace workspace, const Param& param) { + _megdnn_workspace workspace, const MatrixMulForward::Param& param) { #define DISPATCH(TA, TB) \ if (param.transposeA == TA && param.transposeB == TB) { \ dispatch_ta_tb(A, B, C, workspace, param); \ @@ -75,6 +61,20 @@ void MatrixMulForwardImpl::exec( MIDOUT_END(); } +size_t MatrixMulForwardImpl::get_workspace_in_bytes( + const TensorLayout& A, const TensorLayout& B, const TensorLayout&) { + MIDOUT_BEGIN( + megdnn_naive_matmul, + midout_iv("MatrixMulForwardImpl::get_workspace_in_bytes"_hash)) { + if (A.dtype.enumv() == DTypeEnum::Quantized4Asymm || + A.dtype.enumv() == DTypeEnum::QuantizedS4) { + return (A.span().dist_elem() + B.span().dist_elem()) * sizeof(uint8_t); + } + return 0; + } + MIDOUT_END(); +} + std::vector MatrixMulForwardImpl::get_all_algorithms( const TensorLayout& /*A*/, const TensorLayout& /*B*/, const TensorLayout& /*C*/) { diff --git a/dnn/src/naive/matrix_mul/opr_impl.h b/dnn/src/naive/matrix_mul/opr_impl.h index 543da528..fb40f2fa 100644 --- a/dnn/src/naive/matrix_mul/opr_impl.h +++ b/dnn/src/naive/matrix_mul/opr_impl.h @@ -43,11 +43,6 @@ public: Algorithm* get_algorithm_from_desc(const AlgorithmDesc&) override; const char* get_algorithm_set_name() const override { return "DEFAULT"; } - -private: - static void exec_internal( - _megdnn_tensor_in A, _megdnn_tensor_in B, _megdnn_tensor_out C, - _megdnn_workspace workspace, const Param& param); }; } // namespace naive diff --git a/dnn/src/naive/mesh_indexing/opr_impl.cpp b/dnn/src/naive/mesh_indexing/opr_impl.cpp index 1fd6711b..46127596 100644 --- a/dnn/src/naive/mesh_indexing/opr_impl.cpp +++ b/dnn/src/naive/mesh_indexing/opr_impl.cpp @@ -44,10 +44,10 @@ namespace megdnn { namespace naive { /* =========================== MeshIndexing ============================ */ - template -void MeshIndexingImpl::exec_mesh_indexing( - const TensorND& src_tensor, const IndexDesc& desc, const TensorND& dst_tensor) { +void exec_mesh_indexing( + const TensorND& src_tensor, 
const MeshIndexing::IndexDesc& desc, + const TensorND& dst_tensor) { // normal mesh indexing. auto iter = tensor_iter(dst_tensor).begin(); size_t ndim = dst_tensor.layout.ndim; @@ -78,77 +78,81 @@ void MeshIndexingImpl::exec( megdnn_assert_internal(0); } -/* ========================= BatchedMeshIndexing =========================== */ - +/* ========================= IncrMeshIndexing =========================== */ template -void BatchedMeshIndexingImpl::do_exec( - const TensorND& src_tensor, const IndexDesc& desc, const TensorND& dst_tensor) { - auto iter = tensor_iter(dst_tensor).begin(); - size_t ndim = dst_tensor.layout.ndim; - auto ptr = src_tensor.ptr(); - for (size_t dst_idx = 0; dst_idx < dst_tensor.layout.total_nr_elems(); ++dst_idx) { +void exec_incr_mesh_indexing( + const TensorND& data, const TensorND& value, + const IncrMeshIndexing::IndexDesc& desc) { + auto iter = tensor_iter(value).begin(); + size_t ndim = value.layout.ndim; + auto ptr = data.ptr(); + for (size_t idx = 0; idx < value.layout.total_nr_elems(); ++idx) { int index[TensorShape::MAX_NDIM]; std::copy(iter.idx(), iter.idx() + ndim, index); - size_t src_idx = get_index(src_tensor, dst_tensor, desc, index); - *iter = ptr[src_idx]; + size_t data_idx = get_index(data, value, desc, index); + ptr[data_idx] += *iter; ++iter; } } -void BatchedMeshIndexingImpl::exec( - _megdnn_tensor_in src, const IndexDesc& desc, _megdnn_tensor_out dst, - _megdnn_workspace) { - check_exec(src.layout, dst.layout, desc); - -#define cb(DType) \ - if (dst.layout.dtype.enumv() == DTypeTrait::enumv) { \ - using ctype = typename DTypeTrait::ctype; \ - MEGDNN_DISPATCH_CPU_KERN_OPR(do_exec(src, desc, dst)); \ - return; \ +void IncrMeshIndexingImpl::exec( + _megdnn_tensor_inout data, _megdnn_tensor_in value, const IndexDesc& desc, + _megdnn_workspace workspace) { + MEGDNN_MARK_USED_VAR(workspace); + check_exec(data.layout, value.layout, desc); +#define cb(DType) \ + if (data.layout.dtype.enumv() == DTypeTrait::enumv) { \ + using ctype = typename DTypeTrait::ctype; \ + MEGDNN_DISPATCH_CPU_KERN_OPR( \ + exec_incr_mesh_indexing(data, value, desc)); \ + return; \ } + MEGDNN_FOREACH_COMPUTING_DTYPE(cb) - MEGDNN_FOREACH_QUANTIZED_DTYPE(cb) #undef cb megdnn_assert_internal(0); } -/* ============================ Mesh ============================= */ - +/* ========================= BatchedMeshIndexing =========================== */ template -void IncrMeshIndexingImpl::do_exec( - const TensorND& data, const TensorND& value, const IndexDesc& desc) { - auto iter = tensor_iter(value).begin(); - size_t ndim = value.layout.ndim; - auto ptr = data.ptr(); - for (size_t idx = 0; idx < value.layout.total_nr_elems(); ++idx) { +void exec_batched_mesh_indexing( + const TensorND& src_tensor, const BatchedMeshIndexing::IndexDesc& desc, + const TensorND& dst_tensor) { + auto iter = tensor_iter(dst_tensor).begin(); + size_t ndim = dst_tensor.layout.ndim; + auto ptr = src_tensor.ptr(); + for (size_t dst_idx = 0; dst_idx < dst_tensor.layout.total_nr_elems(); ++dst_idx) { int index[TensorShape::MAX_NDIM]; std::copy(iter.idx(), iter.idx() + ndim, index); - size_t data_idx = get_index(data, value, desc, index); - ptr[data_idx] += *iter; + size_t src_idx = get_index(src_tensor, dst_tensor, desc, index); + *iter = ptr[src_idx]; ++iter; } } -void IncrMeshIndexingImpl::exec( - _megdnn_tensor_inout data, _megdnn_tensor_in value, const IndexDesc& desc, - _megdnn_workspace workspace) { - MEGDNN_MARK_USED_VAR(workspace); - check_exec(data.layout, value.layout, desc); -#define cb(DType) \ - 
if (data.layout.dtype.enumv() == DTypeTrait::enumv) { \ - using ctype = typename DTypeTrait::ctype; \ - MEGDNN_DISPATCH_CPU_KERN_OPR(do_exec(data, value, desc)); \ - return; \ - } +void BatchedMeshIndexingImpl::exec( + _megdnn_tensor_in src, const IndexDesc& desc, _megdnn_tensor_out dst, + _megdnn_workspace) { + check_exec(src.layout, dst.layout, desc); +#define cb(DType) \ + if (dst.layout.dtype.enumv() == DTypeTrait::enumv) { \ + using ctype = typename DTypeTrait::ctype; \ + MEGDNN_DISPATCH_CPU_KERN_OPR( \ + exec_batched_mesh_indexing(src, desc, dst)); \ + return; \ + } MEGDNN_FOREACH_COMPUTING_DTYPE(cb) + MEGDNN_FOREACH_QUANTIZED_DTYPE(cb) #undef cb megdnn_assert_internal(0); } +/* ========================= SetMeshIndexing =========================== */ template -void SetMeshIndexingImpl::do_exec( - const TensorND& data, const TensorND& value, const IndexDesc& desc) { +void exec_set_mesh_indexing( + const TensorND& data, const TensorND& value, + const SetMeshIndexing::IndexDesc& desc) { auto iter = tensor_iter(value).begin(); size_t ndim = value.layout.ndim; auto ptr = data.ptr(); @@ -166,11 +170,12 @@ void SetMeshIndexingImpl::exec( _megdnn_workspace workspace) { MEGDNN_MARK_USED_VAR(workspace); check_exec(data.layout, value.layout, desc); -#define cb(DType) \ - if (data.layout.dtype.enumv() == DTypeTrait::enumv) { \ - using ctype = typename DTypeTrait::ctype; \ - MEGDNN_DISPATCH_CPU_KERN_OPR(do_exec(data, value, desc)); \ - return; \ +#define cb(DType) \ + if (data.layout.dtype.enumv() == DTypeTrait::enumv) { \ + using ctype = typename DTypeTrait::ctype; \ + MEGDNN_DISPATCH_CPU_KERN_OPR( \ + exec_set_mesh_indexing(data, value, desc)); \ + return; \ } MEGDNN_FOREACH_COMPUTING_DTYPE(cb) @@ -179,11 +184,11 @@ void SetMeshIndexingImpl::exec( megdnn_assert_internal(0); } -/* =========================== BatchedMesh =========================== */ - +/* =========================== BatchedIncrMeshIndexing =========================== */ template -void BatchedIncrMeshIndexingImpl::do_exec( - const TensorND& data, const TensorND& value, const IndexDesc& desc) { +void exec_batched_incr_mesh_indexing( + const TensorND& data, const TensorND& value, + const BatchedIncrMeshIndexing::IndexDesc& desc) { auto iter = tensor_iter(value).begin(); size_t ndim = value.layout.ndim; auto ptr = data.ptr(); @@ -201,11 +206,12 @@ void BatchedIncrMeshIndexingImpl::exec( _megdnn_workspace workspace) { MEGDNN_MARK_USED_VAR(workspace); check_exec(data.layout, value.layout, desc); -#define cb(DType) \ - if (data.layout.dtype.enumv() == DTypeTrait::enumv) { \ - using ctype = typename DTypeTrait::ctype; \ - MEGDNN_DISPATCH_CPU_KERN_OPR(do_exec(data, value, desc)); \ - return; \ +#define cb(DType) \ + if (data.layout.dtype.enumv() == DTypeTrait::enumv) { \ + using ctype = typename DTypeTrait::ctype; \ + MEGDNN_DISPATCH_CPU_KERN_OPR( \ + exec_batched_incr_mesh_indexing(data, value, desc)); \ + return; \ } MEGDNN_FOREACH_COMPUTING_DTYPE(cb) @@ -213,9 +219,11 @@ void BatchedIncrMeshIndexingImpl::exec( megdnn_assert_internal(0); } +/* =========================== BatchedSetMeshIndexing =========================== */ template -void BatchedSetMeshIndexingImpl::do_exec( - const TensorND& data, const TensorND& value, const IndexDesc& desc) { +void exec_batched_set_mesh_indexing( + const TensorND& data, const TensorND& value, + const BatchedSetMeshIndexing::IndexDesc& desc) { auto iter = tensor_iter(value).begin(); size_t ndim = value.layout.ndim; auto ptr = data.ptr(); @@ -233,11 +241,12 @@ void BatchedSetMeshIndexingImpl::exec( 
_megdnn_workspace workspace) { MEGDNN_MARK_USED_VAR(workspace); check_exec(data.layout, value.layout, desc); -#define cb(DType) \ - if (data.layout.dtype.enumv() == DTypeTrait::enumv) { \ - using ctype = typename DTypeTrait::ctype; \ - MEGDNN_DISPATCH_CPU_KERN_OPR(do_exec(data, value, desc)); \ - return; \ +#define cb(DType) \ + if (data.layout.dtype.enumv() == DTypeTrait::enumv) { \ + using ctype = typename DTypeTrait::ctype; \ + MEGDNN_DISPATCH_CPU_KERN_OPR( \ + exec_batched_set_mesh_indexing(data, value, desc)); \ + return; \ } MEGDNN_FOREACH_COMPUTING_DTYPE(cb) diff --git a/dnn/src/naive/mesh_indexing/opr_impl.h b/dnn/src/naive/mesh_indexing/opr_impl.h index 5f6037ba..fdc542b2 100644 --- a/dnn/src/naive/mesh_indexing/opr_impl.h +++ b/dnn/src/naive/mesh_indexing/opr_impl.h @@ -17,75 +17,48 @@ namespace megdnn { namespace naive { class MeshIndexingImpl : public MeshIndexing { - template - void exec_mesh_indexing( - const TensorND& src_tensor, const IndexDesc& desc, - const TensorND& dst_tensor); - public: using MeshIndexing::MeshIndexing; - void exec( _megdnn_tensor_in src, const IndexDesc& desc, _megdnn_tensor_out dst, _megdnn_workspace workspace) override; }; class IncrMeshIndexingImpl : public IncrMeshIndexing { - template - void do_exec(const TensorND& data, const TensorND& value, const IndexDesc& desc); - public: using IncrMeshIndexing::IncrMeshIndexing; - void exec( _megdnn_tensor_inout data, _megdnn_tensor_in value, const IndexDesc& desc, _megdnn_workspace workspace) override; }; -class SetMeshIndexingImpl : public SetMeshIndexing { - template - void do_exec(const TensorND& data, const TensorND& value, const IndexDesc& desc); - +class BatchedMeshIndexingImpl : public BatchedMeshIndexing { public: - using SetMeshIndexing::SetMeshIndexing; - + using BatchedMeshIndexing::BatchedMeshIndexing; void exec( - _megdnn_tensor_inout data, _megdnn_tensor_in value, const IndexDesc& desc, + _megdnn_tensor_in src, const IndexDesc& desc, _megdnn_tensor_out, _megdnn_workspace workspace) override; }; -class BatchedMeshIndexingImpl : public BatchedMeshIndexing { - template - void do_exec( - const TensorND& src_tensor, const IndexDesc& desc, - const TensorND& dst_tensor); - +class SetMeshIndexingImpl : public SetMeshIndexing { public: - using BatchedMeshIndexing::BatchedMeshIndexing; + using SetMeshIndexing::SetMeshIndexing; void exec( - _megdnn_tensor_in src, const IndexDesc& desc, _megdnn_tensor_out, + _megdnn_tensor_inout data, _megdnn_tensor_in value, const IndexDesc& desc, _megdnn_workspace workspace) override; }; class BatchedIncrMeshIndexingImpl : public BatchedIncrMeshIndexing { - template - void do_exec(const TensorND& data, const TensorND& value, const IndexDesc& desc); - public: using BatchedIncrMeshIndexing::BatchedIncrMeshIndexing; - void exec( _megdnn_tensor_inout data, _megdnn_tensor_in value, const IndexDesc& desc, _megdnn_workspace workspace) override; }; class BatchedSetMeshIndexingImpl : public BatchedSetMeshIndexing { - template - void do_exec(const TensorND& data, const TensorND& value, const IndexDesc& desc); - public: using BatchedSetMeshIndexing::BatchedSetMeshIndexing; - void exec( _megdnn_tensor_inout data, _megdnn_tensor_in value, const IndexDesc& desc, _megdnn_workspace workspace) override; diff --git a/dnn/src/naive/padding/opr_impl.cpp b/dnn/src/naive/padding/opr_impl.cpp index f6d946f9..1de57443 100644 --- a/dnn/src/naive/padding/opr_impl.cpp +++ b/dnn/src/naive/padding/opr_impl.cpp @@ -222,7 +222,7 @@ void PaddingBackwardImpl::exec(_megdnn_tensor_in src, 
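The mesh_indexing rewrite above turns each private template member `do_exec` into a free function template (`exec_mesh_indexing`, `exec_incr_mesh_indexing`, and so on), leaving the `opr_impl.h` classes with nothing but the `exec()` overrides. A sketch of the resulting shape for one operator, mirroring the hunks; the `std::copy`/`tensor_iter` spellings are reconstructed from the stripped template arguments.

```cpp
// Sketch: free kernel template plus a thin class that only dispatches.
template <typename ctype>
void exec_incr_mesh_indexing(
        const TensorND& data, const TensorND& value,
        const IncrMeshIndexing::IndexDesc& desc) {
    auto iter = tensor_iter<ctype>(value).begin();
    auto ptr = data.ptr<ctype>();
    for (size_t idx = 0; idx < value.layout.total_nr_elems(); ++idx) {
        int index[TensorShape::MAX_NDIM];
        std::copy(iter.idx(), iter.idx() + value.layout.ndim, index);
        ptr[get_index(data, value, desc, index)] += *iter;
        ++iter;
    }
}

class IncrMeshIndexingImpl : public IncrMeshIndexing {
public:
    using IncrMeshIndexing::IncrMeshIndexing;
    void exec(_megdnn_tensor_inout data, _megdnn_tensor_in value,
              const IndexDesc& desc, _megdnn_workspace workspace) override;
};
```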
_megdnn_tensor_out dst) { } size_t n = src.layout.total_nr_elems(); - memset(dst.raw_ptr, 0, dst.layout.access_bytes()); + memset(dst.raw_ptr(), 0, dst.layout.access_bytes()); switch (param().padding_mode) { case param::Padding::PaddingMode::CONSTANT: diff --git a/dnn/src/naive/param_pack/opr_impl.cpp b/dnn/src/naive/param_pack/opr_impl.cpp index 55adbe30..f2853438 100644 --- a/dnn/src/naive/param_pack/opr_impl.cpp +++ b/dnn/src/naive/param_pack/opr_impl.cpp @@ -17,10 +17,10 @@ using namespace megdnn; using namespace naive; template -void ParamPackConcatImpl::exec_internal( +void exec_internal( _megdnn_tensor_in srcs, int32_t* offsets, _megdnn_tensor_out dst, _megdnn_workspace) { - auto srcs_ptr = static_cast(srcs.raw_ptr); + auto srcs_ptr = static_cast(srcs.raw_ptr()); auto dst_ptr = dst.ptr(); int32_t last_pos = 0; @@ -41,14 +41,13 @@ void ParamPackConcatImpl::exec( _megdnn_tensor_in srcs, _megdnn_tensor_in offsets, _megdnn_tensor_out dst, _megdnn_workspace workspace) { check_exec(dst.layout, offsets.layout, srcs.layout); - auto offsets_ptr = offsets.ptr(); -#define cb(DType) \ - if (dst.layout.dtype == DType()) { \ - using ctype = typename DTypeTrait::ctype; \ - MEGDNN_DISPATCH_CPU_KERN_OPR( \ - exec_internal(srcs, offsets_ptr, dst, workspace)); \ - return; \ +#define cb(DType) \ + if (dst.layout.dtype == DType()) { \ + using ctype = typename DTypeTrait::ctype; \ + MEGDNN_DISPATCH_CPU_KERN_OPR( \ + exec_internal(srcs, offsets.ptr(), dst, workspace)); \ + return; \ } MEGDNN_FOREACH_COMPUTING_DTYPE(cb) megdnn_throw("bad type"); diff --git a/dnn/src/naive/param_pack/opr_impl.h b/dnn/src/naive/param_pack/opr_impl.h index 9e759bf9..d2c1ba8d 100644 --- a/dnn/src/naive/param_pack/opr_impl.h +++ b/dnn/src/naive/param_pack/opr_impl.h @@ -24,12 +24,6 @@ public: const TensorShapeArray&, const TensorShape&, const TensorShape&) override { return 0; } - -private: - template - void exec_internal( - _megdnn_tensor_in srcs, int32_t* offsets, _megdnn_tensor_out dst, - _megdnn_workspace workspace); }; } // namespace naive diff --git a/dnn/src/naive/pooling/opr_impl.cpp b/dnn/src/naive/pooling/opr_impl.cpp index d2c93846..c655893f 100644 --- a/dnn/src/naive/pooling/opr_impl.cpp +++ b/dnn/src/naive/pooling/opr_impl.cpp @@ -429,27 +429,21 @@ void PoolingForwardImpl::exec( auto wsb = get_workspace_bundle(workspace.raw_ptr, src.layout, dst.layout); if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS4) { float scale = src.layout.dtype.param().scale; - comp_src.layout.dtype = dtype::QuantizedS8(scale); - comp_src.layout.format = TensorLayout::Format(comp_src.layout.dtype); - comp_src.layout.init_contiguous_stride(); - comp_src.raw_ptr = wsb.get(0); - comp_dst.layout.dtype = dtype::QuantizedS8(scale); - comp_dst.layout.format = TensorLayout::Format(comp_dst.layout.dtype); - comp_dst.layout.init_contiguous_stride(); - comp_dst.raw_ptr = wsb.get(1); + TensorLayout src_layout{comp_src.layout, dtype::QuantizedS8(scale)}; + comp_src = TensorND{wsb.get(0), src_layout}; + TensorLayout dst_layout{comp_dst.layout, dtype::QuantizedS8(scale)}; + comp_dst = TensorND{wsb.get(1), dst_layout}; int4_to_int8(src, comp_src); } else if (src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) { float scale = src.layout.dtype.param().scale; uint8_t zero_point = src.layout.dtype.param().zero_point; - comp_src.layout.dtype = dtype::Quantized8Asymm(scale, zero_point); - comp_src.layout.format = TensorLayout::Format(comp_src.layout.dtype); - comp_src.layout.init_contiguous_stride(); - comp_src.raw_ptr = wsb.get(0); - 
comp_dst.layout.dtype = dtype::Quantized8Asymm(scale, zero_point); - comp_dst.layout.format = TensorLayout::Format(comp_dst.layout.dtype); - comp_dst.layout.init_contiguous_stride(); - comp_dst.raw_ptr = wsb.get(1); + TensorLayout src_layout{ + comp_src.layout, dtype::Quantized8Asymm(scale, zero_point)}; + comp_src = TensorND{wsb.get(0), src_layout}; + TensorLayout dst_layout{ + comp_dst.layout, dtype::Quantized8Asymm(scale, zero_point)}; + comp_dst = TensorND{wsb.get(1), dst_layout}; uint4_to_uint8(src, comp_src); } @@ -510,8 +504,9 @@ void PoolingForwardImpl::exec( MEGDNN_DISPATCH_CPU_KERN( \ static_cast(handle()), \ pooling_forward_impl( \ - sptr, dptr, comp_src.layout.dtype, N, C, IH, IW, OH, OW, PH, \ - PW, SH, SW, FH, FW)); \ + comp_src.ptr(), comp_dst.ptr(), \ + comp_src.layout.dtype, N, C, IH, IW, OH, OW, PH, PW, SH, SW, \ + FH, FW)); \ } \ MIDOUT_END(); @@ -553,20 +548,14 @@ void PoolingForwardImpl::exec( using ctype = typename DTypeTrait::ctype; \ switch (param().mode) { \ case Mode::MAX: { \ - auto sptr = comp_src.ptr(); \ - auto dptr = comp_dst.ptr(); \ DISPATCH_WITH_POOLER(MaxPooler); \ break; \ } \ case Mode::AVERAGE: { \ - auto sptr = comp_src.ptr(); \ - auto dptr = comp_dst.ptr(); \ DISPATCH_WITH_POOLER(MeanIncludePooler); \ break; \ } \ case Mode::AVERAGE_COUNT_EXCLUDE_PADDING: { \ - auto sptr = comp_src.ptr(); \ - auto dptr = comp_dst.ptr(); \ DISPATCH_WITH_POOLER(MeanExcludePooler); \ break; \ } \ @@ -709,12 +698,12 @@ void PoolingBackwardImpl::exec( size_t PH = param().pad_h, PW = param().pad_w; size_t FH = param().window_h, FW = param().window_w; size_t SH = param().stride_h, SW = param().stride_w; -#define DISPATCH_WITH_FUNC_AND_IDX_GETTER(Func, ctype, IdxGetter) \ - MEGDNN_DISPATCH_CPU_KERN( \ - static_cast(handle()), \ - Func( \ - sptr, dptr, diffptr, gradptr, N, C, IH, IW, OH, OW, PH, PW, SH, \ - SW, FH, FW)); +#define DISPATCH_WITH_FUNC_AND_IDX_GETTER(Func, ctype, IdxGetter) \ + MEGDNN_DISPATCH_CPU_KERN( \ + static_cast(handle()), \ + Func( \ + src.ptr(), dst.ptr(), diff.ptr(), \ + grad.ptr(), N, C, IH, IW, OH, OW, PH, PW, SH, SW, FH, FW)); #define DISPATCH_WITH_FUNC(Func, ctype) \ switch (param().format) { \ @@ -728,31 +717,25 @@ void PoolingBackwardImpl::exec( megdnn_throw("invalid pooling format"); \ } -#define cb(DType) \ - if (src.layout.dtype == DType()) { \ - using ctype = typename DTypeTrait::ctype; \ - switch (param().mode) { \ - case Mode::AVERAGE: { \ - auto sptr = src.ptr(), dptr = dst.ptr(), \ - diffptr = diff.ptr(), gradptr = grad.ptr(); \ - DISPATCH_WITH_FUNC(pooling_backward_avg_impl, ctype); \ - break; \ - } \ - case Mode::AVERAGE_COUNT_EXCLUDE_PADDING: { \ - auto sptr = src.ptr(), dptr = dst.ptr(), \ - diffptr = diff.ptr(), gradptr = grad.ptr(); \ - DISPATCH_WITH_FUNC(pooling_backward_avg_expd_impl, ctype); \ - break; \ - } \ - case Mode::MAX: { \ - auto sptr = src.ptr(), dptr = dst.ptr(), \ - diffptr = diff.ptr(), gradptr = grad.ptr(); \ - DISPATCH_WITH_FUNC(pooling_backward_max_impl, ctype); \ - break; \ - } \ - default: \ - megdnn_assert_internal(0); \ - } \ +#define cb(DType) \ + if (src.layout.dtype == DType()) { \ + using ctype = typename DTypeTrait::ctype; \ + switch (param().mode) { \ + case Mode::AVERAGE: { \ + DISPATCH_WITH_FUNC(pooling_backward_avg_impl, ctype); \ + break; \ + } \ + case Mode::AVERAGE_COUNT_EXCLUDE_PADDING: { \ + DISPATCH_WITH_FUNC(pooling_backward_avg_expd_impl, ctype); \ + break; \ + } \ + case Mode::MAX: { \ + DISPATCH_WITH_FUNC(pooling_backward_max_impl, ctype); \ + break; \ + } \ + default: \ + 
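[editor's note] The PoolingForwardImpl hunk above replaces field-by-field patching of comp_src/comp_dst (dtype, format, strides, raw_ptr) with constructing a complete TensorND from the workspace chunk and a relabeled layout. A rough standalone illustration of that construct-don't-patch style; Layout and TensorND here are simplified stand-ins, not the real megdnn types:

#include <cstddef>
#include <cstdio>
#include <string>
#include <utility>

// Simplified stand-ins; the real TensorLayout/TensorND also carry strides, format, ...
struct Layout {
    size_t nr_elems = 0;
    std::string dtype;  // e.g. "QuantizedS4" in, "QuantizedS8" for compute
    Layout() = default;
    Layout(const Layout& shape_like, std::string new_dtype)
            : nr_elems(shape_like.nr_elems), dtype(std::move(new_dtype)) {}
};

struct TensorND {
    void* ptr = nullptr;
    Layout layout;
    TensorND() = default;
    TensorND(void* p, Layout l) : ptr(p), layout(std::move(l)) {}
};

// One-shot construction: reuse the source shape, swap in the wider compute dtype,
// and point the tensor at the workspace chunk -- nothing is left half-updated.
TensorND make_comp_tensor(const TensorND& src, void* workspace_chunk) {
    Layout comp_layout{src.layout, "QuantizedS8"};
    return TensorND{workspace_chunk, comp_layout};
}

int main() {
    unsigned char workspace[64] = {};
    TensorND src;
    src.layout.nr_elems = 16;
    src.layout.dtype = "QuantizedS4";
    TensorND comp = make_comp_tensor(src, workspace);
    std::printf("%zu elems, compute dtype %s\n", comp.layout.nr_elems,
                comp.layout.dtype.c_str());
}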
megdnn_assert_internal(0); \ + } \ } MEGDNN_FOREACH_COMPUTING_DTYPE(cb) #undef cb diff --git a/dnn/src/naive/reduce/opr_impl.cpp b/dnn/src/naive/reduce/opr_impl.cpp index c1dd5fa7..da7b1dbb 100644 --- a/dnn/src/naive/reduce/opr_impl.cpp +++ b/dnn/src/naive/reduce/opr_impl.cpp @@ -192,13 +192,13 @@ void dispatch_dtype( megdnn::naive::HandleImpl* handle, const TensorND& src, const TensorND& dst, size_t A, size_t B, size_t C) { switch (src.layout.dtype.enumv()) { -#define cb(_dt) \ - case DTypeTrait<_dt>::enumv: { \ - using ctype = DTypeTrait<_dt>::ctype; \ - auto sptr = src.ptr(), dptr = dst.ptr(); \ - MEGDNN_DISPATCH_CPU_KERN( \ - handle, reduce_fwd(sptr, dptr, A, B, C)); \ - return; \ +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: { \ + using ctype = DTypeTrait<_dt>::ctype; \ + MEGDNN_DISPATCH_CPU_KERN( \ + handle, reduce_fwd( \ + src.ptr(), dst.ptr(), A, B, C)); \ + return; \ } MEGDNN_FOREACH_COMPUTING_DTYPE(cb) MEGDNN_FOREACH_QUANTIZED_DTYPE(cb) @@ -286,7 +286,7 @@ void ReduceForwardImpl::exec( auto typecvt = handle()->create_operator(); auto copy_to = [&typecvt](const TensorND& from, const TensorND& to) { - if (from.raw_ptr != to.raw_ptr) + if (from.raw_ptr() != to.raw_ptr()) typecvt->exec(from, to); }; diff --git a/dnn/src/naive/relayout_format/opr_impl.cpp b/dnn/src/naive/relayout_format/opr_impl.cpp index cf1c3dca..0f2f9100 100644 --- a/dnn/src/naive/relayout_format/opr_impl.cpp +++ b/dnn/src/naive/relayout_format/opr_impl.cpp @@ -35,9 +35,11 @@ void recursive_cp( dst_offset + i * dst.layout.stride[idx]); } } else { + auto src_ptr = src.ptr(); + auto dst_ptr = dst.ptr(); for (size_t i = 0; i < src.layout[idx]; ++i) { - ((ctype*)dst.raw_ptr)[dst_offset + i * dst.layout.stride[idx]] = - ((ctype*)src.raw_ptr)[src_offset + i * src.layout.stride[idx]]; + dst_ptr[dst_offset + i * dst.layout.stride[idx]] = + src_ptr[src_offset + i * src.layout.stride[idx]]; } } } @@ -61,10 +63,10 @@ void lowbit_recursive_cp( megdnn_assert(dst.layout.stride[idx] == 1); size_t dim_bytes = div_ceil(src.layout[idx], 8_z / size_nbits); // offset in elements - uint8_t* dptr = - reinterpret_cast(dst.raw_ptr) + (dst_offset * size_nbits / 8); - uint8_t* sptr = - reinterpret_cast(src.raw_ptr) + (src_offset * size_nbits / 8); + uint8_t* dptr = reinterpret_cast(dst.raw_ptr()) + + (dst_offset * size_nbits / 8); + uint8_t* sptr = reinterpret_cast(src.raw_ptr()) + + (src_offset * size_nbits / 8); for (size_t i = 0; i < dim_bytes; ++i) { *dptr = *sptr; dptr++; @@ -116,9 +118,9 @@ void extract_from_workspace( const size_t n_offset_dst = nid * n_stride_dst_in_bytes; const size_t n_offset_src = nid * n_stride_src_in_bytes; for (size_t gid = 0; gid < group; ++gid) { - memcpy((char*)dst.raw_ptr + n_offset_dst + + memcpy(reinterpret_cast(dst.raw_ptr()) + n_offset_dst + gid * ocpg * dst_c_stride_in_bytes, - (char*)src.raw_ptr + n_offset_src + + reinterpret_cast(src.raw_ptr()) + n_offset_src + gid * icpg * src_c_stride_in_bytes, ocpg * dst_c_stride_in_bytes); } @@ -475,13 +477,19 @@ void RelayoutFormatImpl::exec( src.layout.dtype.category() == DTypeCategory::QUANTIZED); check_exec(src.layout, dst.layout, workspace.size); HandleImpl* m_handle = static_cast(handle()); - TensorLayout exec_src, exec_dst, exec_workspace; - deduce_exec_layout(src.layout, dst.layout, exec_workspace, exec_src, exec_dst); - TensorND exec_src_nd{src.raw_ptr, exec_src}; - TensorND exec_dst_nd{dst.raw_ptr, exec_dst}; + TensorLayout exec_src_layout, exec_dst_layout, exec_workspace_layout; + deduce_exec_layout( + src.layout, dst.layout, 
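[editor's note] The reduce and pooling-backward hunks consistently move tensor.ptr<ctype>() calls from the enclosing scope into the statement given to MEGDNN_DISPATCH_CPU_KERN, so addresses are taken when the kernel actually runs rather than when it is queued. A small standalone sketch of why that matters under deferred dispatch; RefPtr, Tensor and the queue below are stand-ins for the idea, not megdnn's real types:

#include <cstddef>
#include <cstdio>
#include <functional>
#include <memory>
#include <vector>

// Stand-in for a re-bindable storage reference: a shared slot holding the
// current address, so every holder of the handle sees a later re-binding.
struct RefPtr {
    std::shared_ptr<void*> slot = std::make_shared<void*>(nullptr);
    void reset(void* p) const { *slot = p; }
    void* get() const { return *slot; }
};

struct Tensor {
    RefPtr ref;
    size_t size = 0;
    float* ptr() const { return static_cast<float*>(ref.get()); }
};

std::vector<std::function<void()>> queue;  // stand-in for the CPU dispatcher

void reduce_sum(const Tensor& src, const Tensor& dst) {
    // Capture the tensors (cheap handles), not raw pointers: ptr() is only
    // evaluated when the task executes, so storage bound later is still seen.
    queue.push_back([src, dst]() {
        float s = 0.f;
        for (size_t i = 0; i < src.size; ++i)
            s += src.ptr()[i];
        dst.ptr()[0] = s;
    });
}

int main() {
    float in[4] = {1, 2, 3, 4}, out[1] = {0};
    Tensor src, dst;
    src.size = 4;
    src.ref.reset(in);
    dst.size = 1;
    reduce_sum(src, dst);  // queued before the output storage is bound
    dst.ref.reset(out);    // binding happens after queuing ...
    for (auto& task : queue) task();  // ... and the task still writes to `out`
    std::printf("%g\n", out[0]);      // prints: 10
}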
exec_workspace_layout, exec_src_layout, + exec_dst_layout); + // clean dst MEGDNN_DISPATCH_CPU_KERN( - m_handle, memset(dst.raw_ptr, 0, dst.layout.span().dist_byte())); + m_handle, memset(dst.raw_ptr(), 0, dst.layout.span().dist_byte())); + + //! construct exec Tensor + TensorND exec_src_nd{exec_src_layout, src.get_ref_ptr()}; + TensorND exec_dst_nd{exec_dst_layout, dst.get_ref_ptr()}; + // pre if (param().mode == Param::Mode::NCHW_NHWCD4I) { size_t N = src.layout[0]; @@ -496,10 +504,10 @@ void RelayoutFormatImpl::exec( MIDOUT_BEGIN( \ megdnn_naive_relayout_format, ctype, \ midout_iv(Param::Mode::NCHW_NHWCD4I)) { \ - ctype* sptr = src.compatible_ptr(); \ - ctype* dptr = workspace.ptr(); \ - MEGDNN_DISPATCH_CPU_KERN(m_handle, padding_src_to_workspace( \ - dptr, sptr, N, IC, IH, IW);); \ + MEGDNN_DISPATCH_CPU_KERN( \ + m_handle, padding_src_to_workspace( \ + workspace.ptr(), \ + src.compatible_ptr(), N, IC, IH, IW);); \ } \ MIDOUT_END(); \ break; \ @@ -512,7 +520,7 @@ void RelayoutFormatImpl::exec( default: megdnn_assert(0); } - exec_src_nd.raw_ptr = workspace.raw_ptr; + exec_src_nd = TensorND{workspace.raw_ptr, exec_src_nd.layout}; } } else if (param().mode == Param::Mode::INTER_WEIGHT_DENSEI_DOT) { size_t OC = src.layout[0]; @@ -526,10 +534,10 @@ void RelayoutFormatImpl::exec( MIDOUT_BEGIN( \ megdnn_naive_relayout_format, ctype, \ midout_iv(Param::Mode::INTER_WEIGHT_DENSEI_DOT)) { \ - ctype* sptr = src.compatible_ptr(); \ - ctype* dptr = workspace.ptr(); \ - MEGDNN_DISPATCH_CPU_KERN(m_handle, padding_filter_to_workspace( \ - dptr, sptr, OC, IC, FH, FW);); \ + MEGDNN_DISPATCH_CPU_KERN( \ + m_handle, padding_filter_to_workspace( \ + workspace.ptr(), \ + src.compatible_ptr(), OC, IC, FH, FW);); \ } \ MIDOUT_END(); \ break; \ @@ -540,35 +548,38 @@ void RelayoutFormatImpl::exec( default: megdnn_assert(0); } - exec_src_nd.raw_ptr = workspace.raw_ptr; + exec_src_nd = TensorND{workspace.raw_ptr, exec_src_nd.layout}; } - } else if (param().mode == Param::Mode::NCHW_NCHW88) { + } #define cb(_idx, _pack_size, _mode) \ MIDOUT_BEGIN(megdnn_naive_relayout_format, midout_iv(Param::Mode::_mode)) { \ size_t val = src.layout[_idx]; \ if (val % _pack_size != 0) { \ - padding_to_workspace( \ - {workspace.raw_ptr, exec_src}, src, _idx, _pack_size, \ - exec_dst.dtype); \ - exec_src_nd.raw_ptr = workspace.raw_ptr; \ + exec_src_nd = TensorND{workspace.raw_ptr, exec_src_nd.layout}; \ + MEGDNN_DISPATCH_CPU_KERN( \ + m_handle, padding_to_workspace( \ + exec_src_nd, src, _idx, _pack_size, \ + exec_dst_layout.dtype)); \ } \ } \ MIDOUT_END(); - -#define cb2(_idx, _pack_size, _mode, _src_layout, _workspace_layout) \ - MIDOUT_BEGIN(megdnn_naive_relayout_format, midout_iv(Param::Mode::_mode)) { \ - size_t val = _src_layout[_idx]; \ - if (val % _pack_size != 0) { \ - memset(workspace.raw_ptr, 0, exec_src.span().dist_byte()); \ - padding_to_workspace( \ - {workspace.raw_ptr, _workspace_layout}, \ - {src.raw_ptr, _src_layout}); \ - exec_src_nd.raw_ptr = workspace.raw_ptr; \ - } \ - } \ +#define cb2(_idx, _pack_size, _mode, _src_layout, _workspace_layout) \ + MIDOUT_BEGIN(megdnn_naive_relayout_format, midout_iv(Param::Mode::_mode)) { \ + size_t val = _src_layout[_idx]; \ + if (val % _pack_size != 0) { \ + MEGDNN_DISPATCH_CPU_KERN(m_handle, \ + memset(workspace.raw_ptr, 0, \ + exec_src_layout.span().dist_byte());); \ + TensorND tmp_dst{workspace.raw_ptr, _workspace_layout}; \ + TensorND tmp_src{_src_layout, src.get_ref_ptr()}; \ + MEGDNN_DISPATCH_CPU_KERN( \ + m_handle, padding_to_workspace(tmp_dst, tmp_src)); \ + exec_src_nd 
= TensorND{workspace.raw_ptr, exec_src_nd.layout}; \ + } \ + } \ MIDOUT_END(); + else if (param().mode == Param::Mode::NCHW_NCHW88) { cb(1, 8, NCHW_NCHW88); - } else if (param().mode == Param::Mode::NCHW_NCHW88_CONV_DENSE_WEIGHT) { megdnn_assert(src.layout[0] % 8 == 0); cb(1, 8, NCHW_NCHW88_CONV_DENSE_WEIGHT); @@ -608,97 +619,84 @@ void RelayoutFormatImpl::exec( } MEGDNN_DISPATCH_CPU_KERN( m_handle, memset(workspace.raw_ptr, zp, - exec_workspace.span().dist_byte())); - TensorND ws_nd(workspace.raw_ptr, exec_workspace); + exec_workspace_layout.span().dist_byte())); + TensorND ws_nd(workspace.raw_ptr, exec_workspace_layout); MEGDNN_DISPATCH_CPU_KERN(m_handle, padding_to_workspace(ws_nd, src);); - exec_src_nd.raw_ptr = workspace.raw_ptr; + exec_src_nd = TensorND{workspace.raw_ptr, exec_src_nd.layout}; } } MIDOUT_END(); } else if (param().mode == Param::Mode::NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT) { cb(1, 4, NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT); - } else if (param().mode == Param::Mode::NCHW_NCHW4_WEIGHT) { #undef cb -#define cb(_idx0, _idx1, _pack_size, _mode) \ - MIDOUT_BEGIN(megdnn_naive_relayout_format, midout_iv(Param::Mode::_mode)) { \ - size_t val0 = src.layout[_idx0]; \ - size_t val1 = src.layout[_idx1]; \ - if (val0 % _pack_size != 0 || val1 % _pack_size != 0) { \ - memset(workspace.raw_ptr, 0, exec_src.span().dist_byte()); \ - padding_to_workspace({workspace.raw_ptr, exec_workspace}, src); \ - exec_src_nd.raw_ptr = workspace.raw_ptr; \ - } \ - } \ +#undef cb2 + } else if (param().mode == Param::Mode::NCHW_NCHW4_WEIGHT) { +#define cb(_idx0, _idx1, _pack_size, _mode) \ + MIDOUT_BEGIN(megdnn_naive_relayout_format, midout_iv(Param::Mode::_mode)) { \ + size_t val0 = src.layout[_idx0]; \ + size_t val1 = src.layout[_idx1]; \ + if (val0 % _pack_size != 0 || val1 % _pack_size != 0) { \ + MEGDNN_DISPATCH_CPU_KERN( \ + m_handle, \ + memset(workspace.raw_ptr, 0, exec_src_layout.span().dist_byte())); \ + TensorND ws_nd(workspace.raw_ptr, exec_workspace_layout); \ + MEGDNN_DISPATCH_CPU_KERN(m_handle, padding_to_workspace(ws_nd, src);); \ + exec_src_nd = TensorND{workspace.raw_ptr, exec_src_nd.layout}; \ + } \ + } \ MIDOUT_END(); if (src.layout.ndim == 4) { cb(0, 1, 4, NCHW_NCHW4_WEIGHT); } else if (src.layout.ndim == 5) { cb(1, 2, 4, NCHW_NCHW4_WEIGHT); } +#undef cb } else if (param().mode == Param::Mode::NCHW4_NCHW) { - if (exec_workspace.total_nr_elems() != dst.layout.total_nr_elems()) { - exec_dst_nd = {workspace.raw_ptr, exec_workspace}; + if (exec_workspace_layout.total_nr_elems() != dst.layout.total_nr_elems()) { + exec_dst_nd = TensorND{workspace.raw_ptr, exec_workspace_layout}; } } else if (param().mode == Param::Mode::NCHW64_NCHW) { - if (exec_workspace.total_nr_elems() != dst.layout.total_nr_elems()) { - exec_dst_nd = {workspace.raw_ptr, exec_workspace}; + if (exec_workspace_layout.total_nr_elems() != dst.layout.total_nr_elems()) { + exec_dst_nd = TensorND{workspace.raw_ptr, exec_workspace_layout}; } } // do relayout if (src.layout.dtype.enumv() == DTypeEnum::Quantized8Asymm && dst.layout.dtype.enumv() == DTypeEnum::QuantizedS8) { - TensorND src0 = exec_src_nd, dst0 = exec_dst_nd; - check_layout_and_canonize(src0.layout, src0.layout); - auto func = [](const TensorND& dst, const TensorND& src) { - do_copy_diff_qu8_q8(dst, src); - }; - MEGDNN_DISPATCH_CPU_KERN_OPR(func(dst0, src0)); + check_layout_and_canonize(exec_src_nd.layout, exec_dst_nd.layout); + MEGDNN_DISPATCH_CPU_KERN( + m_handle, do_copy_diff_qu8_q8(exec_dst_nd, exec_src_nd)); } else if ( src.layout.dtype.enumv() == 
DTypeEnum::Uint8 && dst.layout.dtype.enumv() == DTypeEnum::QuantizedS8) { - TensorND src0 = exec_src_nd, dst0 = exec_dst_nd; - check_layout_and_canonize(src0.layout, src0.layout); - auto func = [](const TensorND& dst, const TensorND& src) { - do_copy_diff_u8_q8(dst, src); - }; - MEGDNN_DISPATCH_CPU_KERN_OPR(func(dst0, src0)); + check_layout_and_canonize(exec_src_nd.layout, exec_dst_nd.layout); + MEGDNN_DISPATCH_CPU_KERN( + m_handle, do_copy_diff_u8_q8(exec_dst_nd, exec_src_nd)); } else if ( src.layout.dtype.enumv() == DTypeEnum::QuantizedS8 && dst.layout.dtype.enumv() == DTypeEnum::QuantizedS8) { - TensorND src0 = exec_src_nd, dst0 = exec_dst_nd; - check_layout_and_canonize(src0.layout, src0.layout); - auto func = [](const TensorND& dst, const TensorND& src) { - do_copy_diff_q8_q8(dst, src); - }; - MEGDNN_DISPATCH_CPU_KERN_OPR(func(dst0, src0)); + check_layout_and_canonize(exec_src_nd.layout, exec_dst_nd.layout); + MEGDNN_DISPATCH_CPU_KERN( + m_handle, do_copy_diff_q8_q8(exec_dst_nd, exec_src_nd)); } else if ( src.layout.dtype.enumv() == DTypeEnum::QuantizedS32 && dst.layout.dtype.enumv() == DTypeEnum::QuantizedS32) { - TensorND src0 = exec_src_nd, dst0 = exec_dst_nd; - check_layout_and_canonize(src0.layout, src0.layout); - auto func = [](const TensorND& dst, const TensorND& src) { - do_copy_diff_q32_q32(dst, src); - }; - MEGDNN_DISPATCH_CPU_KERN_OPR(func(dst0, src0)); + check_layout_and_canonize(exec_src_nd.layout, exec_dst_nd.layout); + MEGDNN_DISPATCH_CPU_KERN( + m_handle, do_copy_diff_q32_q32(exec_dst_nd, exec_src_nd)); } else if ( src.layout.dtype.enumv() == DTypeEnum::QuantizedS4 && dst.layout.dtype.enumv() == DTypeEnum::QuantizedS4) { - TensorND src0 = exec_src_nd, dst0 = exec_dst_nd; - check_layout_and_canonize(src0.layout, src0.layout); - auto func = [](const TensorND& dst, const TensorND& src) { - do_copy_diff_q4_q4(dst, src); - }; - MEGDNN_DISPATCH_CPU_KERN_OPR(func(dst0, src0)); + check_layout_and_canonize(exec_src_nd.layout, exec_dst_nd.layout); + MEGDNN_DISPATCH_CPU_KERN( + m_handle, do_copy_diff_q4_q4(exec_dst_nd, exec_src_nd)); } else if ( src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm && dst.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) { - TensorND src0 = exec_src_nd, dst0 = exec_dst_nd; - check_layout_and_canonize(src0.layout, src0.layout); - auto func = [](const TensorND& dst, const TensorND& src) { - do_copy_diff_qu4_qu4(dst, src); - }; - MEGDNN_DISPATCH_CPU_KERN_OPR(func(dst0, src0)); + check_layout_and_canonize(exec_src_nd.layout, exec_dst_nd.layout); + MEGDNN_DISPATCH_CPU_KERN( + m_handle, do_copy_diff_qu4_qu4(exec_dst_nd, exec_src_nd)); } else { m_handle->relayout_opr()->exec(exec_src_nd, exec_dst_nd, handle()); } @@ -706,14 +704,13 @@ void RelayoutFormatImpl::exec( // post if (param().mode == Param::Mode::NCHW4_NCHW || param().mode == Param::Mode::NCHW64_NCHW) { - if (exec_workspace.total_nr_elems() != dst.layout.total_nr_elems()) { - megdnn_assert(exec_workspace.dtype == dst.layout.dtype); - TensorND ws_nd{workspace.raw_ptr, exec_workspace}; + if (exec_workspace_layout.total_nr_elems() != dst.layout.total_nr_elems()) { + megdnn_assert(exec_workspace_layout.dtype == dst.layout.dtype); + TensorND ws_nd{workspace.raw_ptr, exec_workspace_layout}; MEGDNN_DISPATCH_CPU_KERN( m_handle, extract_from_workspace(dst, ws_nd, param().group);); } } -#undef cb } // vim: syntax=cpp.doxygen diff --git a/dnn/src/naive/repeat/repeat.cpp b/dnn/src/naive/repeat/opr_impl.cpp similarity index 98% rename from dnn/src/naive/repeat/repeat.cpp rename to 
dnn/src/naive/repeat/opr_impl.cpp index f86cc5aa..ba928735 100644 --- a/dnn/src/naive/repeat/repeat.cpp +++ b/dnn/src/naive/repeat/opr_impl.cpp @@ -1,5 +1,5 @@ /** - * \file dnn/src/naive/repeat/repeat.cpp + * \file dnn/src/naive/repeat/opr_impl.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. diff --git a/dnn/src/naive/resize/opr_impl.cpp b/dnn/src/naive/resize/opr_impl.cpp index 6bbcb80e..7530a026 100644 --- a/dnn/src/naive/resize/opr_impl.cpp +++ b/dnn/src/naive/resize/opr_impl.cpp @@ -82,8 +82,8 @@ ResizeImpl::KernParam ResizeImpl::KernParam::from_tensors( src.layout.dtype.enumv() == DTypeEnum::Uint8 || src.layout.dtype.enumv() == DTypeEnum::QuantizedS8 || src.layout.dtype.enumv() == DTypeEnum::Quantized8Asymm) { - ret.sptr = src.compatible_ptr(); - ret.dptr = dst.compatible_ptr(); + ret.sptr = src.get_ref_ptr(); + ret.dptr = dst.get_ref_ptr(); } else { megdnn_assert( 0, "current do not support dtype %s in resize", diff --git a/dnn/src/naive/resize/opr_impl.h b/dnn/src/naive/resize/opr_impl.h index fa7554dd..85eaf4b5 100644 --- a/dnn/src/naive/resize/opr_impl.h +++ b/dnn/src/naive/resize/opr_impl.h @@ -27,12 +27,16 @@ public: InterpolationMode imode; size_t n, c, ih, iw, oh, ow; ptrdiff_t s_in, s_ic, s_ih, s_iw; - ctype *sptr, *dptr; + RefPtr sptr, dptr; Workspace workspace; static KernParam from_tensors( Format format, InterpolationMode imode, _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace); + + const ctype* src() const { return static_cast(sptr.get_ptr()); } + + ctype* dst() const { return static_cast(dptr.get_ptr()); } }; using Resize::Resize; @@ -66,8 +70,8 @@ private: #define UNPACK_RESIZE_FWD_KERN_PARAM(p) \ auto N = p.n, C = p.c, IH = p.ih, IW = p.iw, OH = p.oh, OW = p.ow; \ - ctype* __restrict sptr = p.sptr; \ - ctype* __restrict dptr = p.dptr; + ctype* __restrict sptr = static_cast(p.sptr.get_ptr()); \ + ctype* __restrict dptr = static_cast(p.dptr.get_ptr()); #define UNPACK_RESIZE_FWD_KERN_PARAM_WITH_STRIDE(p) \ UNPACK_RESIZE_FWD_KERN_PARAM(p) \ diff --git a/dnn/src/naive/rng/opr_impl.cpp b/dnn/src/naive/rng/opr_impl.cpp index 781300d0..04df7b50 100644 --- a/dnn/src/naive/rng/opr_impl.cpp +++ b/dnn/src/naive/rng/opr_impl.cpp @@ -291,11 +291,11 @@ void UniformRNGImpl::exec(_megdnn_tensor_inout dst, _megdnn_workspace workspace) auto size = dst.layout.total_nr_elems(); auto prng = &m_rng.ensure_seed(m_param.seed); switch (dst.layout.dtype.enumv()) { -#define cb(_dt) \ - case DTypeTrait<_dt>::enumv: { \ - auto ptr = dst.ptr::ctype>(); \ - MEGDNN_DISPATCH_CPU_KERN_OPR({ fill_uniform(prng, ptr, size); }); \ - return; \ +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: { \ + using ctype = DTypeTrait<_dt>::ctype; \ + MEGDNN_DISPATCH_CPU_KERN_OPR({ fill_uniform(prng, dst.ptr(), size); }); \ + return; \ } MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) #undef cb @@ -309,14 +309,13 @@ void GaussianRNGImpl::exec(_megdnn_tensor_inout dst, _megdnn_workspace workspace auto size = dst.layout.total_nr_elems(); auto prng = &m_rng.ensure_seed(m_param.seed); switch (dst.layout.dtype.enumv()) { -#define cb(_dt) \ - case DTypeTrait<_dt>::enumv: { \ - using ctype = DTypeTrait<_dt>::ctype; \ - ctype mean(m_param.mean), std(m_param.std); \ - auto ptr = dst.ptr(); \ - MEGDNN_DISPATCH_CPU_KERN_OPR( \ - { fill_gaussian(prng, ptr, size, mean, std); }); \ - return; \ +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: { \ + using ctype = DTypeTrait<_dt>::ctype; \ + ctype mean(m_param.mean), 
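[editor's note] The resize KernParam change above swaps the typed sptr/dptr members for RefPtr handles plus small typed accessors, and the UNPACK macro casts at use time. A sketch of that parameter-struct shape, written against a hypothetical RefPtr stand-in (not the real megdnn class; the real UNPACK macro also adds __restrict):

#include <cstddef>
#include <cstdio>
#include <memory>

// Reduced stand-in for megdnn's RefPtr: a shared, re-bindable address slot.
struct RefPtr {
    std::shared_ptr<void*> slot = std::make_shared<void*>(nullptr);
    RefPtr() = default;
    RefPtr(void* p) : slot(std::make_shared<void*>(p)) {}
    void* get_ptr() const { return *slot; }
};

// Kernel parameter block: stores untyped handles, exposes typed accessors so
// the kernels keep reading sptr/dptr-style names but resolve them lazily.
template <typename ctype>
struct ResizeKernParam {
    size_t n = 0, c = 0, ih = 0, iw = 0, oh = 0, ow = 0;
    RefPtr sptr, dptr;
    const ctype* src() const { return static_cast<const ctype*>(sptr.get_ptr()); }
    ctype* dst() const { return static_cast<ctype*>(dptr.get_ptr()); }
};

// A kernel unpacks the typed pointers once, at execution time.
template <typename ctype>
void nearest_copy_kernel(const ResizeKernParam<ctype>& p) {
    const ctype* sptr = p.src();
    ctype* dptr = p.dst();
    for (size_t i = 0; i < p.n * p.c * p.oh * p.ow; ++i)
        dptr[i] = sptr[i];  // degenerate 1:1 "resize", just to exercise the accessors
}

int main() {
    float in[4] = {1, 2, 3, 4}, out[4] = {};
    ResizeKernParam<float> p;
    p.n = 1; p.c = 1; p.ih = 2; p.iw = 2; p.oh = 2; p.ow = 2;
    p.sptr = RefPtr{in};
    p.dptr = RefPtr{out};
    nearest_copy_kernel(p);
    std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
}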
std(m_param.std); \ + MEGDNN_DISPATCH_CPU_KERN_OPR( \ + { fill_gaussian(prng, dst.ptr(), size, mean, std); }); \ + return; \ } MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) #undef cb @@ -332,15 +331,15 @@ void GammaRNGImpl::exec( auto size = dst.layout.total_nr_elems(); auto prng = &m_rng.ensure_seed(m_param.seed); switch (dst.layout.dtype.enumv()) { -#define cb(_dt) \ - case DTypeTrait<_dt>::enumv: { \ - using ctype = DTypeTrait<_dt>::ctype; \ - auto ptr = dst.ptr(); \ - MEGDNN_DISPATCH_CPU_KERN_OPR({ \ - fill_gamma( \ - prng, ptr, size, shape.ptr(), scale.ptr()); \ - };); \ - return; \ +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: { \ + using ctype = DTypeTrait<_dt>::ctype; \ + MEGDNN_DISPATCH_CPU_KERN_OPR({ \ + fill_gamma( \ + prng, dst.ptr(), size, shape.ptr(), \ + scale.ptr()); \ + };); \ + return; \ } MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) #undef cb @@ -355,14 +354,13 @@ void PoissonRNGImpl::exec( auto size = dst.layout.total_nr_elems(); auto prng = &m_rng.ensure_seed(m_param.seed); switch (dst.layout.dtype.enumv()) { -#define cb(_dt) \ - case DTypeTrait<_dt>::enumv: { \ - using ctype = DTypeTrait<_dt>::ctype; \ - auto dst_ptr = dst.ptr(); \ - auto lam_ptr = lam.ptr(); \ - MEGDNN_DISPATCH_CPU_KERN_OPR( \ - { fill_poisson(prng, dst_ptr, lam_ptr, size); };); \ - return; \ +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: { \ + using ctype = DTypeTrait<_dt>::ctype; \ + MEGDNN_DISPATCH_CPU_KERN_OPR({ \ + fill_poisson(prng, dst.ptr(), lam.ptr(), size); \ + };); \ + return; \ } MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) #undef cb @@ -378,15 +376,15 @@ void BetaRNGImpl::exec( auto size = dst.layout.total_nr_elems(); auto prng = &m_rng.ensure_seed(m_param.seed); switch (dst.layout.dtype.enumv()) { -#define cb(_dt) \ - case DTypeTrait<_dt>::enumv: { \ - using ctype = DTypeTrait<_dt>::ctype; \ - auto dst_ptr = dst.ptr(); \ - MEGDNN_DISPATCH_CPU_KERN_OPR({ \ - fill_beta( \ - prng, dst_ptr, alpha.ptr(), beta.ptr(), size); \ - };); \ - return; \ +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: { \ + using ctype = DTypeTrait<_dt>::ctype; \ + MEGDNN_DISPATCH_CPU_KERN_OPR({ \ + fill_beta( \ + prng, dst.ptr(), alpha.ptr(), beta.ptr(), \ + size); \ + };); \ + return; \ } MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) #undef cb @@ -400,14 +398,14 @@ void PermutationRNGImpl::exec(_megdnn_tensor_inout dst, _megdnn_workspace worksp auto size = dst.layout.total_nr_elems(); auto prng = &m_rng.ensure_seed(m_param.seed); switch (dst.layout.dtype.enumv()) { -#define cb(_dt) \ - case DTypeTrait<_dt>::enumv: { \ - using ctype = DTypeTrait<_dt>::ctype; \ - ctype max_size = DTypeTrait<_dt>::max() - 1; \ - megdnn_assert((ctype(size) < max_size)); \ - auto ptr = dst.ptr(); \ - MEGDNN_DISPATCH_CPU_KERN_OPR({ fill_permutation(prng, ptr, size); };); \ - return; \ +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: { \ + using ctype = DTypeTrait<_dt>::ctype; \ + ctype max_size = DTypeTrait<_dt>::max() - 1; \ + megdnn_assert((ctype(size) < max_size)); \ + MEGDNN_DISPATCH_CPU_KERN_OPR( \ + { fill_permutation(prng, dst.ptr(), size); };); \ + return; \ } cb(::megdnn::dtype::Float32) cb(::megdnn::dtype::Int32) cb(::megdnn::dtype::Int16) @@ -421,9 +419,9 @@ void ShuffleRNGForwardImpl::exec( _megdnn_workspace workspace) { check_exec(src.layout, dst.layout, indices.layout, workspace.size); const auto len = indices.layout[0]; - auto iptr = indices.ptr(); auto prng = &m_rng.ensure_seed(m_param.seed); - fill_permutation(prng, iptr, len); + MEGDNN_DISPATCH_CPU_KERN_OPR( + fill_permutation(prng, indices.ptr(), len)); auto step = 0; for 
(size_t i = 1; i < src.layout.ndim; ++i) { step += src.layout[i]; @@ -431,12 +429,12 @@ void ShuffleRNGForwardImpl::exec( if (step <= 0) step = 1; -#define cb(DType) \ - if (src.layout.dtype == DType()) { \ - using T = typename DTypeTrait::ctype; \ - MEGDNN_DISPATCH_CPU_KERN_OPR( \ - shuffle_fwd(src.ptr(), dst.ptr(), iptr, len, step)); \ - return; \ +#define cb(DType) \ + if (src.layout.dtype == DType()) { \ + using T = typename DTypeTrait::ctype; \ + MEGDNN_DISPATCH_CPU_KERN_OPR(shuffle_fwd( \ + src.ptr(), dst.ptr(), indices.ptr(), len, step)); \ + return; \ } MEGDNN_FOREACH_COMPUTING_DTYPE(cb) #undef cb @@ -447,19 +445,18 @@ void ShuffleRNGBackwardImpl::exec( _megdnn_workspace workspace) { check_exec(diff.layout, indices.layout, grad.layout, workspace.size); const auto len = indices.layout[0]; - auto iptr = indices.ptr(); auto step = 0; for (size_t i = 1; i < diff.layout.ndim; ++i) { step += diff.layout[i]; } if (step <= 0) step = 1; -#define cb(DType) \ - if (diff.layout.dtype == DType()) { \ - using T = typename DTypeTrait::ctype; \ - MEGDNN_DISPATCH_CPU_KERN_OPR( \ - shuffle_bwd(grad.ptr(), diff.ptr(), iptr, len, step)); \ - return; \ +#define cb(DType) \ + if (diff.layout.dtype == DType()) { \ + using T = typename DTypeTrait::ctype; \ + MEGDNN_DISPATCH_CPU_KERN_OPR(shuffle_bwd( \ + grad.ptr(), diff.ptr(), indices.ptr(), len, step)); \ + return; \ } MEGDNN_FOREACH_COMPUTING_DTYPE(cb) #undef cb diff --git a/dnn/src/naive/rotate/opr_impl.cpp b/dnn/src/naive/rotate/opr_impl.cpp index a87496f4..ea1967ed 100644 --- a/dnn/src/naive/rotate/opr_impl.cpp +++ b/dnn/src/naive/rotate/opr_impl.cpp @@ -20,12 +20,10 @@ namespace megdnn { namespace naive { template -void RotateImpl::exec_internal(_megdnn_tensor_in src, _megdnn_tensor_out dst) { +void exec_internal(_megdnn_tensor_in src, _megdnn_tensor_out dst, bool clockwise) { auto N = src.layout.shape[0], IH = src.layout.shape[1], IW = src.layout.shape[2], IC = src.layout.shape[3]; - bool clockwise = param().clockwise; - rep(n, N) rep(ih, IH) rep(iw, IW) { int ow = clockwise ? IH - ih - 1 : ih; int oh = clockwise ? 
iw : IW - iw - 1; @@ -44,11 +42,12 @@ void RotateImpl::exec_internal(_megdnn_tensor_in src, _megdnn_tensor_out dst) { void RotateImpl::exec( _megdnn_tensor_in src, _megdnn_tensor_in dst, _megdnn_workspace workspace) { check_exec(src.layout, dst.layout, workspace.size); -#define cb(DType) \ - if (src.layout.dtype.enumv() == DTypeTrait::enumv) { \ - using ctype = typename DTypeTrait::ctype; \ - MEGDNN_DISPATCH_CPU_KERN_OPR(exec_internal(src, dst)); \ - return; \ +#define cb(DType) \ + if (src.layout.dtype.enumv() == DTypeTrait::enumv) { \ + using ctype = typename DTypeTrait::ctype; \ + MEGDNN_DISPATCH_CPU_KERN_OPR( \ + exec_internal(src, dst, param().clockwise)); \ + return; \ } MEGDNN_FOREACH_COMPUTING_DTYPE(cb) MEGDNN_FOREACH_QUANTIZED_DTYPE(cb) diff --git a/dnn/src/naive/rotate/opr_impl.h b/dnn/src/naive/rotate/opr_impl.h index a16fbe78..14d07296 100644 --- a/dnn/src/naive/rotate/opr_impl.h +++ b/dnn/src/naive/rotate/opr_impl.h @@ -25,10 +25,6 @@ public: size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&) override { return 0; } - -private: - template - void exec_internal(_megdnn_tensor_in src, _megdnn_tensor_out dst); }; } // namespace naive diff --git a/dnn/src/naive/separable_conv/opr_impl.cpp b/dnn/src/naive/separable_conv/opr_impl.cpp index da9f21b0..8c10e514 100644 --- a/dnn/src/naive/separable_conv/opr_impl.cpp +++ b/dnn/src/naive/separable_conv/opr_impl.cpp @@ -68,44 +68,57 @@ namespace megdnn { namespace naive { // using namespace sep_conv; +size_t SeparableConvForwardImpl::get_workspace_in_bytes( + const TensorLayout& src, const TensorLayout& filter_x, + const TensorLayout& filter_y, const TensorLayout& dst) { + MEGDNN_MARK_USED_VAR(src); + MEGDNN_MARK_USED_VAR(filter_y); + MEGDNN_MARK_USED_VAR(dst); + auto kw = filter_x.shape[3]; + auto kh = kw; + auto ic = filter_x.shape[1]; + auto oc = filter_x.shape[0]; + TensorLayout layout({oc, ic, kh, kw}, dtype::Float32()); + return oc * ic * kh * kw * sizeof(float); +} + void SeparableConvForwardImpl::exec( _megdnn_tensor_in src, _megdnn_tensor_in filter_x, _megdnn_tensor_in filter_y, _megdnn_tensor_in dst, _megdnn_workspace workspace) { check_exec( src.layout, filter_x.layout, filter_y.layout, dst.layout, workspace.size); - // Create kernel tensor - int kw = filter_x.layout.shape[3]; - int kh = kw; - int ic = filter_x.layout.shape[1]; - int oc = filter_x.layout.shape[0]; - - TensorLayout kerLayout( - {(size_t)oc, (size_t)ic, (size_t)kh, (size_t)kw}, dtype::Float32()); - void* filter2d_buf = malloc(oc * ic * kh * kw * sizeof(float)); - TensorND filter2d(filter2d_buf, kerLayout); - float* kerx = (float*)filter_x.raw_ptr; - float* kery = (float*)filter_y.raw_ptr; - float* ker2d = (float*)filter2d_buf; + auto kw = filter_x.layout.shape[3]; + auto kh = kw; + auto ic = filter_x.layout.shape[1]; + auto oc = filter_x.layout.shape[0]; - // Generate 2D-filter - int k_pos = 0; - for (int cn = 0; cn < ic * oc; ++cn) { - for (int h = 0; h < kh; ++h) { - for (int w = 0; w < kw; ++w) { - ker2d[k_pos++] = kerx[w] * kery[h]; + auto transform_filter_2d = [=]() { + auto kerx = static_cast(filter_x.raw_ptr()); + auto kery = static_cast(filter_y.raw_ptr()); + auto filter2d_ptr = workspace.ptr(); + // Generate 2D-filter + size_t k_pos = 0; + for (size_t cn = 0; cn < ic * oc; ++cn) { + for (size_t h = 0; h < kh; ++h) { + for (size_t w = 0; w < kw; ++w) { + filter2d_ptr[k_pos++] = kerx[w] * kery[h]; + } } + kerx += kw; + kery += kw; } - kerx += kw; - kery += kw; - } + }; - ConvolutionForwardImpl* convOptr = new 
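[editor's note] In the rotate hunk, exec_internal stops being a member and stops calling param().clockwise inside the kernel; the flag is read once on the dispatching thread and passed by value. A standalone sketch of that shape with a toy deferred queue (the macro and types are stand-ins, not megdnn API):

#include <cstddef>
#include <cstdio>
#include <functional>
#include <vector>

// Stand-in for the handle's CPU dispatch queue and its dispatch macro.
std::vector<std::function<void()>> g_queue;
#define DISPATCH_CPU_KERN_OPR(stmt) g_queue.push_back([=]() { stmt; })

// Free-function kernel: the `clockwise` flag arrives as an argument instead of
// being re-read from the operator's param() when the task eventually runs.
void rotate_1d(const int* src, int* dst, size_t n, bool clockwise) {
    for (size_t i = 0; i < n; ++i)
        dst[i] = clockwise ? src[n - 1 - i] : src[i];
}

struct RotateOpr {
    bool clockwise = true;  // stand-in for param().clockwise
    void exec(const int* src, int* dst, size_t n) {
        bool cw = clockwise;  // snapshot the parameter on the caller's thread
        DISPATCH_CPU_KERN_OPR(rotate_1d(src, dst, n, cw));
    }
};

int main() {
    int src[4] = {1, 2, 3, 4}, dst[4] = {};
    RotateOpr opr;
    opr.exec(src, dst, 4);
    opr.clockwise = false;  // later changes no longer affect the queued work
    for (auto& task : g_queue) task();
    std::printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]);  // prints: 4 3 2 1
}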
ConvolutionForwardImpl(this->handle()); - Workspace empty_wsp; - convOptr->exec(src, filter2d, dst, nullptr, empty_wsp); - delete (convOptr); + MEGDNN_DISPATCH_CPU_KERN_OPR(transform_filter_2d()); + + //! construct filter 2D tensor + TensorLayout layout({oc, ic, kh, kw}, dtype::Float32()); + TensorND filter2d(workspace.raw_ptr, layout); - free(filter2d_buf); + auto conv_opr = handle()->create_operator(); + Workspace empty_wsp; + conv_opr->exec(src, filter2d, dst, nullptr, empty_wsp); } } // namespace naive diff --git a/dnn/src/naive/separable_conv/opr_impl.h b/dnn/src/naive/separable_conv/opr_impl.h index 91386019..5f395241 100644 --- a/dnn/src/naive/separable_conv/opr_impl.h +++ b/dnn/src/naive/separable_conv/opr_impl.h @@ -25,10 +25,7 @@ public: size_t get_workspace_in_bytes( const TensorLayout&, const TensorLayout&, const TensorLayout&, - const TensorLayout&) override { - // TODO: deduce the size of ring buffer. - return 0; - } + const TensorLayout&) override; }; } // namespace naive diff --git a/dnn/src/naive/separable_filter/opr_impl.cpp b/dnn/src/naive/separable_filter/opr_impl.cpp index 76bc2ceb..f15652df 100644 --- a/dnn/src/naive/separable_filter/opr_impl.cpp +++ b/dnn/src/naive/separable_filter/opr_impl.cpp @@ -116,9 +116,11 @@ struct remap_func_holder { using namespace megcv; Mat kx_( - 1, filter_x.layout.shape[3], 1, static_cast(filter_x.raw_ptr)); + 1, filter_x.layout.shape[3], 1, + static_cast(filter_x.raw_ptr())); Mat ky_( - 1, filter_y.layout.shape[3], 1, static_cast(filter_y.raw_ptr)); + 1, filter_y.layout.shape[3], 1, + static_cast(filter_y.raw_ptr())); uint32_t kernel_height = ky_.width(); uint32_t kernel_width = kx_.width(); @@ -160,10 +162,10 @@ struct remap_func_holder { }; template -void SeparableFilterForwardImpl::exec_internal( +void exec_internal( _megdnn_tensor_in src, _megdnn_tensor_in kx, _megdnn_tensor_in ky, - _megdnn_tensor_out dst) { - switch (param().borderMode) { + _megdnn_tensor_out dst, const param::WarpPerspective::BorderMode& mode) { + switch (mode) { #define cb(bmode) \ case param::WarpPerspective::BorderMode::bmode: \ return remap_func_holder:: \ @@ -187,12 +189,12 @@ void SeparableFilterForwardImpl::exec( check_exec( src.layout, filter_x.layout, filter_y.layout, dst.layout, workspace.size); -#define cb(dt) \ - if (src.layout.dtype == dt()) { \ - using ctype = typename DTypeTrait
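[editor's note] The separable-conv rewrite above drops the malloc'd temporary and the manually new'd ConvolutionForwardImpl: the 2-D kernel is now the outer product of the two 1-D filters written into the operator workspace, and the convolution runs through handle()->create_operator. A self-contained sketch of just the filter expansion (names and the square-kernel assumption kh == kw mirror the hunk; the layout details are illustrative):

#include <cstddef>
#include <cstdio>
#include <vector>

// Expand separable 1-D filters into the dense (oc, ic, k, k) kernel:
// filter2d[cn][h][w] = kx[cn][w] * ky[cn][h], one (k x k) block per (oc, ic) pair.
void expand_separable_filter(
        const float* kx, const float* ky, float* filter2d, size_t oc, size_t ic,
        size_t k) {
    size_t k_pos = 0;
    for (size_t cn = 0; cn < ic * oc; ++cn) {
        for (size_t h = 0; h < k; ++h)
            for (size_t w = 0; w < k; ++w)
                filter2d[k_pos++] = kx[w] * ky[h];
        kx += k;  // advance to the next channel pair's 1-D filters
        ky += k;
    }
}

int main() {
    // One output channel, one input channel, 3x3 kernel from two 1-D taps.
    const float kx[3] = {1.f, 2.f, 1.f};   // horizontal filter
    const float ky[3] = {1.f, 0.f, -1.f};  // vertical filter
    std::vector<float> filter2d(9);        // plays the role of the workspace buffer
    expand_separable_filter(kx, ky, filter2d.data(), 1, 1, 3);
    for (size_t h = 0; h < 3; ++h)
        std::printf("%4g %4g %4g\n", filter2d[h * 3], filter2d[h * 3 + 1],
                    filter2d[h * 3 + 2]);
}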
::ctype; \ - MEGDNN_DISPATCH_CPU_KERN_OPR( \ - exec_internal(src, filter_x, filter_y, dst)); \ - return; \ +#define cb(dt) \ + if (src.layout.dtype == dt()) { \ + using ctype = typename DTypeTrait
::ctype; \ + MEGDNN_DISPATCH_CPU_KERN_OPR(exec_internal( \ + src, filter_x, filter_y, dst, param().borderMode)); \ + return; \ } cb(dtype::Uint8); cb(dtype::Float32); diff --git a/dnn/src/naive/separable_filter/opr_impl.h b/dnn/src/naive/separable_filter/opr_impl.h index 4de0f0a8..72132b45 100644 --- a/dnn/src/naive/separable_filter/opr_impl.h +++ b/dnn/src/naive/separable_filter/opr_impl.h @@ -26,12 +26,6 @@ public: const TensorLayout&) override { return 0; } - -private: - template - void exec_internal( - _megdnn_tensor_in src, _megdnn_tensor_in filter_x, - _megdnn_tensor_in filter_y, _megdnn_tensor_out dst); }; } // namespace naive diff --git a/dnn/src/naive/split/split.cpp b/dnn/src/naive/split/opr_impl.cpp similarity index 98% rename from dnn/src/naive/split/split.cpp rename to dnn/src/naive/split/opr_impl.cpp index 9c069d07..cdb421d5 100644 --- a/dnn/src/naive/split/split.cpp +++ b/dnn/src/naive/split/opr_impl.cpp @@ -1,5 +1,5 @@ /** - * \file dnn/src/naive/split/split.cpp + * \file dnn/src/naive/split/opr_impl.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. diff --git a/dnn/src/naive/svd/opr_impl.cpp b/dnn/src/naive/svd/opr_impl.cpp index 24241edd..a0c6c717 100644 --- a/dnn/src/naive/svd/opr_impl.cpp +++ b/dnn/src/naive/svd/opr_impl.cpp @@ -293,13 +293,10 @@ WorkspaceBundle SVDForwardImpl::get_workspace_bundle( } template -void SVDForwardImpl::exec_internal( +void exec_internal( _megdnn_tensor_in src, _megdnn_tensor_out u, _megdnn_tensor_out s, - _megdnn_tensor_out vt, _megdnn_workspace workspace, Param p) { - size_t block_cnt, m, n; - canonize_params(src.layout, &block_cnt, &m, &n); - - auto wbundle = get_workspace_bundle(m, n, sizeof(T), workspace.raw_ptr); + _megdnn_tensor_out vt, SVD::Param p, WorkspaceBundle wbundle, size_t block_cnt, + size_t m, size_t n) { const size_t max_mn = std::max(m, n); const size_t min_mn = std::min(m, n); const size_t src_block_size = src.layout.dtype.size(m * n); @@ -388,9 +385,13 @@ void SVDForwardImpl::exec( !p.compute_uv || !p.full_matrices, "Computing full singular vectors is not supported in naive " "implementation."); + size_t block_cnt, m, n; + canonize_params(src.layout, &block_cnt, &m, &n); if (src.layout.dtype == dtype::Float32()) { using ctype = typename DTypeTrait::ctype; - MEGDNN_DISPATCH_CPU_KERN_OPR(exec_internal(src, u, s, vt, workspace, p)); + auto wbundle = get_workspace_bundle(m, n, sizeof(ctype), workspace.raw_ptr); + MEGDNN_DISPATCH_CPU_KERN_OPR( + exec_internal(src, u, s, vt, p, wbundle, block_cnt, m, n)); return; } megdnn_assert_internal(0); diff --git a/dnn/src/naive/svd/opr_impl.h b/dnn/src/naive/svd/opr_impl.h index 7da48d96..3ae18757 100644 --- a/dnn/src/naive/svd/opr_impl.h +++ b/dnn/src/naive/svd/opr_impl.h @@ -27,10 +27,6 @@ public: _megdnn_tensor_out vt, _megdnn_workspace workspace) override; private: - template - void exec_internal( - _megdnn_tensor_in src, _megdnn_tensor_out u, _megdnn_tensor_out s, - _megdnn_tensor_out vt, _megdnn_workspace workspace, Param p); WorkspaceBundle get_workspace_bundle( size_t m, size_t n, size_t dtype_size, void* raw_ptr = nullptr); }; diff --git a/dnn/src/naive/tile/tile.cpp b/dnn/src/naive/tile/opr_impl.cpp similarity index 87% rename from dnn/src/naive/tile/tile.cpp rename to dnn/src/naive/tile/opr_impl.cpp index 5c0c9671..b91817d8 100644 --- a/dnn/src/naive/tile/tile.cpp +++ b/dnn/src/naive/tile/opr_impl.cpp @@ -1,5 +1,5 @@ /** - * \file dnn/src/naive/tile/tile.cpp + * \file 
dnn/src/naive/tile/opr_impl.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -18,7 +18,7 @@ namespace megdnn { namespace naive { template -void TileForwardImpl::exec_internal( +void exec_tile_forward( _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace /* workspace */) { auto ndim = src.layout.ndim; @@ -38,11 +38,11 @@ void TileForwardImpl::exec_internal( void TileForwardImpl::exec( _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) { check_exec(src.layout, dst.layout, workspace.size); -#define cb(DType) \ - if (src.layout.dtype == DType()) { \ - using ctype = typename DTypeTrait::ctype; \ - MEGDNN_DISPATCH_CPU_KERN_OPR(exec_internal(src, dst, workspace)); \ - return; \ +#define cb(DType) \ + if (src.layout.dtype == DType()) { \ + using ctype = typename DTypeTrait::ctype; \ + MEGDNN_DISPATCH_CPU_KERN_OPR(exec_tile_forward(src, dst, workspace)); \ + return; \ } MEGDNN_FOREACH_COMPUTING_DTYPE(cb) #undef cb @@ -50,7 +50,7 @@ void TileForwardImpl::exec( } template -void TileBackwardImpl::exec_internal( +void exec_tile_backward( _megdnn_tensor_in diff, _megdnn_tensor_out grad, _megdnn_workspace /* workspace */) { auto ndim = diff.layout.ndim; @@ -72,11 +72,12 @@ void TileBackwardImpl::exec_internal( void TileBackwardImpl::exec( _megdnn_tensor_in diff, _megdnn_tensor_out grad, _megdnn_workspace workspace) { check_exec(diff.layout, grad.layout, workspace.size); -#define cb(DType) \ - if (diff.layout.dtype == DType()) { \ - using ctype = typename DTypeTrait::ctype; \ - MEGDNN_DISPATCH_CPU_KERN_OPR(exec_internal(diff, grad, workspace)); \ - return; \ +#define cb(DType) \ + if (diff.layout.dtype == DType()) { \ + using ctype = typename DTypeTrait::ctype; \ + MEGDNN_DISPATCH_CPU_KERN_OPR( \ + exec_tile_backward(diff, grad, workspace)); \ + return; \ } MEGDNN_FOREACH_COMPUTING_DTYPE(cb) #undef cb diff --git a/dnn/src/naive/tile/opr_impl.h b/dnn/src/naive/tile/opr_impl.h index b7b6bac7..cbfd1b54 100644 --- a/dnn/src/naive/tile/opr_impl.h +++ b/dnn/src/naive/tile/opr_impl.h @@ -23,11 +23,6 @@ public: size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&) override { return 0; } - -private: - template - void exec_internal( - _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace); }; class TileBackwardImpl : public TileBackward { @@ -39,12 +34,6 @@ public: size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&) override { return 0; } - -private: - template - void exec_internal( - _megdnn_tensor_in diff, _megdnn_tensor_out grad, - _megdnn_workspace workspace); }; } // namespace naive diff --git a/dnn/src/naive/topk/opr_impl.cpp b/dnn/src/naive/topk/opr_impl.cpp index c1faf974..66303df3 100644 --- a/dnn/src/naive/topk/opr_impl.cpp +++ b/dnn/src/naive/topk/opr_impl.cpp @@ -123,5 +123,3 @@ size_t TopKImpl::get_workspace_in_bytes( MEGDNN_MARK_USED_VAR(indices); return std::max(sizeof(uint32_t), data.dtype.size()) * 2 * data[1]; } - -// vim: syntax=cpp.doxygen diff --git a/dnn/src/naive/transpose/opr_impl.cpp b/dnn/src/naive/transpose/opr_impl.cpp index 7892975d..0b989183 100644 --- a/dnn/src/naive/transpose/opr_impl.cpp +++ b/dnn/src/naive/transpose/opr_impl.cpp @@ -16,6 +16,15 @@ namespace megdnn { namespace naive { +template +void exec_internal(_megdnn_tensor_in src, _megdnn_tensor_out dst) { + auto m = dst.layout.shape[0], n = dst.layout.shape[1]; + rep(i, m) rep(j, n) { + dst.ptr()[i * dst.layout.stride[0] + j] = + 
src.ptr()[j * src.layout.stride[0] + i]; + } +} + void TransposeForwardImpl::exec( _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) { check_exec(src.layout, dst.layout, workspace.size); @@ -31,16 +40,6 @@ void TransposeForwardImpl::exec( megdnn_assert_internal(0); } -template -void TransposeForwardImpl::exec_internal( - _megdnn_tensor_in src, _megdnn_tensor_out dst) { - auto m = dst.layout.shape[0], n = dst.layout.shape[1]; - rep(i, m) rep(j, n) { - dst.ptr()[i * dst.layout.stride[0] + j] = - src.ptr()[j * src.layout.stride[0] + i]; - } -} - } // namespace naive } // namespace megdnn diff --git a/dnn/src/naive/transpose/opr_impl.h b/dnn/src/naive/transpose/opr_impl.h index 106de68d..a0dd0a7f 100644 --- a/dnn/src/naive/transpose/opr_impl.h +++ b/dnn/src/naive/transpose/opr_impl.h @@ -23,10 +23,6 @@ public: size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&) override { return 0; } - -private: - template - static void exec_internal(_megdnn_tensor_in src, _megdnn_tensor_out dst); }; } // namespace naive diff --git a/dnn/src/naive/warp_affine/opr_impl.h b/dnn/src/naive/warp_affine/opr_impl.h index 18011d88..025042e6 100644 --- a/dnn/src/naive/warp_affine/opr_impl.h +++ b/dnn/src/naive/warp_affine/opr_impl.h @@ -22,8 +22,7 @@ public: struct KernParam { Format format; size_t n_src, n_mat, c, ih, iw, oh, ow; - ctype *sptr, *dptr; - mtype* mptr; + RefPtr src_ptr, dst_ptr, mat_ptr; Workspace workspace; static KernParam from_tensors( @@ -58,13 +57,13 @@ public: src.layout.dtype.enumv() == DTypeEnum::Uint8 || src.layout.dtype.enumv() == DTypeEnum::QuantizedS8 || src.layout.dtype.enumv() == DTypeEnum::Quantized8Asymm) { - ret.sptr = src.compatible_ptr(); - ret.mptr = mat.ptr(); - ret.dptr = dst.compatible_ptr(); + ret.src_ptr = src.get_ref_ptr(); + ret.mat_ptr = mat.get_ref_ptr(); + ret.dst_ptr = dst.get_ref_ptr(); } else { - ret.sptr = nullptr; - ret.mptr = nullptr; - ret.dptr = nullptr; + ret.src_ptr = nullptr; + ret.mat_ptr = nullptr; + ret.dst_ptr = nullptr; } ret.workspace = workspace; return ret; @@ -97,9 +96,9 @@ private: #define UNPACK_WARP_AFFINE_FWD_KERN_PARAM(p) \ auto N_SRC = p.n_src, N_MAT = p.n_mat, C = p.c, IH = p.ih, IW = p.iw, OH = p.oh, \ OW = p.ow; \ - ctype* __restrict sptr = p.sptr; \ - mtype* __restrict mptr = p.mptr; \ - ctype* __restrict dptr = p.dptr; + ctype* __restrict sptr = static_cast(p.src_ptr.get_ptr()); \ + mtype* __restrict mptr = static_cast(p.mat_ptr.get_ptr()); \ + ctype* __restrict dptr = static_cast(p.dst_ptr.get_ptr()); } // namespace naive } // namespace megdnn diff --git a/dnn/src/naive/warp_affine/warp_affine_cv.cpp b/dnn/src/naive/warp_affine/warp_affine_cv.cpp index 675ac055..711c1fbb 100644 --- a/dnn/src/naive/warp_affine/warp_affine_cv.cpp +++ b/dnn/src/naive/warp_affine/warp_affine_cv.cpp @@ -167,16 +167,16 @@ void megdnn::naive::warp_affine_cv_exec( megdnn_assert( ch == 1 || ch == 3 || ch == 2, "unsupported src channel: %zu, avaiable channel size: 1/2/3", ch); - const float* trans_ptr = trans.ptr(); + if (dst.layout.dtype.enumv() == DTypeEnum::Float32) { #define cb(_imode, _bmode, _ch) \ - auto task = [src, trans_ptr, dst, border_value, parallelism_batch]( \ + auto task = [src, trans, dst, border_value, parallelism_batch]( \ size_t index, size_t) { \ size_t batch_id = index / parallelism_batch; \ size_t task_id = index % parallelism_batch; \ Mat src_mat = TensorND2Mat(src, batch_id); \ Mat dst_mat = TensorND2Mat(dst, batch_id); \ - const float* task_trans_ptr = trans_ptr + batch_id * 2 * 3; \ + auto 
task_trans_ptr = trans.ptr() + batch_id * 2 * 3; \ warp_affine_cv< \ float MEGDNN_COMMA _imode MEGDNN_COMMA _bmode MEGDNN_COMMA _ch>( \ src_mat MEGDNN_COMMA const_cast&>(dst_mat) \ @@ -189,13 +189,13 @@ void megdnn::naive::warp_affine_cv_exec( } else if (dst.layout.dtype.enumv() == DTypeEnum::Uint8) { #undef cb #define cb(_imode, _bmode, _ch) \ - auto task = [src, trans_ptr, dst, border_value, parallelism_batch]( \ + auto task = [src, trans, dst, border_value, parallelism_batch]( \ size_t index, size_t) { \ size_t batch_id = index / parallelism_batch; \ size_t task_id = index % parallelism_batch; \ Mat src_mat = TensorND2Mat(src, batch_id); \ Mat dst_mat = TensorND2Mat(dst, batch_id); \ - const float* task_trans_ptr = trans_ptr + batch_id * 2 * 3; \ + auto task_trans_ptr = trans.ptr() + batch_id * 2 * 3; \ warp_affine_cv< \ uchar MEGDNN_COMMA _imode MEGDNN_COMMA _bmode MEGDNN_COMMA _ch>( \ src_mat MEGDNN_COMMA const_cast&>(dst_mat) \ diff --git a/dnn/src/naive/warp_perspective/opr_impl.cpp b/dnn/src/naive/warp_perspective/opr_impl.cpp index 9286d6aa..9fbc035d 100644 --- a/dnn/src/naive/warp_perspective/opr_impl.cpp +++ b/dnn/src/naive/warp_perspective/opr_impl.cpp @@ -522,40 +522,28 @@ void WarpPerspectiveForwardImpl::exec( src.layout, mat.layout, mat_idx.layout, dst.layout, workspace.size); size_t batch = dst.layout[0]; -#define cb(dt, ct, mct) \ - case DTypeTrait
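[editor's note] In the warp_affine_cv / warp_perspective_cv hunks, the per-batch task lambdas now capture the trans and mat_idx tensors themselves and derive the per-batch matrix pointer inside the task, instead of capturing pointers hoisted outside. A standalone sketch of that per-batch task pattern with a toy runner (names and the Tensor type are stand-ins, not megdnn API):

#include <cstddef>
#include <cstdio>
#include <functional>
#include <vector>

// Stand-in tensor: batched 2x3 affine matrices stored contiguously.
struct Tensor {
    const float* data;
    size_t batch;
    const float* ptr() const { return data; }
};

// Toy runner standing in for the multi-thread dispatch macro:
// collect tasks first, run them per batch later (possibly on worker threads).
void run_tasks(const std::vector<std::function<void(size_t)>>& tasks, size_t n) {
    for (size_t i = 0; i < n; ++i)
        for (auto& t : tasks) t(i);
}

int main() {
    const float mats[2 * 2 * 3] = {
            1, 0, 10, 0, 1, 20,  // batch 0: translate by (10, 20)
            2, 0, 0,  0, 2, 0,   // batch 1: scale by 2
    };
    Tensor trans{mats, 2};

    std::vector<std::function<void(size_t)>> tasks;
    // Capture the tensor handle; compute the per-batch pointer inside the task.
    tasks.push_back([trans](size_t batch_id) {
        const float* m = trans.ptr() + batch_id * 2 * 3;
        float x = 1.f, y = 1.f;  // transform one sample point per batch
        float ox = m[0] * x + m[1] * y + m[2];
        float oy = m[3] * x + m[4] * y + m[5];
        std::printf("batch %zu: (1, 1) -> (%g, %g)\n", batch_id, ox, oy);
    });
    run_tasks(tasks, trans.batch);
}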
::enumv: { \ - auto kparam = KernParam::from_tensors( \ - param().format, param().bmode, param().border_val, src, mat, mat_idx, \ - dst, workspace); \ - auto run = [kparam, this](size_t index, size_t) { \ - kern_naive(kparam, index); \ - }; \ - MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN_OPR(run, kparam.oh* batch); \ - return; \ - } - -#define KERN_CD4(ct, mct) \ +#define KERN_NAIVE(ct, mct) \ auto kparam = KernParam::from_tensors( \ param().format, param().bmode, param().border_val, src, mat, mat_idx, dst, \ workspace); \ - auto run = [kparam, this](size_t index, size_t) { \ - kern_naive_nhwcd4(kparam, index); \ - }; \ - MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN_OPR(run, batch* oh); + auto run = [kparam, this](size_t index, size_t) { kern_naive(kparam, index); }; \ + MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN_OPR(run, kparam.oh* batch); -#define KERN(ct, mct) \ +#define KERN_INT4(ct, mct) \ auto kparam = KernParam::from_tensors( \ param().format, param().bmode, param().border_val, src, mat, mat_idx, dst, \ workspace); \ - auto run = [kparam, this](size_t index, size_t) { kern_naive(kparam, index); }; \ + auto run = [kparam, this](size_t index, size_t) { \ + kern_naive_int4(kparam, index); \ + }; \ MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN_OPR(run, kparam.oh* batch); -#define KERN_INT4(ct, mct) \ +#define KERN_CD4(ct, mct) \ auto kparam = KernParam::from_tensors( \ param().format, param().bmode, param().border_val, src, mat, mat_idx, dst, \ workspace); \ auto run = [kparam, this](size_t index, size_t) { \ - kern_naive_int4(kparam, index); \ + kern_naive_nhwcd4(kparam, index); \ }; \ MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN_OPR(run, kparam.oh* batch); @@ -577,7 +565,6 @@ void WarpPerspectiveForwardImpl::exec( } if (param().format == Format::NHWCD4) { - size_t oh = dst.layout[1]; DISPATCH_ST(dtype::Float32, float, float, KERN_CD4); DISPATCH_ST(dtype::Quantized8Asymm, uint8_t, float, KERN_CD4); DISPATCH_ST(dtype::QuantizedS8, int8_t, float, KERN_CD4); @@ -590,22 +577,20 @@ void WarpPerspectiveForwardImpl::exec( src.layout.dtype.name()) .c_str()); } +#undef KERN_CD4 if (src.layout.dtype.enumv() == DTypeTrait::enumv) { DISPATCH_ST(dtype::QuantizedS4, dt_qint4, float, KERN_INT4); - megdnn_throw(ssprintf( - "Unsupported input DType in " - "WarpPerspective: %s", - src.layout.dtype.name()) - .c_str()); + megdnn_assert( + 0, "Unsupported input DType in WarpPerspective: %s", + src.layout.dtype.name()); } else if (src.layout.dtype.enumv() == DTypeTrait::enumv) { DISPATCH_ST(dtype::Quantized4Asymm, dt_quint4, float, KERN_INT4); - megdnn_throw(ssprintf( - "Unsupported input DType in " - "WarpPerspective: %s", - src.layout.dtype.name()) - .c_str()); + megdnn_assert( + 0, "Unsupported input DType in WarpPerspective: %s", + src.layout.dtype.name()); } +#undef KERN_INT4 bool is_fusion_dtype = src.layout.dtype.enumv() != dst.layout.dtype.enumv(); bool is_u8_or_qu8_in = @@ -616,48 +601,40 @@ void WarpPerspectiveForwardImpl::exec( ((param().format == Format::NCHW_NCHW4_IC_SMALL) || (param().format == Format::NHWC_NCHW4_IC_SMALL) || (param().format == Format::NHWC_NCHW) || (param().format == Format::NCHW))) { - if (src.layout.dtype.enumv() == DTypeTrait::enumv || - src.layout.dtype.enumv() == DTypeTrait::enumv) { - float scale = 1.f; - - if (src.layout.dtype.enumv() == DTypeTrait::enumv) { - scale = src.layout.dtype.param().scale; - } + megdnn_assert( + src.layout.dtype.enumv() == DTypeTrait::enumv || + src.layout.dtype.enumv() == DTypeTrait::enumv, + "Unsupported input DType in WarpPerspective: %s", + src.layout.dtype.name()); - 
auto kparam = KernParam::from_tensors( - param().format, param().bmode, param().border_val, src, mat, - mat_idx, dst, workspace); - - if (dst.layout.dtype.enumv() == DTypeTrait::enumv) { - auto run = [kparam, this](size_t index, size_t) { - kern_naive_dimshuffle_typecvt(kparam, index); - }; - MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN_OPR(run, kparam.oh * batch); - return; - } else if ( - (dst.layout.dtype.enumv() == - DTypeTrait::enumv) && - (dst.layout.dtype.param().scale == scale)) { - auto run = [kparam, this](size_t index, size_t) { - kern_naive_dimshuffle_typecvt( - kparam, index); - }; - MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN_OPR(run, kparam.oh * batch); - return; - } else { - megdnn_throw(ssprintf( - "Unsupported DType in " - "WarpPerspective Dimshuffle Typecvt: %s", - src.layout.dtype.name()) - .c_str()); - } + float scale = 1.f; + if (src.layout.dtype.enumv() == DTypeTrait::enumv) { + scale = src.layout.dtype.param().scale; } - megdnn_throw(ssprintf( - "Unsupported input DType in " - "WarpPerspective: %s", - src.layout.dtype.name()) - .c_str()); + auto kparam = KernParam::from_tensors( + param().format, param().bmode, param().border_val, src, mat, mat_idx, + dst, workspace); + + if (dst.layout.dtype.enumv() == DTypeTrait::enumv) { + auto run = [kparam, this](size_t index, size_t) { + kern_naive_dimshuffle_typecvt(kparam, index); + }; + MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN_OPR(run, kparam.oh * batch); + return; + } else if ( + dst.layout.dtype.enumv() == DTypeTrait::enumv && + dst.layout.dtype.param().scale == scale) { + auto run = [kparam, this](size_t index, size_t) { + kern_naive_dimshuffle_typecvt(kparam, index); + }; + MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN_OPR(run, kparam.oh * batch); + return; + } else { + megdnn_assert( + 0, "Unsupported DType in WarpPerspective Dimshuffle Typecvt: %s", + src.layout.dtype.name()); + } } if (warp::is_cv_available( @@ -680,24 +657,23 @@ void WarpPerspectiveForwardImpl::exec( * if input type is float16. 
*/ - DISPATCH_ST(dtype::Float32, float, float, KERN); - DISPATCH_ST(dtype::Int8, int8_t, float, KERN); - DISPATCH_ST(dtype::QuantizedS8, int8_t, float, KERN); - DISPATCH_ST(dtype::Uint8, uint8_t, float, KERN); - DISPATCH_ST(dtype::Quantized8Asymm, uint8_t, float, KERN); + DISPATCH_ST(dtype::Float32, float, float, KERN_NAIVE); + DISPATCH_ST(dtype::Int8, int8_t, float, KERN_NAIVE); + DISPATCH_ST(dtype::QuantizedS8, int8_t, float, KERN_NAIVE); + DISPATCH_ST(dtype::Uint8, uint8_t, float, KERN_NAIVE); + DISPATCH_ST(dtype::Quantized8Asymm, uint8_t, float, KERN_NAIVE); - DNN_INC_FLOAT16(DISPATCH_ST_MT(dtype::Float16, dt_float16, KERN)); - DNN_INC_FLOAT16(DISPATCH_ST_MT(dtype::BFloat16, dt_bfloat16, KERN)); + DNN_INC_FLOAT16(DISPATCH_ST_MT(dtype::Float16, dt_float16, KERN_NAIVE)); + DNN_INC_FLOAT16(DISPATCH_ST_MT(dtype::BFloat16, dt_bfloat16, KERN_NAIVE)); megdnn_throw(ssprintf( "Unsupported input DType in " "WarpPerspective: %s", src.layout.dtype.name()) .c_str()); } +#undef KERN_NAIVE #undef DISPATCH_ST_MT #undef DISPATCH_ST -#undef KERN -#undef KERN_CD4 } template @@ -706,13 +682,10 @@ void WarpPerspectiveBackwardDataImpl::kern_naive( const int N = kern_param.n_mat, C = kern_param.c, IH = kern_param.ih, IW = kern_param.iw; const int OH = kern_param.oh, OW = kern_param.ow; - const ctype* hptr_ = kern_param.hptr; - const mtype* mptr_ = kern_param.mptr; - ctype* sptr_ = kern_param.sptr; - int* midx_ptr = kern_param.midx_ptr; - auto hptr = hptr_; - auto mptr = mptr_; - auto sptr = sptr_; + auto hptr = static_cast(kern_param.diff_ptr.get_ptr()); + auto mptr = static_cast(kern_param.mat_ptr.get_ptr()); + auto sptr = static_cast(kern_param.grad_ptr.get_ptr()); + auto midx_ptr = static_cast(kern_param.midx_ptr.get_ptr()); if (midx_ptr) { std::memset(sptr, 0, sizeof(ctype) * kern_param.n_src * C * IH * IW); } else { @@ -720,9 +693,10 @@ void WarpPerspectiveBackwardDataImpl::kern_naive( } rep(n, N) { if (midx_ptr) { - sptr = sptr_ + midx_ptr[n] * C * IH * IW; + sptr = static_cast(kern_param.grad_ptr.get_ptr()) + + midx_ptr[n] * C * IH * IW; } else { - sptr = sptr_ + n * C * IH * IW; + sptr = static_cast(kern_param.grad_ptr.get_ptr()) + n * C * IH * IW; } rep(oh, OH) rep(ow, OW) { float numeratorw = mptr[0] * ow + mptr[1] * oh + mptr[2]; @@ -799,18 +773,19 @@ void WarpPerspectiveBackwardMatImpl::kern_naive( IW = kern_param.iw; const int OH = kern_param.oh, OW = kern_param.ow; - auto hptr = kern_param.hptr; - auto sptr = kern_param.sptr; - auto mptr = kern_param.mptr; - auto res = kern_param.res; - auto midx_ptr = kern_param.midx_ptr; + auto hptr = static_cast(kern_param.diff_ptr.get_ptr()); + auto sptr = static_cast(kern_param.src_ptr.get_ptr()); + auto res = static_cast(kern_param.grad_ptr.get_ptr()); + auto mptr = static_cast(kern_param.mat_ptr.get_ptr()); + auto midx_ptr = static_cast(kern_param.midx_ptr.get_ptr()); auto border_val = kern_param.border_val; std::memset(res, 0, sizeof(float) * N * 3 * 3); rep(n, N) { if (midx_ptr) { - sptr = kern_param.sptr + midx_ptr[n] * C * IH * IW; + sptr = static_cast(kern_param.src_ptr.get_ptr()) + + midx_ptr[n] * C * IH * IW; } else { - sptr = kern_param.sptr + n * C * IH * IW; + sptr = static_cast(kern_param.src_ptr.get_ptr()) + n * C * IH * IW; } rep(oh, OH) rep(ow, OW) { float numeratorw = mptr[0] * ow + mptr[1] * oh + mptr[2]; diff --git a/dnn/src/naive/warp_perspective/opr_impl.h b/dnn/src/naive/warp_perspective/opr_impl.h index c2e68f6e..44fcb291 100644 --- a/dnn/src/naive/warp_perspective/opr_impl.h +++ b/dnn/src/naive/warp_perspective/opr_impl.h @@ -25,10 
+25,9 @@ protected: BorderMode bmode; float border_val; size_t n_src, n_mat, c, ih, iw, oh, ow; - ctype *sptr, *dptr; DType src_dtype, dst_dtype; - mtype* mptr; - int* midx_ptr; //!< can be null + RefPtr src_ptr, mat_ptr, dst_ptr; + RefPtr midx_ptr; //!< can be null Workspace workspace; static KernParam from_tensors( @@ -42,15 +41,17 @@ protected: ret.n_src = src.layout.shape[0]; ret.src_dtype = src.layout.dtype; ret.dst_dtype = dst.layout.dtype; - if (mat_idx.raw_ptr) { + + if (mat_idx.raw_ptr()) { megdnn_assert(mat_idx.layout.ndim == 1); ret.n_mat = mat_idx.layout.shape[0]; - ret.midx_ptr = mat_idx.ptr(); + ret.midx_ptr = mat_idx.get_ref_ptr(); } else { megdnn_assert(mat_idx.layout.ndim == 0); ret.n_mat = ret.n_src; ret.midx_ptr = nullptr; } + if (format == Format::NCHW || format == Format::NCHW_NCHW4_IC_SMALL) { ret.c = src.layout.shape[1]; ret.ih = src.layout.shape[2]; @@ -91,6 +92,7 @@ protected: ret.oh = dst.layout.shape[1]; ret.ow = dst.layout.shape[3]; } + if ((src.layout.dtype.enumv() == DTypeEnum::Float32 || DNN_FLOAT16_SELECT( (src.layout.dtype.enumv() == DTypeEnum::Float16 || @@ -101,27 +103,27 @@ protected: src.layout.dtype.enumv() == DTypeEnum::QuantizedS8 || src.layout.dtype.enumv() == DTypeEnum::Quantized8Asymm) && (src.layout.dtype == dst.layout.dtype)) { - ret.sptr = src.compatible_ptr(); - ret.mptr = mat.ptr(); - ret.dptr = dst.compatible_ptr(); + ret.src_ptr = src.get_ref_ptr(); + ret.mat_ptr = mat.get_ref_ptr(); + ret.dst_ptr = dst.get_ref_ptr(); } else if ( src.layout.dtype.enumv() == DTypeEnum::QuantizedS8 || src.layout.dtype.enumv() == DTypeEnum::QuantizedS4 || src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) { - ret.sptr = src.compatible_ptr(); - ret.mptr = mat.ptr(); - ret.dptr = dst.compatible_ptr(); + ret.src_ptr = src.get_ref_ptr(); + ret.mat_ptr = mat.get_ref_ptr(); + ret.dst_ptr = dst.get_ref_ptr(); } else if ( (src.layout.dtype.enumv() == DTypeEnum::Uint8 || src.layout.dtype.enumv() == DTypeEnum::Quantized8Asymm) && src.layout.dtype.enumv() != dst.layout.dtype.enumv()) { - ret.sptr = src.compatible_ptr(); - ret.mptr = mat.ptr(); - ret.dptr = reinterpret_cast(dst.raw_ptr); + ret.src_ptr = src.get_ref_ptr(); + ret.mat_ptr = mat.get_ref_ptr(); + ret.dst_ptr = dst.get_ref_ptr(); } else { - ret.sptr = nullptr; - ret.mptr = nullptr; - ret.dptr = nullptr; + ret.src_ptr = nullptr; + ret.mat_ptr = nullptr; + ret.dst_ptr = nullptr; } ret.workspace = workspace; return ret; @@ -159,9 +161,9 @@ protected: template struct KernParam { size_t n_src, n_mat, c, ih, iw, oh, ow; - ctype *sptr, *hptr; - mtype* mptr; - int* midx_ptr; //!< can be null + RefPtr grad_ptr, diff_ptr; + RefPtr mat_ptr; + RefPtr midx_ptr; static KernParam from_tensors( _megdnn_tensor_in mat, _megdnn_tensor_in mat_idx, @@ -170,13 +172,13 @@ protected: ret.n_src = grad.layout.shape[0], ret.c = grad.layout.shape[1]; ret.ih = grad.layout.shape[2], ret.iw = grad.layout.shape[3]; ret.oh = diff.layout.shape[2], ret.ow = diff.layout.shape[3]; - ret.hptr = diff.ptr(); - ret.mptr = mat.ptr(); - ret.sptr = grad.ptr(); - if (mat_idx.raw_ptr) { + ret.diff_ptr = diff.get_ref_ptr(); + ret.mat_ptr = mat.get_ref_ptr(); + ret.grad_ptr = grad.get_ref_ptr(); + if (mat_idx.raw_ptr()) { megdnn_assert(mat_idx.layout.ndim == 1); ret.n_mat = mat_idx.layout.shape[0]; - ret.midx_ptr = mat_idx.ptr(); + ret.midx_ptr = mat_idx.get_ref_ptr(); } else { megdnn_assert(mat_idx.layout.ndim == 0); ret.n_mat = ret.n_src; @@ -207,9 +209,9 @@ protected: template struct KernParam { size_t n_src, n_mat, c, ih, iw, oh, ow; - ctype *sptr, 
*hptr; - mtype *mptr, *res; - int* midx_ptr; //!< can be null + RefPtr src_ptr, grad_ptr, diff_ptr; + RefPtr mat_ptr; + RefPtr midx_ptr; float border_val; static KernParam from_tensors( @@ -221,14 +223,14 @@ protected: ret.n_src = src.layout.shape[0], ret.c = src.layout.shape[1]; ret.ih = src.layout.shape[2], ret.iw = src.layout.shape[3]; ret.oh = diff.layout.shape[2], ret.ow = diff.layout.shape[3]; - ret.hptr = diff.ptr(); - ret.mptr = mat.ptr(); - ret.sptr = src.ptr(); - ret.res = grad.ptr(); - if (mat_idx.raw_ptr) { + ret.src_ptr = src.get_ref_ptr(); + ret.diff_ptr = diff.get_ref_ptr(); + ret.mat_ptr = mat.get_ref_ptr(); + ret.grad_ptr = grad.get_ref_ptr(); + if (mat_idx.raw_ptr()) { megdnn_assert(mat_idx.layout.ndim == 1); ret.n_mat = mat_idx.layout.shape[0]; - ret.midx_ptr = mat_idx.ptr(); + ret.midx_ptr = mat_idx.get_ref_ptr(); } else { megdnn_assert(mat_idx.layout.ndim == 0); ret.n_mat = ret.n_src; @@ -258,10 +260,10 @@ private: #define UNPACK_WARP_PERSPECTIVE_FWD_KERN_PARAM(p) \ auto N_SRC = p.n_src, N_MAT = p.n_mat, C = p.c, IH = p.ih, IW = p.iw, OH = p.oh, \ OW = p.ow; \ - ctype* __restrict sptr = p.sptr; \ - mtype* __restrict mptr = p.mptr; \ - ctype* __restrict dptr = p.dptr; \ - int* __restrict midx_ptr = p.midx_ptr; \ + auto sptr = static_cast(p.src_ptr.get_ptr()); \ + auto mptr = static_cast(p.mat_ptr.get_ptr()); \ + auto dptr = static_cast(p.dst_ptr.get_ptr()); \ + auto midx_ptr = static_cast(p.midx_ptr.get_ptr()); \ auto bmode = p.bmode; \ float border_val = p.border_val diff --git a/dnn/src/naive/warp_perspective/warp_perspective_cv.cpp b/dnn/src/naive/warp_perspective/warp_perspective_cv.cpp index 1ff93793..bfb38cdc 100644 --- a/dnn/src/naive/warp_perspective/warp_perspective_cv.cpp +++ b/dnn/src/naive/warp_perspective/warp_perspective_cv.cpp @@ -169,20 +169,18 @@ void megdnn::naive::warp_perspective_cv_exec( megdnn_assert( ch == 1 || ch == 3 || ch == 2, "unsupported src channel: %zu, avaiable channel size: 1/2/3", ch); - const float* trans_ptr = trans.ptr(); - const int* midx_ptr = nullptr; - if (mat_idx.raw_ptr) { + if (mat_idx.raw_ptr()) { megdnn_assert(mat_idx.layout.ndim == 1); - midx_ptr = mat_idx.ptr(); } if (dst.layout.dtype.enumv() == DTypeEnum::Float32) { #define cb(_imode, _bmode, _ch) \ - auto task = [src, trans_ptr, midx_ptr, dst, border_value, parallelism_batch]( \ + auto task = [src, trans, mat_idx, dst, border_value, parallelism_batch]( \ size_t index, size_t) { \ size_t batch_id = index / parallelism_batch; \ size_t task_id = index % parallelism_batch; \ size_t src_id = batch_id; \ - if (midx_ptr) { \ + if (mat_idx.raw_ptr()) { \ + auto midx_ptr = mat_idx.ptr(); \ src_id = midx_ptr[batch_id]; \ megdnn_assert( \ src_id < src.layout.shape[0], \ @@ -191,7 +189,7 @@ void megdnn::naive::warp_perspective_cv_exec( } \ Mat src_mat = TensorND2Mat(src, src_id); \ Mat dst_mat = TensorND2Mat(dst, batch_id); \ - const float* task_trans_ptr = trans_ptr + batch_id * 3 * 3; \ + const float* task_trans_ptr = trans.ptr() + batch_id * 3 * 3; \ warp_perspective_cv< \ float MEGDNN_COMMA _imode MEGDNN_COMMA _bmode MEGDNN_COMMA _ch>( \ src_mat MEGDNN_COMMA const_cast&>(dst_mat) \ @@ -204,12 +202,13 @@ void megdnn::naive::warp_perspective_cv_exec( #undef cb } else if (dst.layout.dtype.enumv() == DTypeEnum::Uint8) { #define cb(_imode, _bmode, _ch) \ - auto task = [src, trans_ptr, midx_ptr, dst, border_value, parallelism_batch]( \ + auto task = [src, trans, mat_idx, dst, border_value, parallelism_batch]( \ size_t index, size_t) { \ size_t batch_id = index / parallelism_batch; \ 
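
A pattern worth calling out in these warp_perspective_cv changes (both the naive and aarch64 versions): the per-batch task lambda now captures the tensors `trans` and `mat_idx` by value and derives `trans_ptr` / `midx_ptr` inside the task body, instead of capturing raw pointers computed before dispatch. Presumably this makes the task read whatever storage the tensors reference at the moment it actually runs on a worker thread, rather than freezing addresses at dispatch time. Below is a minimal sketch of that capture style with hypothetical types (`MatHandle`, `make_warp_task` are illustrative stand-ins, not the real megdnn API):

// Hedged illustration only: a task captures a lightweight tensor handle by
// value and resolves raw pointers when it executes, not when it is created.
#include <cstddef>
#include <functional>
#include <memory>

struct MatHandle {
    // shared pointer *cell*: every copy of the handle observes later rebinds
    std::shared_ptr<const float*> cell;
    const float* ptr() const { return *cell; }
};

using Task = std::function<void(std::size_t index, std::size_t thread_id)>;

Task make_warp_task(MatHandle trans, std::size_t parallelism_batch) {
    return [trans, parallelism_batch](std::size_t index, std::size_t) {
        const float* trans_ptr = trans.ptr();            // resolved at run time
        std::size_t batch_id = index / parallelism_batch;
        const float* m = trans_ptr + batch_id * 3 * 3;   // one 3x3 matrix per batch
        (void)m;  // ... warp one slice of the output using m ...
    };
}
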
size_t task_id = index % parallelism_batch; \ size_t src_id = batch_id; \ - if (midx_ptr) { \ + if (mat_idx.raw_ptr()) { \ + auto midx_ptr = mat_idx.ptr(); \ src_id = midx_ptr[batch_id]; \ megdnn_assert( \ src_id < src.layout.shape[0], \ @@ -218,7 +217,7 @@ void megdnn::naive::warp_perspective_cv_exec( } \ Mat src_mat = TensorND2Mat(src, src_id); \ Mat dst_mat = TensorND2Mat(dst, batch_id); \ - const float* task_trans_ptr = trans_ptr + batch_id * 3 * 3; \ + const float* task_trans_ptr = trans.ptr() + batch_id * 3 * 3; \ warp_perspective_cv< \ uchar MEGDNN_COMMA _imode MEGDNN_COMMA _bmode MEGDNN_COMMA _ch>( \ src_mat MEGDNN_COMMA const_cast&>(dst_mat) \ diff --git a/dnn/src/rocm/argmxx/opr_impl.cpp b/dnn/src/rocm/argmxx/opr_impl.cpp index 3c7a724b..ae547496 100644 --- a/dnn/src/rocm/argmxx/opr_impl.cpp +++ b/dnn/src/rocm/argmxx/opr_impl.cpp @@ -12,7 +12,7 @@ #include "hcc_detail/hcc_defs_prologue.h" #include "src/common/argmxx_helper.h" -#include "src/common/reduce_helper.h" +#include "src/common/reduce_helper_device.h" #include "src/rocm/reduce_helper.h.hip" #include "src/rocm/utils.h" diff --git a/dnn/src/rocm/batch_normalization/opr_impl.cpp b/dnn/src/rocm/batch_normalization/opr_impl.cpp index 908f5977..749898d1 100644 --- a/dnn/src/rocm/batch_normalization/opr_impl.cpp +++ b/dnn/src/rocm/batch_normalization/opr_impl.cpp @@ -60,21 +60,23 @@ void BNForwardImpl::exec( miopen_check(miopenBatchNormalizationForwardTraining( handle, m_tensor_desc.bn_mode, &alpha, &beta, m_tensor_desc.xy_desc.desc, // xDesc - src.raw_ptr, // x + src.raw_ptr(), // x m_tensor_desc.xy_desc.desc, // yDesc - dst.raw_ptr, // y + dst.raw_ptr(), // y m_tensor_desc.param_desc.desc, // bnScaleBiasMeanVarDesc - bn_scale.raw_ptr, bn_bias.raw_ptr, m_param.avg_factor, mean.raw_ptr, - variance.raw_ptr, m_param.epsilon, batch_mean.raw_ptr, - batch_inv_variance.raw_ptr)); + bn_scale.raw_ptr(), bn_bias.raw_ptr(), m_param.avg_factor, + mean.raw_ptr(), variance.raw_ptr(), m_param.epsilon, + batch_mean.raw_ptr(), batch_inv_variance.raw_ptr())); break; case param::BN::FwdMode::INFERENCE: miopen_check(miopenBatchNormalizationForwardInference( handle, m_tensor_desc.bn_mode, &alpha, &beta, - m_tensor_desc.xy_desc.desc, src.raw_ptr, m_tensor_desc.xy_desc.desc, - dst.raw_ptr, m_tensor_desc.param_desc.desc, bn_scale.raw_ptr, - bn_bias.raw_ptr, mean.raw_ptr, variance.raw_ptr, m_param.epsilon)); + m_tensor_desc.xy_desc.desc, src.raw_ptr(), + m_tensor_desc.xy_desc.desc, dst.raw_ptr(), + m_tensor_desc.param_desc.desc, bn_scale.raw_ptr(), + bn_bias.raw_ptr(), mean.raw_ptr(), variance.raw_ptr(), + m_param.epsilon)); break; default: megdnn_throw("Unknown forward mode type of batch normalization."); @@ -96,11 +98,11 @@ void BNBackwardImpl::exec( float alpha = 1.0, beta = 0.0; miopen_check(miopenBatchNormalizationBackward( handle, m_tensor_desc.bn_mode, &alpha, &beta, &alpha, &beta, - m_tensor_desc.xy_desc.desc, x.raw_ptr, m_tensor_desc.xy_desc.desc, - dy.raw_ptr, m_tensor_desc.xy_desc.desc, dx.raw_ptr, - m_tensor_desc.param_desc.desc, bn_scale.raw_ptr, d_bn_scale.raw_ptr, - d_bn_bias.raw_ptr, m_param.epsilon, saved_batch_mean.raw_ptr, - saved_batch_inv_variance.raw_ptr)); + m_tensor_desc.xy_desc.desc, x.raw_ptr(), m_tensor_desc.xy_desc.desc, + dy.raw_ptr(), m_tensor_desc.xy_desc.desc, dx.raw_ptr(), + m_tensor_desc.param_desc.desc, bn_scale.raw_ptr(), d_bn_scale.raw_ptr(), + d_bn_bias.raw_ptr(), m_param.epsilon, saved_batch_mean.raw_ptr(), + saved_batch_inv_variance.raw_ptr())); } } // namespace rocm diff --git 
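
The ROCm hunks above all apply one mechanical edit: every use of the old `raw_ptr` data member becomes a call to a `raw_ptr()` accessor. Taken together with the `get_ref_ptr()`, `reset_ptr()` and `incr_refp()` calls elsewhere in this patch, a plausible reading (an inference, not the real megdnn definition, which is not shown here) is that the tensor now keeps its address behind a shared `RefPtr` indirection and `raw_ptr()` dereferences it on demand, so rebinding the storage is visible to every copy of the tensor. A minimal sketch of that shape:

// Hypothetical RefPtr-style indirection, not the real megdnn RefPtr/TensorND:
// copies of the tensor share one pointer cell, so a later reset is observed
// by all of them through raw_ptr().
#include <memory>

class RefPtr {
    std::shared_ptr<void*> m_cell;
public:
    RefPtr() : m_cell(std::make_shared<void*>(nullptr)) {}
    explicit RefPtr(void* p) : m_cell(std::make_shared<void*>(p)) {}
    void* get_ptr() const { return *m_cell; }
    void reset(void* p) { *m_cell = p; }   // visible to every copy
};

struct TensorND {
    RefPtr ref;
    void* raw_ptr() const { return ref.get_ptr(); }  // accessor instead of a field
    void reset_ptr(void* p) { ref.reset(p); }
};

With such a layout, call sites only change from `t.raw_ptr` to `t.raw_ptr()`, which is exactly the edit repeated through the MIOpen and rocBLAS calls in this patch.
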
a/dnn/src/rocm/batched_matrix_mul/blas.cpp b/dnn/src/rocm/batched_matrix_mul/blas.cpp index 8e212b1c..35053d70 100644 --- a/dnn/src/rocm/batched_matrix_mul/blas.cpp +++ b/dnn/src/rocm/batched_matrix_mul/blas.cpp @@ -71,11 +71,12 @@ void BatchedMatrixMulForwardImpl::AlgoBlas::exec(const ExecArgs& args) const { : rocblas_operation_none, args.opr->param().transposeA ? rocblas_operation_transpose : rocblas_operation_none, - n, m, k, one, args.tensor_b.raw_ptr, rocblas_datatype_i8_r, - args.layout_b.stride[1], args.layout_b.stride[0], args.tensor_a.raw_ptr, - rocblas_datatype_i8_r, args.layout_a.stride[1], args.layout_a.stride[0], - zero, args.tensor_c.raw_ptr, rocblas_datatype_i32_r, - args.layout_c.stride[1], args.layout_c.stride[0], args.tensor_c.raw_ptr, + n, m, k, one, args.tensor_b.raw_ptr(), rocblas_datatype_i8_r, + args.layout_b.stride[1], args.layout_b.stride[0], + args.tensor_a.raw_ptr(), rocblas_datatype_i8_r, args.layout_a.stride[1], + args.layout_a.stride[0], zero, args.tensor_c.raw_ptr(), + rocblas_datatype_i32_r, args.layout_c.stride[1], + args.layout_c.stride[0], args.tensor_c.raw_ptr(), rocblas_datatype_i32_r, args.layout_c.stride[1], args.layout_c.stride[0], batch, rocblas_datatype_i32_r, rocblas_gemm_algo_standard, solution_index, flags, &ws_size, nullptr)); @@ -93,12 +94,12 @@ void BatchedMatrixMulForwardImpl::AlgoBlas::exec(const ExecArgs& args) const { args.opr->param().transposeA ? rocblas_operation_transpose : rocblas_operation_none, n, m, k, reinterpret_cast(one_half), - static_cast(args.tensor_b.raw_ptr), + static_cast(args.tensor_b.raw_ptr()), args.layout_b.stride[1], args.layout_b.stride[0], - static_cast(args.tensor_a.raw_ptr), + static_cast(args.tensor_a.raw_ptr()), args.layout_a.stride[1], args.layout_a.stride[0], reinterpret_cast(zero_half), - static_cast(args.tensor_c.raw_ptr), + static_cast(args.tensor_c.raw_ptr()), args.layout_c.stride[1], args.layout_c.stride[0], batch)); }; #endif diff --git a/dnn/src/rocm/checksum/opr_impl.cpp b/dnn/src/rocm/checksum/opr_impl.cpp index a0dfad1a..6686ed27 100644 --- a/dnn/src/rocm/checksum/opr_impl.cpp +++ b/dnn/src/rocm/checksum/opr_impl.cpp @@ -46,7 +46,7 @@ ChecksumForward::Result ChecksumForwardImpl::exec( check_exec(data.layout, workspace.size); auto stream = hip_stream(handle()); - auto ptr = static_cast(data.raw_ptr); + auto ptr = static_cast(data.raw_ptr()); size_t size_all = data.layout.shape[0], size_ints = size_all / sizeof(uint32_t); auto last_val_size = std::min(size_all, 4); hip_check(hipMemcpyAsync( @@ -55,7 +55,7 @@ ChecksumForward::Result ChecksumForwardImpl::exec( if (size_ints) { checksum::calc( static_cast(wbundle.get(1)), - static_cast(data.raw_ptr), + static_cast(data.raw_ptr()), static_cast(wbundle.get(0)), size_ints, stream); hip_check(hipMemcpyAsync( &result.checksum, wbundle.get(1), sizeof(result.checksum), diff --git a/dnn/src/rocm/convolution/backward_data/matmul.cpp b/dnn/src/rocm/convolution/backward_data/matmul.cpp index 8aef381e..20e32491 100644 --- a/dnn/src/rocm/convolution/backward_data/matmul.cpp +++ b/dnn/src/rocm/convolution/backward_data/matmul.cpp @@ -75,7 +75,7 @@ void ConvolutionBackwardDataImpl::AlgoMatmul::exec_internal(const ExecArgs& args TensorND A(args.filter_tensor->ptr(), Al), B(col, Bl), C(diff_t, Cl); if (fm.should_flip) { convolution::flip_filter( - args.as_fwd_args(), wbundle.get_workspace(2), A.raw_ptr); + args.as_fwd_args(), wbundle.get_workspace(2), A.get_ref_ptr()); } args.handle->matmul_aT_opr()->exec(A, C, B, Workspace()); } diff --git 
a/dnn/src/rocm/convolution/backward_data/miopen.cpp b/dnn/src/rocm/convolution/backward_data/miopen.cpp index 4e2a08cd..ce7ff0a4 100644 --- a/dnn/src/rocm/convolution/backward_data/miopen.cpp +++ b/dnn/src/rocm/convolution/backward_data/miopen.cpp @@ -76,9 +76,9 @@ miopenConvBwdDataAlgorithm_t ConvolutionBackwardDataImpl::AlgoMIOpen::find_best_ int ret_algo_count; miopenConvAlgoPerf_t algo_perf; miopen_check(miopenFindConvolutionBackwardDataAlgorithm( - args.handle->miopen_handle(), D.diff_desc.desc, args.diff_tensor->raw_ptr, - D.filter_desc.desc, args.filter_tensor->raw_ptr, D.conv_desc.desc, - D.grad_desc.desc, args.grad_tensor->raw_ptr, req_algo_count, + args.handle->miopen_handle(), D.diff_desc.desc, args.diff_tensor->raw_ptr(), + D.filter_desc.desc, args.filter_tensor->raw_ptr(), D.conv_desc.desc, + D.grad_desc.desc, args.grad_tensor->raw_ptr(), req_algo_count, &ret_algo_count, &algo_perf, args.workspace.raw_ptr, args.workspace.size, exhaustive_search)); sm_miopen_algo_cache.set(args, algo_perf.bwd_data_algo); @@ -94,9 +94,10 @@ void ConvolutionBackwardDataImpl::AlgoMIOpen::exec(const ExecArgs& args) const { float alpha = 1.0f, beta = 0.0f; auto status = miopenConvolutionBackwardData( args.handle->miopen_handle(), &alpha, D.diff_desc.desc, - args.diff_tensor->raw_ptr, D.filter_desc.desc, args.filter_tensor->raw_ptr, - D.conv_desc.desc, algo, &beta, D.grad_desc.desc, args.grad_tensor->raw_ptr, - args.workspace.raw_ptr, args.workspace.size); + args.diff_tensor->raw_ptr(), D.filter_desc.desc, + args.filter_tensor->raw_ptr(), D.conv_desc.desc, algo, &beta, + D.grad_desc.desc, args.grad_tensor->raw_ptr(), args.workspace.raw_ptr, + args.workspace.size); megdnn_assert( status == miopenStatusSuccess, "conv bwd_data failed: %s; info: %s", miopenGetErrorString(status), args.to_string().c_str()); diff --git a/dnn/src/rocm/convolution/backward_filter/matmul.cpp b/dnn/src/rocm/convolution/backward_filter/matmul.cpp index c61431f8..e1b91a61 100644 --- a/dnn/src/rocm/convolution/backward_filter/matmul.cpp +++ b/dnn/src/rocm/convolution/backward_filter/matmul.cpp @@ -80,16 +80,16 @@ void ConvolutionBackwardFilterImpl::AlgoMatmul::exec_internal(const ExecArgs& ar Cl({OC, OH * OW * N}, typename DTypeTrait::dtype()); TensorND A(args.grad_tensor->ptr(), Al), B(col, Bl), C(diff_t, Cl); if (fm.should_flip) { - A.raw_ptr = wbundle.get(2); + A.reset_ptr(wbundle.get(2)); } args.handle->matmul_bT_opr()->exec(C, B, A, Workspace()); if (fm.should_flip) { convolution::flip_filter( args.as_fwd_args(), - {static_cast(args.grad_tensor->raw_ptr), + {static_cast(args.grad_tensor->raw_ptr()), wbundle.get_size(2)}, - A.raw_ptr); + A.get_ref_ptr()); } } } diff --git a/dnn/src/rocm/convolution/backward_filter/miopen.cpp b/dnn/src/rocm/convolution/backward_filter/miopen.cpp index 69a35fd7..60a6b7ca 100644 --- a/dnn/src/rocm/convolution/backward_filter/miopen.cpp +++ b/dnn/src/rocm/convolution/backward_filter/miopen.cpp @@ -78,9 +78,9 @@ miopenConvBwdWeightsAlgorithm_t ConvolutionBackwardFilterImpl::AlgoMIOpen:: int ret_algo_count; miopenConvAlgoPerf_t algo_perf; miopen_check(miopenFindConvolutionBackwardWeightsAlgorithm( - args.handle->miopen_handle(), D.diff_desc.desc, args.diff_tensor->raw_ptr, - D.src_desc.desc, args.src_tensor->raw_ptr, D.conv_desc.desc, - D.grad_desc.desc, args.grad_tensor->raw_ptr, req_algo_count, + args.handle->miopen_handle(), D.diff_desc.desc, args.diff_tensor->raw_ptr(), + D.src_desc.desc, args.src_tensor->raw_ptr(), D.conv_desc.desc, + D.grad_desc.desc, args.grad_tensor->raw_ptr(), 
req_algo_count, &ret_algo_count, &algo_perf, args.workspace.raw_ptr, args.workspace.size, exhaustive_search)); // algo_perf.bwd_weights_algo = miopenConvolutionBwdWeightsAlgoGEMM; @@ -96,9 +96,9 @@ void ConvolutionBackwardFilterImpl::AlgoMIOpen::exec(const ExecArgs& args) const float alpha = 1.0f, beta = 0.0f; auto status = miopenConvolutionBackwardWeights( args.handle->miopen_handle(), &alpha, D.diff_desc.desc, - args.diff_tensor->raw_ptr, D.src_desc.desc, args.src_tensor->raw_ptr, - D.conv_desc.desc, algo, &beta, D.grad_desc.desc, args.grad_tensor->raw_ptr, - args.workspace.raw_ptr, args.workspace.size); + args.diff_tensor->raw_ptr(), D.src_desc.desc, args.src_tensor->raw_ptr(), + D.conv_desc.desc, algo, &beta, D.grad_desc.desc, + args.grad_tensor->raw_ptr(), args.workspace.raw_ptr, args.workspace.size); megdnn_assert( status == miopenStatusSuccess, "conv bwd_filter failed: %s; info: %s", miopenGetErrorString(status), args.to_string().c_str()); diff --git a/dnn/src/rocm/convolution/forward/1x1.cpp b/dnn/src/rocm/convolution/forward/1x1.cpp index 70ef1f16..2349fcaf 100644 --- a/dnn/src/rocm/convolution/forward/1x1.cpp +++ b/dnn/src/rocm/convolution/forward/1x1.cpp @@ -57,17 +57,17 @@ size_t ConvolutionForwardImpl::Algo1x1::get_workspace_in_bytes( void ConvolutionForwardImpl::Algo1x1::exec(const ExecArgs& args) const { TensorND A, B, C; extract_matmul_layouts(args, A.layout, B.layout, C.layout); - A.raw_ptr = args.filter_tensor->raw_ptr; - B.raw_ptr = args.src_tensor->raw_ptr; - C.raw_ptr = args.dst_tensor->raw_ptr; + A.reset_ptr(args.filter_tensor->raw_ptr()); + B.reset_ptr(args.src_tensor->raw_ptr()); + C.reset_ptr(args.dst_tensor->raw_ptr()); size_t batch = args.src_layout->shape[0]; auto mm = args.handle->matmul_opr(); auto strd_B = args.src_layout->stride[0] * args.src_layout->dtype.size(), strd_C = args.dst_layout->stride[0] * args.dst_layout->dtype.size(); for (size_t i = 0; i < batch; ++i) { mm->exec(A, B, C, args.workspace); - incr_voidp(B.raw_ptr, strd_B); - incr_voidp(C.raw_ptr, strd_C); + incr_refp(B.get_ref_ptr(), strd_B); + incr_refp(C.get_ref_ptr(), strd_C); } } @@ -118,9 +118,9 @@ size_t ConvolutionForwardImpl::Algo1x1LargeBatch::get_workspace_in_bytes( void ConvolutionForwardImpl::Algo1x1LargeBatch::exec(const ExecArgs& args) const { TensorND A, B, C; extract_matmul_layouts(args, A.layout, B.layout, C.layout); - A.raw_ptr = args.filter_tensor->raw_ptr; - B.raw_ptr = args.src_tensor->raw_ptr; - C.raw_ptr = args.dst_tensor->raw_ptr; + A.reset_ptr(args.filter_tensor->raw_ptr()); + B.reset_ptr(args.src_tensor->raw_ptr()); + C.reset_ptr(args.dst_tensor->raw_ptr()); auto mm = args.handle->batched_matrix_mul(); mm->exec(A, B, C, args.workspace); } diff --git a/dnn/src/rocm/convolution/forward/matmul.cpp b/dnn/src/rocm/convolution/forward/matmul.cpp index 9bbfbeac..52d1103c 100644 --- a/dnn/src/rocm/convolution/forward/matmul.cpp +++ b/dnn/src/rocm/convolution/forward/matmul.cpp @@ -65,7 +65,7 @@ void ConvolutionForwardImpl::AlgoMatmul::exec_internal(const ExecArgs& args) { Cl({OC, OH * OW * N}, typename DTypeTrait::dtype()); TensorND A(args.filter_tensor->ptr(), Al), B(col, Bl), C(dst_t, Cl); if (fm.should_flip) { - convolution::flip_filter(args, wbundle.get_workspace(2), A.raw_ptr); + convolution::flip_filter(args, wbundle.get_workspace(2), A.get_ref_ptr()); } args.handle->matmul_opr()->exec(A, B, C, Workspace()); TensorLayout C2l({OC * OH * OW, N}, typename DTypeTrait::dtype()), C3l = C2l; diff --git a/dnn/src/rocm/convolution/forward/miopen.cpp 
b/dnn/src/rocm/convolution/forward/miopen.cpp index 611f295b..46a1ed6c 100644 --- a/dnn/src/rocm/convolution/forward/miopen.cpp +++ b/dnn/src/rocm/convolution/forward/miopen.cpp @@ -76,10 +76,10 @@ miopenConvFwdAlgorithm_t ConvolutionForwardImpl::AlgoMIOpen::find_best_algo( int ret_algo_count; miopenConvAlgoPerf_t algo_perf; miopen_check(miopenFindConvolutionForwardAlgorithm( - args.handle->miopen_handle(), D.src_desc.desc, args.src_tensor->raw_ptr, - D.filter_desc.desc, args.filter_tensor->raw_ptr, D.conv_desc.desc, - D.dst_desc.desc, args.dst_tensor->raw_ptr, req_algo_count, &ret_algo_count, - &algo_perf, args.workspace.raw_ptr, args.workspace.size, + args.handle->miopen_handle(), D.src_desc.desc, args.src_tensor->raw_ptr(), + D.filter_desc.desc, args.filter_tensor->raw_ptr(), D.conv_desc.desc, + D.dst_desc.desc, args.dst_tensor->raw_ptr(), req_algo_count, + &ret_algo_count, &algo_perf, args.workspace.raw_ptr, args.workspace.size, exhaustive_search)); sm_miopen_algo_cache.set(args, algo_perf.fwd_algo); return algo_perf.fwd_algo; @@ -93,9 +93,10 @@ void ConvolutionForwardImpl::AlgoMIOpen::exec(const ExecArgs& args) const { float alpha = 1.0f, beta = 0.0f; auto status = miopenConvolutionForward( args.handle->miopen_handle(), &alpha, D.src_desc.desc, - args.src_tensor->raw_ptr, D.filter_desc.desc, args.filter_tensor->raw_ptr, - D.conv_desc.desc, algo, &beta, D.dst_desc.desc, args.dst_tensor->raw_ptr, - args.workspace.raw_ptr, args.workspace.size); + args.src_tensor->raw_ptr(), D.filter_desc.desc, + args.filter_tensor->raw_ptr(), D.conv_desc.desc, algo, &beta, + D.dst_desc.desc, args.dst_tensor->raw_ptr(), args.workspace.raw_ptr, + args.workspace.size); megdnn_assert( status == miopenStatusSuccess, "conv fwd failed: %s; info: %s", miopenGetErrorString(status), args.to_string().c_str()); diff --git a/dnn/src/rocm/convolution/helper.cpp b/dnn/src/rocm/convolution/helper.cpp index c25d47b2..c0df744f 100644 --- a/dnn/src/rocm/convolution/helper.cpp +++ b/dnn/src/rocm/convolution/helper.cpp @@ -84,19 +84,19 @@ WorkspaceBundle convolution::matmul_get_workspace_bundle(const ForwardSizeArgs& } void convolution::flip_filter( - const ForwardSizeArgs& args, const Workspace& workspace, void*& raw_ptr) { + const ForwardSizeArgs& args, const Workspace& workspace, RefPtr& ref_ptr) { auto&& fm = args.filter_meta; megdnn_assert(fm.group == 1 && fm.spatial_ndim == 2); auto OC = fm.ocpg, IC = fm.icpg, FH = fm.spatial[0], FW = fm.spatial[1]; auto dtype = fm.dtype; megdnn_assert(workspace.size >= dtype.size() * OC * IC * FH * FW); - TensorND src{raw_ptr, {{OC, IC, FH, FW}, dtype}}, + TensorND src{{{OC, IC, FH, FW}, dtype}, ref_ptr}, dst{workspace.raw_ptr + (FH * FW - 1) * dtype.size(), src.layout}; dst.layout.stride[2] = -dst.layout.stride[2]; dst.layout.stride[3] = -dst.layout.stride[3]; args.handle->relayout_opr()->exec(src, dst); - raw_ptr = workspace.raw_ptr; + ref_ptr.reset(workspace.raw_ptr); } // vim: syntax=cpp.doxygen diff --git a/dnn/src/rocm/convolution/helper.h b/dnn/src/rocm/convolution/helper.h index 44ef7310..5961461f 100644 --- a/dnn/src/rocm/convolution/helper.h +++ b/dnn/src/rocm/convolution/helper.h @@ -68,7 +68,7 @@ WorkspaceBundle matmul_get_workspace_bundle(const ForwardSizeArgs& args); * change \p raw_ptr to workspace. 
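
The `flip_filter` change above is the other half of that indirection: its out-parameter moves from `void*&` to `RefPtr&`, and after relayouting the flipped filter into the workspace it calls `ref_ptr.reset(workspace.raw_ptr)`, so the caller's tensor subsequently reads the workspace copy. A rough, hypothetical sketch of the caller-visible effect (names and the memcpy placeholder are illustrative, not the real strided relayout or megdnn signature):

// Illustrative shape only: flip the filter into the workspace, then redirect
// the shared pointer cell so A.raw_ptr() afterwards yields the flipped copy.
#include <cstddef>
#include <cstring>
#include <memory>

using RefCell = std::shared_ptr<void*>;   // stand-in for megdnn's RefPtr

void flip_filter_sketch(RefCell& filter, void* workspace, std::size_t bytes) {
    std::memcpy(workspace, *filter, bytes);  // real code does a flipped, strided relayout
    *filter = workspace;                     // "change raw_ptr to workspace"
}
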
* */ void flip_filter( - const ForwardSizeArgs& args, const Workspace& workspace, void*& raw_ptr); + const ForwardSizeArgs& args, const Workspace& workspace, RefPtr& ref_ptr); struct MIOpenForwardDescs { TensorDesc src_desc, filter_desc, dst_desc; diff --git a/dnn/src/rocm/matrix_mul/blas.cpp b/dnn/src/rocm/matrix_mul/blas.cpp index d6bd1b87..238d2715 100644 --- a/dnn/src/rocm/matrix_mul/blas.cpp +++ b/dnn/src/rocm/matrix_mul/blas.cpp @@ -77,10 +77,11 @@ void MatrixMulForwardImpl::AlgoBlas::exec(const ExecArgs& args) const { : rocblas_operation_none, args.opr->param().transposeA ? rocblas_operation_transpose : rocblas_operation_none, - n, m, k, one, args.tensor_b.raw_ptr, rocblas_datatype_f16_r, - args.layout_b.stride[0], args.tensor_a.raw_ptr, rocblas_datatype_f16_r, - args.layout_a.stride[0], zero, args.tensor_c.raw_ptr, - rocblas_datatype_f16_r, args.layout_c.stride[0], args.tensor_c.raw_ptr, + n, m, k, one, args.tensor_b.raw_ptr(), rocblas_datatype_f16_r, + args.layout_b.stride[0], args.tensor_a.raw_ptr(), + rocblas_datatype_f16_r, args.layout_a.stride[0], zero, + args.tensor_c.raw_ptr(), rocblas_datatype_f16_r, + args.layout_c.stride[0], args.tensor_c.raw_ptr(), rocblas_datatype_f16_r, args.layout_c.stride[0], rocblas_datatype_f32_r, rocblas_gemm_algo_standard, solution_index, flags, &ws_size, nullptr); rocblas_check(gemm_ex_err); @@ -97,12 +98,12 @@ void MatrixMulForwardImpl::AlgoBlas::exec(const ExecArgs& args) const { args.opr->param().transposeA ? rocblas_operation_transpose : rocblas_operation_none, n, m, k, reinterpret_cast(one_half), - static_cast(args.tensor_b.raw_ptr), + static_cast(args.tensor_b.raw_ptr()), args.layout_b.stride[0], - static_cast(args.tensor_a.raw_ptr), + static_cast(args.tensor_a.raw_ptr()), args.layout_a.stride[0], reinterpret_cast(zero_half), - static_cast(args.tensor_c.raw_ptr), + static_cast(args.tensor_c.raw_ptr()), args.layout_c.stride[0]); rocblas_check(hgemm_err); }; @@ -145,11 +146,12 @@ void MatrixMulForwardImpl::AlgoBlas::exec(const ExecArgs& args) const { : rocblas_operation_none, args.opr->param().transposeA ? 
rocblas_operation_transpose : rocblas_operation_none, - n, m, k, one, args.tensor_b.raw_ptr, rocblas_datatype_i8_r, - args.layout_b.stride[0], args.tensor_a.raw_ptr, rocblas_datatype_i8_r, - args.layout_a.stride[0], zero, args.tensor_c.raw_ptr, - rocblas_datatype_i32_r, args.layout_c.stride[0], args.tensor_c.raw_ptr, - rocblas_datatype_i32_r, args.layout_c.stride[0], rocblas_datatype_i32_r, + n, m, k, one, args.tensor_b.raw_ptr(), rocblas_datatype_i8_r, + args.layout_b.stride[0], args.tensor_a.raw_ptr(), rocblas_datatype_i8_r, + args.layout_a.stride[0], zero, args.tensor_c.raw_ptr(), + rocblas_datatype_i32_r, args.layout_c.stride[0], + args.tensor_c.raw_ptr(), rocblas_datatype_i32_r, + args.layout_c.stride[0], rocblas_datatype_i32_r, rocblas_gemm_algo_standard, solution_index, flags, &ws_size, nullptr)); MEGDNN_MARK_USED_VAR(ws_size); } diff --git a/dnn/src/rocm/param_pack/opr_impl.cpp b/dnn/src/rocm/param_pack/opr_impl.cpp index a398ac3a..2f4454ab 100644 --- a/dnn/src/rocm/param_pack/opr_impl.cpp +++ b/dnn/src/rocm/param_pack/opr_impl.cpp @@ -29,7 +29,7 @@ void ParamPackConcatImpl::exec_internal( size_t inp_size = srcs.layout.shape[0], out_size = dst.layout.total_nr_elems(); auto stream = hip_stream(this->handle()); - auto src_cpu = static_cast(srcs.raw_ptr); + auto src_cpu = static_cast(srcs.raw_ptr()); megdnn_assert_internal(src_cpu); auto src_gpu = reinterpret_cast(workspace.raw_ptr); diff --git a/dnn/src/rocm/pooling/algo.cpp b/dnn/src/rocm/pooling/algo.cpp index 6641bff0..3aeeee5a 100644 --- a/dnn/src/rocm/pooling/algo.cpp +++ b/dnn/src/rocm/pooling/algo.cpp @@ -93,8 +93,8 @@ void PoolingForwardImpl::AlgoMIOpen::exec(const ExecArgs& args) const { dt_float32 alpha = 1.0f, beta = 0.0f; miopen_check(miopenPoolingForward( - handle, miopen_desc, &alpha, src_desc.desc, args.src_tensor->raw_ptr, &beta, - dst_desc.desc, args.dst_tensor->raw_ptr, false, nullptr, 0_z)); + handle, miopen_desc, &alpha, src_desc.desc, args.src_tensor->raw_ptr(), + &beta, dst_desc.desc, args.dst_tensor->raw_ptr(), false, nullptr, 0_z)); miopen_check(miopenDestroyPoolingDescriptor(miopen_desc)); } @@ -189,13 +189,13 @@ void PoolingBackwardImpl::AlgoMIOpen::exec(const ExecArgs& args) const { //! of the forward opr which stored in workspace. We have to recompute //! the indices by calling miopenPoolingForward again. 
miopen_check(miopenPoolingForward( - handle, miopen_desc, &alpha, src_desc.desc, args.src_tensor->raw_ptr, - &beta, dst_desc.desc, args.dst_tensor->raw_ptr, true, + handle, miopen_desc, &alpha, src_desc.desc, args.src_tensor->raw_ptr(), + &beta, dst_desc.desc, args.dst_tensor->raw_ptr(), true, args.workspace.raw_ptr, args.workspace.size)); } miopen_check(miopenPoolingBackward( - handle, miopen_desc, &alpha, dst_desc.desc, args.dst_tensor->raw_ptr, - diff_desc.desc, args.diff_tensor->raw_ptr, src_desc.desc, - args.src_tensor->raw_ptr, &beta, grad_desc.desc, args.grad_tensor->raw_ptr, - args.workspace.raw_ptr)); + handle, miopen_desc, &alpha, dst_desc.desc, args.dst_tensor->raw_ptr(), + diff_desc.desc, args.diff_tensor->raw_ptr(), src_desc.desc, + args.src_tensor->raw_ptr(), &beta, grad_desc.desc, + args.grad_tensor->raw_ptr(), args.workspace.raw_ptr)); } \ No newline at end of file diff --git a/dnn/src/rocm/reduce/opr_impl.cpp b/dnn/src/rocm/reduce/opr_impl.cpp index c9cd41e5..9530f641 100644 --- a/dnn/src/rocm/reduce/opr_impl.cpp +++ b/dnn/src/rocm/reduce/opr_impl.cpp @@ -16,7 +16,7 @@ #include "src/rocm/handle.h" #include "src/rocm/utils.h" -#include "src/common/reduce_helper.h" +#include "src/common/reduce_helper_device.h" namespace { @@ -131,10 +131,10 @@ namespace rocm { void ReduceForwardImpl::exec( _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) { - using namespace reduce; + using namespace device_reduce; check_exec(src.layout, dst.layout, workspace.size); size_t A, B, C; - get_ABC(src.layout, A, B, C, param().axis); + reduce::get_ABC(src.layout, A, B, C, param().axis); auto stream = hip_stream(this->handle()); #define CASE(_mode, _op) \ case _mode: \ @@ -158,9 +158,9 @@ size_t ReduceForwardImpl::get_workspace_in_bytes( megdnn_assert( param().data_type != Reduce::DataType::FLOAT_IO16xC32, "FLOAT_IO16xC32 is deprecated"); - using namespace reduce; + using namespace device_reduce; size_t A, B, C; - get_ABC(src, A, B, C, param().axis); + reduce::get_ABC(src, A, B, C, param().axis); #define CASE(_mode, _op) \ case _mode: { \ return dispatch_dtype_workspace<_op>(src, dst, A, B, C, param().data_type); \ diff --git a/dnn/src/rocm/reduce/reduce.cpp.hip b/dnn/src/rocm/reduce/reduce.cpp.hip index 405eb134..a6f8d59a 100644 --- a/dnn/src/rocm/reduce/reduce.cpp.hip +++ b/dnn/src/rocm/reduce/reduce.cpp.hip @@ -11,7 +11,7 @@ */ #include "hcc_detail/hcc_defs_prologue.h" #include "hip_header.h" -#include "src/common/reduce_helper.h" +#include "src/common/reduce_helper_device.h" #include "megdnn/dtype.h" #include "src/rocm/reduce_helper.h.hip" @@ -19,7 +19,7 @@ namespace megdnn { namespace rocm { -using namespace reduce; +using namespace device_reduce; #define COMMA , diff --git a/dnn/src/rocm/relayout/opr_impl.cpp b/dnn/src/rocm/relayout/opr_impl.cpp index e257086a..ffb727bc 100644 --- a/dnn/src/rocm/relayout/opr_impl.cpp +++ b/dnn/src/rocm/relayout/opr_impl.cpp @@ -33,7 +33,7 @@ bool RelayoutForwardImpl::Param::try_copy_contig() { if (lsrc.stride[0] != 1 || ldst.stride[0] != 1) return false; hip_check(hipMemcpyAsync( - m_dst.raw_ptr, m_src.raw_ptr, ldst.total_nr_elems() * dtype_size(), + m_dst.raw_ptr(), m_src.raw_ptr(), ldst.total_nr_elems() * dtype_size(), hipMemcpyDeviceToDevice, m_opr->stream())); return true; } @@ -82,7 +82,7 @@ bool RelayoutForwardImpl::Param::try_copy_2d() { //! 
TODO: need refactor, hipMemcpy2DAsync has bug auto dsize = dtype_size(); hip_check(hipMemcpy2DAsync( - m_dst.raw_ptr, ldst.stride[0] * dsize, m_src.raw_ptr, + m_dst.raw_ptr(), ldst.stride[0] * dsize, m_src.raw_ptr(), lsrc.stride[0] * dsize, ldst.shape[1] * dsize, ldst.shape[0], hipMemcpyDeviceToDevice, m_opr->stream())); diff --git a/dnn/src/x86/add_update/opr_impl.cpp b/dnn/src/x86/add_update/opr_impl.cpp index cd32bb74..bfb67e15 100644 --- a/dnn/src/x86/add_update/opr_impl.cpp +++ b/dnn/src/x86/add_update/opr_impl.cpp @@ -65,7 +65,8 @@ void AddUpdateImpl::exec(_megdnn_tensor_inout dest, _megdnn_tensor_in delta) { dest.layout.is_contiguous() && delta.layout.eq_shape(dest.layout) && dest.layout.dtype == delta.layout.dtype) { if (dest.layout.dtype == ::megdnn::dtype::Float32()) { - MEGDNN_DISPATCH_CPU_KERN_OPR(add_update_fp32_fma(dest, delta, m_param)); + auto param = m_param; + MEGDNN_DISPATCH_CPU_KERN_OPR(add_update_fp32_fma(dest, delta, param)); return; } } diff --git a/dnn/src/x86/conv_bias/f32/algos.cpp b/dnn/src/x86/conv_bias/f32/algos.cpp index 54cf7866..33f3d333 100644 --- a/dnn/src/x86/conv_bias/f32/algos.cpp +++ b/dnn/src/x86/conv_bias/f32/algos.cpp @@ -500,7 +500,7 @@ static inline void mkldnn_fp32_conv_instance( dnnl::memory::format_tag::nchw); } auto user_src_mem = - dnnl::memory(user_src_desc, eng_mkldnn, const_cast(param.src_ptr)); + dnnl::memory(user_src_desc, eng_mkldnn, param.src_ptr.get_ptr()); auto weight_tag = dnnl::memory::format_tag::OIhw8i8o; if (group > 1) { @@ -517,8 +517,8 @@ static inline void mkldnn_fp32_conv_instance( auto user_weights_desc = dnnl::memory::desc( {weight_shape}, dnnl::memory::data_type::f32, weight_tag); - auto user_weights_mem = dnnl::memory( - user_weights_desc, eng_mkldnn, const_cast(param.filter_ptr)); + auto user_weights_mem = + dnnl::memory(user_weights_desc, eng_mkldnn, param.filter_ptr.get_ptr()); auto user_bias_desc = dnnl::memory::desc(); if (param.bias_mode == megdnn::BiasMode::BROADCAST_CHANNEL_BIAS) { user_bias_desc = dnnl::memory::desc( @@ -526,12 +526,12 @@ static inline void mkldnn_fp32_conv_instance( dnnl::memory::format_tag::x); } auto user_bias_mem = - dnnl::memory(user_bias_desc, eng_mkldnn, const_cast(param.bias_ptr)); + dnnl::memory(user_bias_desc, eng_mkldnn, param.bias_ptr.get_ptr()); auto user_dst_desc = dnnl::memory::desc( {dst_shape}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::nChw8c); auto user_dst_mem = - dnnl::memory(user_dst_desc, eng_mkldnn, const_cast(param.dst_ptr)); + dnnl::memory(user_dst_desc, eng_mkldnn, param.dst_ptr.get_ptr()); auto conv_desc = dnnl::convolution_forward::desc( dnnl::prop_kind::forward_inference, dnnl::algorithm::convolution_auto, user_src_mem.get_desc(), user_weights_mem.get_desc(), @@ -599,9 +599,9 @@ struct NCBKernParamEqual { struct NCBKernParamHash { std::size_t operator()(const fallback::ConvBiasImpl::NCBKernParam& param) const { - std::size_t result = reinterpret_cast(param.filter_ptr); - result = result ^ (reinterpret_cast(param.src_ptr) << 3); - result = result ^ (reinterpret_cast(param.dst_ptr) << 7); + std::size_t result = reinterpret_cast(param.filter_ptr.get_ptr()); + result = result ^ (reinterpret_cast(param.src_ptr.get_ptr()) << 3); + result = result ^ (reinterpret_cast(param.dst_ptr.get_ptr()) << 7); result = result ^ (static_cast(param.n) << 11); return result; }; @@ -669,14 +669,14 @@ void ConvBiasImpl::AlgoMkldnnConv::kern_mkldnn_fp32( *do not need any bias op **/ PostProcess::run( - param.dst_ptr, const_cast(param.bias_ptr), param.dst_ptr, - 
megdnn::BiasMode::NO_BIAS, param.nonlineMode, param.bias_type, - param.dst_type, in, oc, oh, ow); + param.dst_ptr.get_ptr(), param.bias_ptr.get_ptr(), + param.dst_ptr.get_ptr(), megdnn::BiasMode::NO_BIAS, param.nonlineMode, + param.bias_type, param.dst_type, in, oc, oh, ow); } else if (param.bias_mode == megdnn::BiasMode::BIAS) { PostProcess::run( - param.dst_ptr, const_cast(param.bias_ptr), param.dst_ptr, - param.bias_mode, param.nonlineMode, param.bias_type, param.dst_type, in, - oc, oh, ow); + param.dst_ptr.get_ptr(), param.bias_ptr.get_ptr(), + param.dst_ptr.get_ptr(), param.bias_mode, param.nonlineMode, + param.bias_type, param.dst_type, in, oc, oh, ow); } } #endif diff --git a/dnn/src/x86/conv_bias/int8/algos.cpp b/dnn/src/x86/conv_bias/int8/algos.cpp index f82bbbf6..77f55b0b 100644 --- a/dnn/src/x86/conv_bias/int8/algos.cpp +++ b/dnn/src/x86/conv_bias/int8/algos.cpp @@ -530,11 +530,11 @@ void ConvBiasImpl::AlgoMkldnnMatmulQint8::kern_mkldnn_matmul_s8x8x32( { TensorND A_, B_, C_; A_.layout = TensorLayout({OC, IC * FH * FW}, dtype::Int8()); - A_.raw_ptr = const_cast(param.filter(group_id)); + A_.reset_ptr(const_cast(param.filter(group_id))); B_.layout = TensorLayout({IC * FH * FW, OH * OW}, dtype::Int8()); - B_.raw_ptr = B; + B_.reset_ptr(B); C_.layout = TensorLayout({OC, OH * OW}, dtype::Int32()); - C_.raw_ptr = dst; + C_.reset_ptr(dst); Workspace workspace( static_cast(bundle.get(2)), bundle.get_size(2)); get_matmul_opr()->exec(A_, B_, C_, workspace); diff --git a/dnn/src/x86/cvt_color/opr_impl.cpp b/dnn/src/x86/cvt_color/opr_impl.cpp index f50d214b..1d963944 100644 --- a/dnn/src/x86/cvt_color/opr_impl.cpp +++ b/dnn/src/x86/cvt_color/opr_impl.cpp @@ -1833,12 +1833,11 @@ void cvt_bt601_yuv( template void CvtColorImpl::cvt_color_exec( - _megdnn_tensor_in src_tensor, _megdnn_tensor_out dst_tensor) { - auto mode = param().mode; + _megdnn_tensor_in src_tensor, _megdnn_tensor_out dst_tensor, Param::Mode mode) { for (size_t i = 0; i < src_tensor.layout.shape[0]; ++i) { Mat src = TensorND2Mat(src_tensor, i); Mat dst = TensorND2Mat(dst_tensor, i); - switch (param().mode) { + switch (mode) { case Param::Mode::RGB2GRAY: cvt_rgb2gray(src, dst); break; @@ -1926,11 +1925,12 @@ void CvtColorImpl::exec( naive::CvtColorImpl::exec(src, dst, workspace); return; } + auto mode = this->param().mode; MEGDNN_DISPATCH_CPU_KERN_OPR( if (dst.layout.dtype == dtype::Float32()) { - cvt_color_exec(src, dst); + cvt_color_exec(src, dst, mode); } else if (dst.layout.dtype == dtype::Uint8()) { - cvt_color_exec(src, dst); + cvt_color_exec(src, dst, mode); } else { megdnn_throw("Unsupported datatype of CvtColor optr."); }); } diff --git a/dnn/src/x86/cvt_color/opr_impl.h b/dnn/src/x86/cvt_color/opr_impl.h index c64d2d6f..1eeb23e3 100644 --- a/dnn/src/x86/cvt_color/opr_impl.h +++ b/dnn/src/x86/cvt_color/opr_impl.h @@ -18,7 +18,8 @@ namespace x86 { class CvtColorImpl : public naive::CvtColorImpl { private: template - void cvt_color_exec(_megdnn_tensor_in src, _megdnn_tensor_out dst); + void cvt_color_exec( + _megdnn_tensor_in src, _megdnn_tensor_out dst, Param::Mode mode); public: using naive::CvtColorImpl::CvtColorImpl; diff --git a/dnn/src/x86/elemwise/opr_impl.cpp b/dnn/src/x86/elemwise/opr_impl.cpp index 96f7db9f..8a490467 100644 --- a/dnn/src/x86/elemwise/opr_impl.cpp +++ b/dnn/src/x86/elemwise/opr_impl.cpp @@ -57,9 +57,11 @@ void check_mkl_error(const char* func) { } // namespace #if MEGDNN_X86_WITH_MKL -#define DISPATCH_MKL(_mode, _func) \ - case Mode::_mode: \ - MEGDNN_DISPATCH_CPU_KERN_OPR(_func(n, sptr, dptr); 
check_mkl_error(#_func)); \ +#define DISPATCH_MKL(_mode, _func) \ + case Mode::_mode: \ + MEGDNN_DISPATCH_CPU_KERN_OPR( \ + _func(n, src.ptr(), dst.ptr()); \ + check_mkl_error(#_func)); \ return true #endif @@ -84,15 +86,15 @@ void check_mkl_error(const char* func) { } while (0) bool ElemwiseImpl::exec_unary() { -#define DISPATCH_UNARY(_mode, _type, _simd_type, _op) \ - case Mode::_mode: { \ - thin_function run = \ - OpCallerUnary<_op<_simd_type, _type, _type>, _simd_type>::run; \ - MEGDNN_DISPATCH_CPU_KERN_OPR( \ - run(static_cast(src0.raw_ptr), \ - static_cast<_type*>(dst_tensor.raw_ptr), src0.layout.dtype, \ - dst_tensor.layout.dtype, nr_elems)); \ - return true; \ +#define DISPATCH_UNARY(_mode, _type, _simd_type, _op) \ + case Mode::_mode: { \ + thin_function run = \ + OpCallerUnary<_op<_simd_type, _type, _type>, _simd_type>::run; \ + MEGDNN_DISPATCH_CPU_KERN_OPR( \ + run(static_cast(src0.raw_ptr()), \ + static_cast<_type*>(dst_tensor.raw_ptr()), src0.layout.dtype, \ + dst_tensor.layout.dtype, nr_elems)); \ + return true; \ } if (m_src->size() != 1) @@ -141,8 +143,8 @@ bool ElemwiseImpl::exec_unary() { #if MEGDNN_X86_WITH_MKL if (m_dst->layout.dtype == dtype::Float32()) { - auto n = elparam[0].layout.shape[0]; - auto sptr = elparam[0].ptr(), dptr = m_dst->ptr(); + auto n = src0.layout.shape[0]; + auto src = src0, dst = dst_tensor; auto mkl_dispatch = [&]() { switch (param().mode) { @@ -214,9 +216,9 @@ bool ElemwiseImpl::exec_binary() { run = OpCallerBinary< \ _op<_simd_type, _type, _type>, _simd_type, VEC_VEC>::run; \ MEGDNN_DISPATCH_CPU_KERN_OPR(run( \ - static_cast(src0.raw_ptr), \ - static_cast(src1.raw_ptr), \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + static_cast(src0.raw_ptr()), \ + static_cast(src1.raw_ptr()), \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, dst.layout.dtype, src0.layout.total_nr_elems())); \ return true; \ } @@ -234,9 +236,9 @@ bool ElemwiseImpl::exec_binary() { run = OpCallerBinary< \ _op<_simd_type, _type, _type>, _simd_type, VEC_SCALAR>::run; \ MEGDNN_DISPATCH_CPU_KERN_OPR(run( \ - static_cast(src0.raw_ptr), \ - static_cast(src1.raw_ptr)[0], \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + static_cast(src0.raw_ptr()), \ + static_cast(src1.raw_ptr())[0], \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, dst.layout.dtype, src0.layout.total_nr_elems())); \ return true; \ } @@ -264,9 +266,9 @@ bool ElemwiseImpl::exec_binary() { run = OpCallerBinary< \ _op<_simd_type, _type, _type>, _simd_type, SCALAR_VEC>::run; \ MEGDNN_DISPATCH_CPU_KERN_OPR(run( \ - static_cast(src0.raw_ptr)[0], \ - static_cast(src1.raw_ptr), \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + static_cast(src0.raw_ptr())[0], \ + static_cast(src1.raw_ptr()), \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, dst.layout.dtype, src1.layout.total_nr_elems())); \ return true; \ } @@ -289,9 +291,9 @@ bool ElemwiseImpl::exec_binary() { run = OpCallerBinary< \ _op<_simd_type, _type, _type>, _simd_type, VEC_BCAST101>::run; \ MEGDNN_DISPATCH_CPU_KERN_OPR( \ - run(static_cast(src0.raw_ptr), \ - static_cast(src1.raw_ptr), \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + run(static_cast(src0.raw_ptr()), \ + static_cast(src1.raw_ptr()), \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, dst.layout.dtype, binfo.x, binfo.y, binfo.z)); \ return true; \ } @@ -323,9 +325,9 @@ bool ElemwiseImpl::exec_binary() { run = OpCallerBinary< \ 
_op<_simd_type, _type, _type>, _simd_type, BCAST101_VEC>::run; \ MEGDNN_DISPATCH_CPU_KERN_OPR( \ - run(static_cast(src0.raw_ptr), \ - static_cast(src1.raw_ptr), \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + run(static_cast(src0.raw_ptr()), \ + static_cast(src1.raw_ptr()), \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, dst.layout.dtype, binfo.x, binfo.y, binfo.z)); \ return true; \ } @@ -347,9 +349,9 @@ bool ElemwiseImpl::exec_binary() { _op<_simd_type, _type, _type>, _simd_type, \ BCAST101x_VEC>::run; \ MEGDNN_DISPATCH_CPU_KERN_OPR( \ - run(static_cast(src0.raw_ptr), \ - static_cast(src1.raw_ptr), \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + run(static_cast(src0.raw_ptr()), \ + static_cast(src1.raw_ptr()), \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, dst.layout.dtype, batch_size, binfo.x, binfo.y, \ binfo.z)); \ return true; \ @@ -428,10 +430,10 @@ bool ElemwiseImpl::exec_ternary_fma3() { run = OpCallerTernary< \ _op<_simd_type, _type, _type>, _simd_type, VEC_VEC_VEC>::run; \ MEGDNN_DISPATCH_CPU_KERN_OPR( \ - run(static_cast(src0.raw_ptr), \ - static_cast(src1.raw_ptr), \ - static_cast(src2.raw_ptr), \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + run(static_cast(src0.raw_ptr()), \ + static_cast(src1.raw_ptr()), \ + static_cast(src2.raw_ptr()), \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, src2.layout.dtype, dst.layout.dtype, \ src0.layout.total_nr_elems())); \ return true; \ @@ -457,10 +459,10 @@ bool ElemwiseImpl::exec_ternary_fma3() { _op<_simd_type, _type, _type>, _simd_type, \ VEC_VEC_SCALAR>::run; \ MEGDNN_DISPATCH_CPU_KERN_OPR( \ - run(static_cast(src0.raw_ptr), \ - static_cast(src1.raw_ptr), \ - static_cast(src2.raw_ptr)[0], \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + run(static_cast(src0.raw_ptr()), \ + static_cast(src1.raw_ptr()), \ + static_cast(src2.raw_ptr())[0], \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, src2.layout.dtype, dst.layout.dtype, \ src0.layout.total_nr_elems())); \ return true; \ @@ -488,10 +490,10 @@ bool ElemwiseImpl::exec_ternary_fma3() { _op<_simd_type, _type, _type>, _simd_type, \ BCAST101_VEC_BCAST101>::run; \ MEGDNN_DISPATCH_CPU_KERN_OPR( \ - run(static_cast(src0.raw_ptr), \ - static_cast(src1.raw_ptr), \ - static_cast(src2.raw_ptr), \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + run(static_cast(src0.raw_ptr()), \ + static_cast(src1.raw_ptr()), \ + static_cast(src2.raw_ptr()), \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, src2.layout.dtype, dst.layout.dtype, binfo.x, \ binfo.y, binfo.z)); \ return true; \ @@ -519,10 +521,10 @@ bool ElemwiseImpl::exec_ternary_fma3() { _op<_simd_type, _type, _type>, _simd_type, \ VEC_BCAST101_VEC>::run; \ MEGDNN_DISPATCH_CPU_KERN_OPR( \ - run(static_cast(src0.raw_ptr), \ - static_cast(src1.raw_ptr), \ - static_cast(src2.raw_ptr), \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + run(static_cast(src0.raw_ptr()), \ + static_cast(src1.raw_ptr()), \ + static_cast(src2.raw_ptr()), \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, src2.layout.dtype, dst.layout.dtype, binfo.x, \ binfo.y, binfo.z)); \ return true; \ @@ -548,10 +550,10 @@ bool ElemwiseImpl::exec_ternary_fma3() { _op<_simd_type, _type, _type>, _simd_type, \ VEC_SCALAR_VEC>::run; \ MEGDNN_DISPATCH_CPU_KERN_OPR( \ - run(static_cast(src0.raw_ptr), \ - static_cast(src1.raw_ptr)[0], \ - 
static_cast(src2.raw_ptr), \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + run(static_cast(src0.raw_ptr()), \ + static_cast(src1.raw_ptr())[0], \ + static_cast(src2.raw_ptr()), \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, src2.layout.dtype, dst.layout.dtype, \ src0.layout.total_nr_elems())); \ return true; \ @@ -577,10 +579,10 @@ bool ElemwiseImpl::exec_ternary_fma3() { _op<_simd_type, _type, _type>, _simd_type, \ VEC_SCALAR_SCALAR>::run; \ MEGDNN_DISPATCH_CPU_KERN_OPR( \ - run(static_cast(src0.raw_ptr), \ - static_cast(src1.raw_ptr)[0], \ - static_cast(src2.raw_ptr)[0], \ - static_cast<_type*>(dst.raw_ptr), src0.layout.dtype, \ + run(static_cast(src0.raw_ptr()), \ + static_cast(src1.raw_ptr())[0], \ + static_cast(src2.raw_ptr())[0], \ + static_cast<_type*>(dst.raw_ptr()), src0.layout.dtype, \ src1.layout.dtype, src2.layout.dtype, dst.layout.dtype, \ src0.layout.total_nr_elems())); \ return true; \ diff --git a/dnn/src/x86/gaussian_blur/opr_impl.cpp b/dnn/src/x86/gaussian_blur/opr_impl.cpp index 34573810..0b3ef66e 100644 --- a/dnn/src/x86/gaussian_blur/opr_impl.cpp +++ b/dnn/src/x86/gaussian_blur/opr_impl.cpp @@ -76,14 +76,14 @@ using BorderMode = param::GaussianBlur::BorderMode; template void GaussianBlurImpl::gaussian_blur_exec( - const TensorND& src_tensor, const TensorND& dst_tensor) { - Size ksize = Size(param().kernel_height, param().kernel_width); + const TensorND& src_tensor, const TensorND& dst_tensor, const Param& param) { + Size ksize = Size(param.kernel_height, param.kernel_width); Mat kernel_column(1, ksize.cols(), 1); Mat kernel_row(1, ksize.rows(), 1); gaussian_blur::createGaussianKernels( - kernel_column, kernel_row, ksize, param().sigma_x, param().sigma_y); + kernel_column, kernel_row, ksize, param.sigma_x, param.sigma_y); size_t src_channels = src_tensor.layout.shape[3]; T border_value[4] = {0, 0, 0, 0}; @@ -94,9 +94,9 @@ void GaussianBlurImpl::gaussian_blur_exec( BaseColumnFilter* column_filter = getLinearColumnFilter(kernel_row, (int)0); FilterEngine filter( - row_filter, column_filter, src_channels, border_value, param().border_mode); + row_filter, column_filter, src_channels, border_value, param.border_mode); - megdnn_assert(param().border_mode != BorderMode::BORDER_ISOLATED); + megdnn_assert(param.border_mode != BorderMode::BORDER_ISOLATED); for (size_t i = 0; i < src_tensor.layout.shape[0]; ++i) { Mat src = TensorND2Mat(src_tensor, i); Mat dst = TensorND2Mat(dst_tensor, i); @@ -106,15 +106,15 @@ void GaussianBlurImpl::gaussian_blur_exec( } void GaussianBlurImpl::gaussian_blur_exec_8u( - const TensorND& src_tensor, const TensorND& dst_tensor) { + const TensorND& src_tensor, const TensorND& dst_tensor, const Param& param) { megdnn_assert(src_tensor.layout.dtype == dtype::Uint8()); - Size ksize = Size(param().kernel_height, param().kernel_width); + Size ksize = Size(param.kernel_height, param.kernel_width); Mat kernel_column(1, ksize.cols(), 1); Mat kernel_row(1, ksize.rows(), 1); gaussian_blur::createGaussianKernels( - kernel_column, kernel_row, ksize, param().sigma_x, param().sigma_y); + kernel_column, kernel_row, ksize, param.sigma_x, param.sigma_y); size_t src_channels = src_tensor.layout.shape[3]; const uint8_t bits = 8; @@ -138,9 +138,9 @@ void GaussianBlurImpl::gaussian_blur_exec_8u( getLinearColumnFilter(kernel_row_int, bits * 2); FilterEngine filter( - rowFilter, columnFilter, src_channels, border_value, param().border_mode); + rowFilter, columnFilter, src_channels, border_value, param.border_mode); - 
megdnn_assert(param().border_mode != BorderMode::BORDER_ISOLATED); + megdnn_assert(param.border_mode != BorderMode::BORDER_ISOLATED); for (size_t i = 0; i < src_tensor.layout.shape[0]; ++i) { Mat src = TensorND2Mat(src_tensor, i); Mat dst = TensorND2Mat(dst_tensor, i); @@ -153,11 +153,12 @@ void GaussianBlurImpl::exec( _megdnn_tensor_in src, _megdnn_tensor_in dst, _megdnn_workspace workspace) { using namespace megcv; check_exec(src.layout, dst.layout, workspace.size); + Param param = this->param(); MEGDNN_DISPATCH_CPU_KERN_OPR( if (dst.layout.dtype == dtype::Float32()) { - gaussian_blur_exec(src, dst); + gaussian_blur_exec(src, dst, param); } else if (dst.layout.dtype == dtype::Uint8()) { - gaussian_blur_exec_8u(src, dst); + gaussian_blur_exec_8u(src, dst, param); } else { megdnn_throw("Unsupported datatype of GaussianBlur optr."); }); } diff --git a/dnn/src/x86/gaussian_blur/opr_impl.h b/dnn/src/x86/gaussian_blur/opr_impl.h index de35b900..b721c3e4 100644 --- a/dnn/src/x86/gaussian_blur/opr_impl.h +++ b/dnn/src/x86/gaussian_blur/opr_impl.h @@ -21,8 +21,10 @@ namespace x86 { class GaussianBlurImpl : public GaussianBlur { private: template - void gaussian_blur_exec(const TensorND& src_tensor, const TensorND& dst_tensor); - void gaussian_blur_exec_8u(const TensorND& src_tensor, const TensorND& dst_tensor); + void gaussian_blur_exec( + const TensorND& src_tensor, const TensorND& dst_tensor, const Param& param); + void gaussian_blur_exec_8u( + const TensorND& src_tensor, const TensorND& dst_tensor, const Param& param); template void createGaussianKernels( @@ -31,6 +33,7 @@ private: public: using GaussianBlur::GaussianBlur; + using Param = param::GaussianBlur; size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&) override { return 0; } diff --git a/dnn/src/x86/lrn/opr_impl.cpp b/dnn/src/x86/lrn/opr_impl.cpp index 991e6aae..340f7a02 100644 --- a/dnn/src/x86/lrn/opr_impl.cpp +++ b/dnn/src/x86/lrn/opr_impl.cpp @@ -87,7 +87,6 @@ void LRNImpl::exec( check_exec(src.layout, dst.layout, workspace.size); auto N = src.layout.shape[0], C = src.layout.shape[1], H = src.layout.shape[2], W = src.layout.shape[3]; - auto sptr_ = src.ptr(), dptr_ = dst.ptr(); std::function @@ -105,11 +104,12 @@ void LRNImpl::exec( auto k = param().k; auto alpha = param().alpha; auto beta = param().beta; - MEGDNN_DISPATCH_CPU_KERN_OPR(auto sptr = sptr_; auto dptr = dptr_; rep(i, N) { - f(sptr, dptr, C, H, W, n, k, alpha, beta); - sptr += C * H * W; - dptr += C * H * W; - }); + MEGDNN_DISPATCH_CPU_KERN_OPR(auto sptr = src.ptr(); + auto dptr = dst.ptr(); rep(i, N) { + f(sptr, dptr, C, H, W, n, k, alpha, beta); + sptr += C * H * W; + dptr += C * H * W; + }); } } // namespace x86 diff --git a/dnn/src/x86/matrix_mul/algos.cpp b/dnn/src/x86/matrix_mul/algos.cpp index a357878c..1e6aa298 100644 --- a/dnn/src/x86/matrix_mul/algos.cpp +++ b/dnn/src/x86/matrix_mul/algos.cpp @@ -203,9 +203,9 @@ void int8x8x32_kern_mkldnn(const MatrixMulImpl::KernParam& kern_param) { const float alpha = 1.0f, beta = 0.0f; const int8_t ao = 0, bo = 0; const int32_t co = 0; - const int8_t* A_ptr = static_cast(kern_param.A_ptr); - const int8_t* B_ptr = static_cast(kern_param.B_ptr); - int32_t* C_ptr = static_cast(kern_param.C_ptr); + const int8_t* A_ptr = static_cast(kern_param.A_ptr.get_ptr()); + const int8_t* B_ptr = static_cast(kern_param.B_ptr.get_ptr()); + int32_t* C_ptr = static_cast(kern_param.C_ptr.get_ptr()); auto status = mkldnn_gemm_s8s8s32( transA, transB, offsetC, M, N, K, alpha, A_ptr, LDA, ao, B_ptr, LDB, bo, beta, C_ptr, LDC, 
&co); diff --git a/dnn/src/x86/pooling/algo.cpp b/dnn/src/x86/pooling/algo.cpp index dc04e79e..f0e3d6fb 100644 --- a/dnn/src/x86/pooling/algo.cpp +++ b/dnn/src/x86/pooling/algo.cpp @@ -51,8 +51,7 @@ dnnl::memory tensor_to_mkl_memory( auto megdnn_src_memory = dnnl::memory(megdnn_src_md, mkldnn_eng); return megdnn_src_memory; } else { - auto megdnn_src_memory = - dnnl::memory(megdnn_src_md, mkldnn_eng, const_cast(src.raw_ptr)); + auto megdnn_src_memory = dnnl::memory(megdnn_src_md, mkldnn_eng, src.raw_ptr()); return megdnn_src_memory; } } @@ -92,8 +91,8 @@ PoolingImpl::AlgoBase::ExecArgs::ExecArgs( PoolingImpl* opr, _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) : SizeArgs(opr, src.layout, dst.layout), - src_tensor{&src}, - dst_tensor{&dst}, + src_tensor{src}, + dst_tensor{dst}, workspace{workspace} {} std::string PoolingImpl::AlgoBase::SizeArgs::to_string() const { @@ -123,14 +122,16 @@ void PoolingImpl::AlgoMeanW2S2AVX::exec(const ExecArgs& args) const { auto OW = args.layout_dst.shape[3]; auto PH = args.opr->param().pad_h; auto PW = args.opr->param().pad_w; - auto sptr = reinterpret_cast(args.src_tensor->raw_ptr); - auto dptr = reinterpret_cast(args.dst_tensor->raw_ptr); + auto handle = [=]() { return args.handle; }; - MEGDNN_DISPATCH_CPU_KERN_OPR(rep(n, N) rep(c, C) { - mean_pooling_w2x2_s2x2_avx( - sptr + n * C * IH * IW + c * IH * IW, IH, IW, - dptr + n * C * OH * OW + c * OH * OW, OH, OW, PH, PW, true); - }); + MEGDNN_DISPATCH_CPU_KERN_OPR( + auto sptr = reinterpret_cast(args.src_tensor.raw_ptr()); + auto dptr = reinterpret_cast(args.dst_tensor.raw_ptr()); + rep(n, N) rep(c, C) { + mean_pooling_w2x2_s2x2_avx( + sptr + n * C * IH * IW + c * IH * IW, IH, IW, + dptr + n * C * OH * OW + c * OH * OW, OH, OW, PH, PW, true); + }); } bool PoolingImpl::AlgoMeanW2S2SSE3::is_available(const SizeArgs& args) const { @@ -154,14 +155,16 @@ void PoolingImpl::AlgoMeanW2S2SSE3::exec(const ExecArgs& args) const { auto OW = args.layout_dst.shape[3]; auto PH = args.opr->param().pad_h; auto PW = args.opr->param().pad_w; - auto sptr = reinterpret_cast(args.src_tensor->raw_ptr); - auto dptr = reinterpret_cast(args.dst_tensor->raw_ptr); + auto handle = [=]() { return args.handle; }; - MEGDNN_DISPATCH_CPU_KERN_OPR(rep(n, N) rep(c, C) { - mean_pooling_w2x2_s2x2_sse3( - sptr + n * C * IH * IW + c * IH * IW, IH, IW, - dptr + n * C * OH * OW + c * OH * OW, OH, OW, PH, PW, true); - }); + MEGDNN_DISPATCH_CPU_KERN_OPR( + auto sptr = reinterpret_cast(args.src_tensor.raw_ptr()); + auto dptr = reinterpret_cast(args.dst_tensor.raw_ptr()); + rep(n, N) rep(c, C) { + mean_pooling_w2x2_s2x2_sse3( + sptr + n * C * IH * IW + c * IH * IW, IH, IW, + dptr + n * C * OH * OW + c * OH * OW, OH, OW, PH, PW, true); + }); } bool PoolingImpl::AlgoMaxW2S2SSE::is_available(const SizeArgs& args) const { @@ -185,14 +188,16 @@ void PoolingImpl::AlgoMaxW2S2SSE::exec(const ExecArgs& args) const { auto OW = args.layout_dst.shape[3]; auto PH = args.opr->param().pad_h; auto PW = args.opr->param().pad_w; - auto sptr = reinterpret_cast(args.src_tensor->raw_ptr); - auto dptr = reinterpret_cast(args.dst_tensor->raw_ptr); + auto handle = [=]() { return args.handle; }; - MEGDNN_DISPATCH_CPU_KERN_OPR(rep(n, N) rep(c, C) { - max_pooling_w2x2_s2x2_sse( - sptr + n * C * IH * IW + c * IH * IW, IH, IW, - dptr + n * C * OH * OW + c * OH * OW, OH, OW, PH, PW); - }); + MEGDNN_DISPATCH_CPU_KERN_OPR( + auto sptr = reinterpret_cast(args.src_tensor.raw_ptr()); + auto dptr = reinterpret_cast(args.dst_tensor.raw_ptr()); + rep(n, N) rep(c, 
C) { + max_pooling_w2x2_s2x2_sse( + sptr + n * C * IH * IW + c * IH * IW, IH, IW, + dptr + n * C * OH * OW + c * OH * OW, OH, OW, PH, PW); + }); } bool PoolingImpl::AlgoMaxW3S3SSE::is_available(const SizeArgs& args) const { @@ -216,13 +221,14 @@ void PoolingImpl::AlgoMaxW3S3SSE::exec(const ExecArgs& args) const { auto OW = args.layout_dst.shape[3]; auto PH = args.opr->param().pad_h; auto PW = args.opr->param().pad_w; - auto sptr = reinterpret_cast(args.src_tensor->raw_ptr); - auto dptr = reinterpret_cast(args.dst_tensor->raw_ptr); auto handle = [=]() { return args.handle; }; + WorkspaceBundle ws = + get_bundle(args.layout_src, args.layout_dst, args.opr->param()); + ws.set(args.workspace.raw_ptr); MEGDNN_DISPATCH_CPU_KERN_OPR( - WorkspaceBundle ws = - get_bundle(args.layout_src, args.layout_dst, args.opr->param()); - ws.set(args.workspace.raw_ptr); rep(n, N) rep(c, C) { + auto sptr = reinterpret_cast(args.src_tensor.raw_ptr()); + auto dptr = reinterpret_cast(args.dst_tensor.raw_ptr()); + rep(n, N) rep(c, C) { do_max_pooling_3x3_s2x2_float_SSE( sptr + n * C * IH * IW + c * IH * IW, dptr + n * C * OH * OW + c * OH * OW, IH, IW, OH, OW, PH, PW, @@ -255,32 +261,30 @@ void PoolingImpl::AlgoMKLDNNNCHW::exec(const ExecArgs& args) const { dnnl::memory::dims pool_padding = {PH, PW}; dnnl::memory::dims pool_kernel = {FH, FW}; - dnnl::memory&& megdnn_src_memory_ori = - tensor_to_mkl_memory( - *args.src_tensor, mkldnn_eng, dnnl::memory::data_type::s8); - dnnl::memory&& megdnn_dst_memory_ori = - tensor_to_mkl_memory( - *args.dst_tensor, mkldnn_eng, dnnl::memory::data_type::s8); - - dnnl::memory&& megdnn_src_memory = - tensor_to_mkl_memory( - *args.src_tensor, mkldnn_eng, dnnl::memory::data_type::s8); - dnnl::memory&& megdnn_dst_memory = - tensor_to_mkl_memory( - *args.dst_tensor, mkldnn_eng, dnnl::memory::data_type::s8); - - auto reorder_src = dnnl::reorder(megdnn_src_memory_ori, megdnn_src_memory); - auto reorder_dst = dnnl::reorder(megdnn_dst_memory, megdnn_dst_memory_ori); - auto pool1_desc = dnnl::pooling_forward::desc( - dnnl::prop_kind::forward_inference, mkldnn_pooling_mode, - megdnn_src_memory.get_desc(), megdnn_dst_memory.get_desc(), pool_strides, - pool_kernel, pool_padding, pool_padding); - auto pool_pd = dnnl::pooling_forward::primitive_desc(pool1_desc, mkldnn_eng); - auto pool = dnnl::pooling_forward(pool_pd); - - auto run = [mkldnn_stream, mkldnn_eng, reorder_src, pool, reorder_dst, - megdnn_src_memory_ori, megdnn_src_memory, megdnn_dst_memory, - megdnn_dst_memory_ori](void) { + auto run = [args, pool_strides, pool_padding, pool_kernel, mkldnn_eng, + mkldnn_stream, mkldnn_pooling_mode](void) { + dnnl::memory&& megdnn_src_memory_ori = + tensor_to_mkl_memory( + args.src_tensor, mkldnn_eng, dnnl::memory::data_type::s8); + dnnl::memory&& megdnn_dst_memory_ori = + tensor_to_mkl_memory( + args.dst_tensor, mkldnn_eng, dnnl::memory::data_type::s8); + + dnnl::memory&& megdnn_src_memory = + tensor_to_mkl_memory( + args.src_tensor, mkldnn_eng, dnnl::memory::data_type::s8); + dnnl::memory&& megdnn_dst_memory = + tensor_to_mkl_memory( + args.dst_tensor, mkldnn_eng, dnnl::memory::data_type::s8); + + auto reorder_src = dnnl::reorder(megdnn_src_memory_ori, megdnn_src_memory); + auto reorder_dst = dnnl::reorder(megdnn_dst_memory, megdnn_dst_memory_ori); + auto pool1_desc = dnnl::pooling_forward::desc( + dnnl::prop_kind::forward_inference, mkldnn_pooling_mode, + megdnn_src_memory.get_desc(), megdnn_dst_memory.get_desc(), + pool_strides, pool_kernel, pool_padding, pool_padding); + auto pool_pd = 
dnnl::pooling_forward::primitive_desc(pool1_desc, mkldnn_eng); + auto pool = dnnl::pooling_forward(pool_pd); MEGDNN_MARK_USED_VAR(mkldnn_eng); auto mkl_stream = mkldnn_stream; reorder_src.execute( @@ -336,21 +340,20 @@ void PoolingImpl::AlgoMKLDNNNCHW88::exec(const ExecArgs& args) const { dnnl::memory::dims pool_strides = {SH, SW}; dnnl::memory::dims pool_padding = {PH, PW}; dnnl::memory::dims pool_kernel = {FH, FW}; - dnnl::memory&& megdnn_src_memory_ori = - tensor_to_mkl_memory( - *args.src_tensor, mkldnn_eng, dnnl::memory::data_type::f32); - dnnl::memory&& megdnn_dst_memory_ori = - tensor_to_mkl_memory( - *args.dst_tensor, mkldnn_eng, dnnl::memory::data_type::f32); - auto pool_desc = dnnl::pooling_forward::desc( - dnnl::prop_kind::forward_inference, mkldnn_pooling_mode, - megdnn_src_memory_ori.get_desc(), megdnn_dst_memory_ori.get_desc(), - pool_strides, pool_kernel, pool_padding, pool_padding); - auto pool_pd = dnnl::pooling_forward::primitive_desc(pool_desc, mkldnn_eng); - auto pool = dnnl::pooling_forward(pool_pd); - - auto run = [mkldnn_stream, pool, mkldnn_eng, megdnn_src_memory_ori, - megdnn_dst_memory_ori](void) { + auto run = [args, pool_strides, pool_padding, pool_kernel, mkldnn_eng, + mkldnn_stream, mkldnn_pooling_mode](void) { + dnnl::memory&& megdnn_src_memory_ori = + tensor_to_mkl_memory( + args.src_tensor, mkldnn_eng, dnnl::memory::data_type::f32); + dnnl::memory&& megdnn_dst_memory_ori = + tensor_to_mkl_memory( + args.dst_tensor, mkldnn_eng, dnnl::memory::data_type::f32); + auto pool_desc = dnnl::pooling_forward::desc( + dnnl::prop_kind::forward_inference, mkldnn_pooling_mode, + megdnn_src_memory_ori.get_desc(), megdnn_dst_memory_ori.get_desc(), + pool_strides, pool_kernel, pool_padding, pool_padding); + auto pool_pd = dnnl::pooling_forward::primitive_desc(pool_desc, mkldnn_eng); + auto pool = dnnl::pooling_forward(pool_pd); MEGDNN_MARK_USED_VAR(mkldnn_eng); auto mkl_stream = mkldnn_stream; @@ -477,11 +480,10 @@ void PoolingImpl::AlgoMaxS1NCHW88AVX::exec(const ExecArgs& args) const { size_t IW = args.layout_src.shape[3]; size_t OH = args.layout_dst.shape[2]; size_t OW = args.layout_dst.shape[3]; - float* src_ptr = reinterpret_cast(args.src_tensor->raw_ptr); - float* dst_ptr = reinterpret_cast(args.dst_tensor->raw_ptr); - auto run = [IC, src_ptr, dst_ptr, IH, IW, OH, OW, PH, PW, WH, WW]( - size_t index, size_t) { + auto run = [args, IC, IH, IW, OH, OW, PH, PW, WH, WW](size_t index, size_t) { + float* src_ptr = reinterpret_cast(args.src_tensor.raw_ptr()); + float* dst_ptr = reinterpret_cast(args.dst_tensor.raw_ptr()); size_t n = index / IC; size_t c = index % IC; float* src = src_ptr + n * IH * IW * IC * VECSIZE + IH * IW * c * VECSIZE; diff --git a/dnn/src/x86/pooling/algo.h b/dnn/src/x86/pooling/algo.h index fbdfc680..a24de56a 100644 --- a/dnn/src/x86/pooling/algo.h +++ b/dnn/src/x86/pooling/algo.h @@ -48,7 +48,7 @@ public: SizeArgs(PoolingImpl* opr, const TensorLayout& src, const TensorLayout& dst); }; struct ExecArgs : public SizeArgs { - const TensorND *src_tensor, *dst_tensor; + const TensorND src_tensor, dst_tensor; Workspace workspace; ExecArgs( diff --git a/dnn/src/x86/resize/opr_impl.cpp b/dnn/src/x86/resize/opr_impl.cpp index 2bf3ff30..5906180c 100644 --- a/dnn/src/x86/resize/opr_impl.cpp +++ b/dnn/src/x86/resize/opr_impl.cpp @@ -32,14 +32,15 @@ using namespace x86; void ResizeImpl::exec( _megdnn_tensor_in src, _megdnn_tensor_in dst, _megdnn_workspace workspace) { check_exec(src.layout, dst.layout, workspace.size); - if (param().format == 
param::Resize::Format::NCHW || + auto param = this->param(); + if (param.format == param::Resize::Format::NCHW || (src.layout[3] != 1 && src.layout[3] != 3) || !is_supported(SIMDType::SSE4_2) || !is_nhwc_contig_wc(src.layout)) { fallback::ResizeImpl::exec(src, dst, workspace); } else { megdnn_assert( - param().format == param::Resize::Format::NHWC, "invalid resize format"); - MEGDNN_DISPATCH_CPU_KERN_OPR(resize_cv_exec(src, dst, param().imode)); + param.format == param::Resize::Format::NHWC, "invalid resize format"); + MEGDNN_DISPATCH_CPU_KERN_OPR(resize_cv_exec(src, dst, param.imode)); } } diff --git a/dnn/src/x86/separable_conv/opr_impl.cpp b/dnn/src/x86/separable_conv/opr_impl.cpp index e122c527..3e954739 100644 --- a/dnn/src/x86/separable_conv/opr_impl.cpp +++ b/dnn/src/x86/separable_conv/opr_impl.cpp @@ -80,13 +80,11 @@ void SeparableConvImpl::exec( int oh = dst.layout.shape[2]; int ow = dst.layout.shape[3]; - filter_engine_ = new FilterEngine( + std::shared_ptr filter_engine = std::make_shared( ih, iw, oh, ow, param().ksize_h, param().ksize_w, param().anchor_h, param().anchor_w, param().borderMode, param().is_symm_kernel); - MEGDNN_DISPATCH_CPU_KERN_OPR(filter_engine_->exec(src, filter_x, filter_y, dst);); - - delete (filter_engine_); + MEGDNN_DISPATCH_CPU_KERN_OPR(filter_engine->exec(src, filter_x, filter_y, dst)); } } // namespace x86 diff --git a/dnn/src/x86/separable_conv/opr_impl.h b/dnn/src/x86/separable_conv/opr_impl.h index 08dac5c2..4c6790a6 100644 --- a/dnn/src/x86/separable_conv/opr_impl.h +++ b/dnn/src/x86/separable_conv/opr_impl.h @@ -29,7 +29,6 @@ public: // TODO: deduce the size of ring buffer. return 0; } - FilterEngine* filter_engine_; }; } // namespace x86 diff --git a/dnn/src/x86/separable_filter/opr_impl.cpp b/dnn/src/x86/separable_filter/opr_impl.cpp index 749f30fb..49f10999 100644 --- a/dnn/src/x86/separable_filter/opr_impl.cpp +++ b/dnn/src/x86/separable_filter/opr_impl.cpp @@ -25,13 +25,13 @@ using BorderMode = param::SeparableFilter::BorderMode; void SeparableFilterImpl::separable_filter_exec_8u( _megdnn_tensor_in src, _megdnn_tensor_in filter_x, _megdnn_tensor_in filter_y, - _megdnn_tensor_out dst) { + _megdnn_tensor_out dst, const Param& param) { megdnn_assert(src.layout.dtype == dtype::Uint8()); Mat kernel_column( - 1, filter_y.layout.shape[3], 1, static_cast(filter_y.raw_ptr)); + 1, filter_y.layout.shape[3], 1, static_cast(filter_y.raw_ptr())); Mat kernel_row( - 1, filter_x.layout.shape[3], 1, static_cast(filter_x.raw_ptr)); + 1, filter_x.layout.shape[3], 1, static_cast(filter_x.raw_ptr())); size_t src_channels = src.layout.shape[3]; @@ -53,7 +53,7 @@ void SeparableFilterImpl::separable_filter_exec_8u( using namespace gaussian_blur; BaseRowFilter* rowFilter = nullptr; BaseColumnFilter* columnFilter = nullptr; - if (param().is_symm_kernel) { + if (param.is_symm_kernel) { rowFilter = getLinearRowFilter(kernel_row_int); columnFilter = getLinearColumnFilter(kernel_column_int, bits * 2); @@ -64,9 +64,9 @@ void SeparableFilterImpl::separable_filter_exec_8u( } FilterEngine filter( - rowFilter, columnFilter, src_channels, border_value, param().borderMode); + rowFilter, columnFilter, src_channels, border_value, param.borderMode); - megdnn_assert(param().borderMode != BorderMode::BORDER_ISOLATED); + megdnn_assert(param.borderMode != BorderMode::BORDER_ISOLATED); for (size_t i = 0; i < src.layout.shape[0]; ++i) { Mat src_mat = TensorND2Mat(src, i); Mat dst_mat = TensorND2Mat(dst, i); @@ -78,18 +78,18 @@ void SeparableFilterImpl::separable_filter_exec_8u( template 
void SeparableFilterImpl::separable_filter_exec( _megdnn_tensor_in src, _megdnn_tensor_in filter_x, _megdnn_tensor_in filter_y, - _megdnn_tensor_out dst) { + _megdnn_tensor_out dst, const Param& param) { Mat kernel_column( - 1, filter_y.layout.shape[3], 1, static_cast(filter_y.raw_ptr)); + 1, filter_y.layout.shape[3], 1, static_cast(filter_y.raw_ptr())); Mat kernel_row( - 1, filter_x.layout.shape[3], 1, static_cast(filter_x.raw_ptr)); + 1, filter_x.layout.shape[3], 1, static_cast(filter_x.raw_ptr())); size_t src_channels = src.layout.shape[3]; T border_value[4] = {0, 0, 0, 0}; BaseRowFilter* row_filter = nullptr; BaseColumnFilter* column_filter = nullptr; - if (param().is_symm_kernel) { + if (param.is_symm_kernel) { row_filter = getLinearRowFilter(kernel_row); column_filter = getLinearColumnFilter(kernel_column, (int)0); } else { @@ -98,9 +98,9 @@ void SeparableFilterImpl::separable_filter_exec( } FilterEngine filter( - row_filter, column_filter, src_channels, border_value, param().borderMode); + row_filter, column_filter, src_channels, border_value, param.borderMode); - megdnn_assert(param().borderMode != BorderMode::BORDER_ISOLATED); + megdnn_assert(param.borderMode != BorderMode::BORDER_ISOLATED); for (size_t i = 0; i < src.layout.shape[0]; ++i) { Mat src_mat = TensorND2Mat(src, i); Mat dst_mat = TensorND2Mat(dst, i); @@ -113,12 +113,13 @@ void SeparableFilterImpl::exec( _megdnn_tensor_out dst, _megdnn_workspace workspace) { check_exec( src.layout, filter_x.layout, filter_y.layout, dst.layout, workspace.size); + auto param = this->param(); if (dst.layout.dtype == dtype::Float32()) { MEGDNN_DISPATCH_CPU_KERN_OPR( - separable_filter_exec(src, filter_x, filter_y, dst)); + separable_filter_exec(src, filter_x, filter_y, dst, param)); } else if (dst.layout.dtype == dtype::Uint8()) { MEGDNN_DISPATCH_CPU_KERN_OPR( - separable_filter_exec_8u(src, filter_x, filter_y, dst)); + separable_filter_exec_8u(src, filter_x, filter_y, dst, param)); } else { megdnn_throw("Unsupported datatype of SeparableFilter opr."); }; diff --git a/dnn/src/x86/separable_filter/opr_impl.h b/dnn/src/x86/separable_filter/opr_impl.h index 1aec54c7..07b44ebf 100644 --- a/dnn/src/x86/separable_filter/opr_impl.h +++ b/dnn/src/x86/separable_filter/opr_impl.h @@ -15,6 +15,7 @@ namespace x86 { class SeparableFilterImpl : public SeparableFilterForward { public: using SeparableFilterForward::SeparableFilterForward; + using Param = param::SeparableFilter; void exec( _megdnn_tensor_in src, _megdnn_tensor_in filter_x, _megdnn_tensor_in filter_y, _megdnn_tensor_out dst, @@ -30,10 +31,10 @@ private: template void separable_filter_exec( _megdnn_tensor_in src, _megdnn_tensor_in filter_x, - _megdnn_tensor_in filter_y, _megdnn_tensor_out dst); + _megdnn_tensor_in filter_y, _megdnn_tensor_out dst, const Param& param); void separable_filter_exec_8u( _megdnn_tensor_in src, _megdnn_tensor_in filter_x, - _megdnn_tensor_in filter_y, _megdnn_tensor_out dst); + _megdnn_tensor_in filter_y, _megdnn_tensor_out dst, const Param& param); }; } // namespace x86 diff --git a/dnn/src/x86/warp_perspective/warp_perspective_cv.cpp b/dnn/src/x86/warp_perspective/warp_perspective_cv.cpp index 7f733f7d..afe4c9d7 100644 --- a/dnn/src/x86/warp_perspective/warp_perspective_cv.cpp +++ b/dnn/src/x86/warp_perspective/warp_perspective_cv.cpp @@ -174,7 +174,7 @@ void megdnn::x86::warp_perspective_cv_exec( "unsupported src channel: %zu, avaiable channel size: 1/2/3", ch); const float* trans_ptr = trans.ptr(); const int* midx_ptr = nullptr; - if (mat_idx.raw_ptr) { + if 
(mat_idx.raw_ptr()) { megdnn_assert(mat_idx.layout.ndim == 1); midx_ptr = mat_idx.ptr(); } diff --git a/dnn/test/aarch64/batched_matrix_mul.cpp b/dnn/test/aarch64/batched_matrix_mul.cpp index 38d492e2..02159091 100644 --- a/dnn/test/aarch64/batched_matrix_mul.cpp +++ b/dnn/test/aarch64/batched_matrix_mul.cpp @@ -13,6 +13,7 @@ #include "test/common/checker.h" #include "test/common/matrix_mul.h" #include "test/common/rng.h" +#include "test/common/task_record_check.h" #include "test/aarch64/fixture.h" @@ -53,6 +54,40 @@ TEST_F(AARCH64, BATCHED_MATRIX_MUL) { } } +TEST_F(AARCH64, BATCHED_MATRIX_MUL_RECORD) { + TaskRecordChecker checker(0); + checker.set_epsilon(1e-2); + using Param = MatrixMul::Param; + // auto args = get_batch_matmul_args(); + auto args = matrix_mul::get_batched_matmul_args(); + + for (DType dtype : std::vector{dtype::Float32()}) { + for (unsigned mask = 0; mask < 4; ++mask) { + for (auto& arg : args) { + size_t b = arg.b, m = arg.m, n = arg.n, k = arg.k; + //! if test all batch sizes, the test case will time out. + if (b != 2) { + continue; + } + Param param; + param.transposeA = mask & 1; + param.transposeB = mask & 2; + TensorShape A, B; + if (param.transposeA) + A = TensorShape{b, k, m}; + else + A = TensorShape{b, m, k}; + if (param.transposeB) + B = TensorShape{b, n, k}; + else + B = TensorShape{b, k, n}; + checker.set_param(param).set_dtype(0, dtype).set_dtype(1, dtype).execs( + {A, B, {}}); + } + } + } +} + #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC TEST_F(AARCH64, BATCHED_MATRIX_MUL_FP16) { Checker checker(handle()); diff --git a/dnn/test/aarch64/conv_bias.cpp b/dnn/test/aarch64/conv_bias.cpp index 0cf83cec..42941cd5 100644 --- a/dnn/test/aarch64/conv_bias.cpp +++ b/dnn/test/aarch64/conv_bias.cpp @@ -15,6 +15,7 @@ #include "test/common/checker.h" #include "test/common/conv_bias.h" #include "test/common/rng.h" +#include "test/common/task_record_check.h" #include "test/common/tensor.h" namespace megdnn { @@ -85,6 +86,14 @@ TEST_F(AARCH64_MULTI_THREADS, CONVBIAS_DIRECT_FP32_STR2) { handle(), "ARMV8F32STRD2"); } +TEST_F(AARCH64_MULTI_THREADS, CONVBIAS_RECORD) { + auto args = conv_bias::get_conv_bias_args({2, 3, 5, 7}, 2, false, false, false); + TaskRecordChecker checker(0); + for (auto&& arg : args) { + checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}}); + } +} + #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC void checker_conv_bias_fp16( std::vector args, Handle* handle, const char* algo_name, diff --git a/dnn/test/aarch64/pooling.cpp b/dnn/test/aarch64/pooling.cpp index 468539af..c618521e 100644 --- a/dnn/test/aarch64/pooling.cpp +++ b/dnn/test/aarch64/pooling.cpp @@ -12,6 +12,7 @@ #include "test/common/checker.h" #include "test/common/pooling.h" +#include "test/common/task_record_check.h" namespace megdnn { namespace test { @@ -24,6 +25,14 @@ TEST_F(AARCH64, POOLING) { } } +TEST_F(AARCH64, POOLING_RECORD) { + TaskRecordChecker checker(0); + auto args = pooling::get_args(); + for (auto&& arg : args) { + checker.set_param(arg.param).exec(TensorShapeArray{arg.ishape, {}}); + } +} + } // namespace test } // namespace megdnn // vim: syntax=cpp.doxygen diff --git a/dnn/test/aarch64/relayout.cpp b/dnn/test/aarch64/relayout.cpp index 2383420b..57dea5f4 100644 --- a/dnn/test/aarch64/relayout.cpp +++ b/dnn/test/aarch64/relayout.cpp @@ -14,6 +14,7 @@ #include "test/common/checker.h" #include "test/common/relayout.h" #include "test/common/rng.h" +#include "test/common/task_record_check.h" namespace megdnn { namespace test { @@ -66,6 +67,20 @@ TEST_F(AARCH64, 
RelayoutBig) { checker.execl({src, dst}); } +TEST_F(AARCH64, RelayoutRecord) { + TaskRecordChecker checker(0); + std::vector<::megdnn::DType> dtype_vec; + dtype_vec.push_back(dtype::Float32()); + dtype_vec.push_back(dtype::Int16()); + dtype_vec.push_back(dtype::Uint16()); + dtype_vec.push_back(dtype::Int8()); + for (auto dtype : dtype_vec) { + TensorLayout src({1, 54, 112, 256}, {54, 1, 16384, 64}, dtype); + TensorLayout dst({1, 54, 112, 256}, {1548288, 28672, 256, 1}, dtype); + checker.execl({src, dst}); + } +} + #if MEGDNN_WITH_BENCHMARK TEST_F(AARCH64, BENCHMARK_Relayout) { diff --git a/dnn/test/aarch64/rotate.cpp b/dnn/test/aarch64/rotate.cpp index 7bc3e096..070859db 100644 --- a/dnn/test/aarch64/rotate.cpp +++ b/dnn/test/aarch64/rotate.cpp @@ -11,6 +11,7 @@ #include "test/common/rotate.h" #include "test/common/benchmarker.h" #include "test/common/checker.h" +#include "test/common/task_record_check.h" #include "test/aarch64/fixture.h" @@ -30,6 +31,19 @@ TEST_F(AARCH64, ROTATE) { } } +TEST_F(AARCH64, ROTATE_RECORD) { + using namespace rotate; + std::vector args = get_args(); + TaskRecordChecker checker(0); + + for (auto&& arg : args) { + checker.set_param(arg.param) + .set_dtype(0, arg.dtype) + .set_dtype(1, arg.dtype) + .execs({arg.src, {}}); + } +} + TEST_F(AARCH64, BENCHMARK_ROTATE) { using namespace rotate; using Param = param::Rotate; diff --git a/dnn/test/aarch64/warp_perspective.cpp b/dnn/test/aarch64/warp_perspective.cpp index a4172435..f9e46f1f 100644 --- a/dnn/test/aarch64/warp_perspective.cpp +++ b/dnn/test/aarch64/warp_perspective.cpp @@ -17,6 +17,7 @@ #include "test/common/checker.h" #include "test/common/random_state.h" #include "test/common/rng.h" +#include "test/common/task_record_check.h" #include "test/common/warp_perspective.h" @@ -150,6 +151,133 @@ TEST_F(AARCH64, WARP_PERSPECTIVE_CV) { } } +TEST_F(AARCH64, WARP_PERSPECTIVE_CV_RECORD) { + //! Just for the format NHWC + TaskRecordChecker checker(0); + param::WarpPerspective param; + class ResizeMatRNG : public RNG { + void gen(const TensorND& tensor_) override { + auto& gen = RandomState::generator(); + std::uniform_real_distribution pdist3(1.9f, 3.1f); + std::uniform_real_distribution pdist(0.9f, 1.1f); + std::uniform_real_distribution pdisth(0.4f, 0.6f); + std::uniform_real_distribution ndist(-1.1f, -0.9f); + std::uniform_real_distribution ndist3(-3.1f, -1.9f); + std::uniform_real_distribution ndisth(-0.6f, -0.4f); + std::uniform_int_distribution dice(0, 5); + float* ptr = tensor_.ptr(); + auto N = tensor_.layout.shape[0]; + for (size_t n = 0; n < N; ++n) { + for (size_t i = 0; i < 9; ++i) { + switch (dice(gen)) { + case 0: + ptr[i] = pdist3(gen); + break; + case 1: + ptr[i] = pdist(gen); + break; + case 2: + ptr[i] = pdisth(gen); + break; + case 3: + ptr[i] = ndist(gen); + break; + case 4: + ptr[i] = ndist3(gen); + break; + case 5: + ptr[i] = ndisth(gen); + break; + } + } + // is resize? 
+ if (n & 1) { + ptr[1] = 0; + ptr[3] = 0; + ptr[6] = ptr[7] = 0; + } + ptr += 9; + } + } + } rng; + + using BMode = param::WarpPerspective::BorderMode; + param.format = param::WarpPerspective::Format::NHWC; + // add for nearest test + param.imode = param::WarpPerspective::InterpolationMode::NEAREST; + for (auto mode : + {BMode::REFLECT_101, BMode::REPLICATE, BMode::REFLECT, BMode::WRAP, + BMode::CONSTANT}) { + param.bmode = mode; + param.border_val = 1.737; + checker.set_param(param); + UniformIntRNG rng(0, 1); + checker.set_rng(2, &rng); + checker.set_dtype(2, dtype::Int32()); + checker.exec({{2, 5, 5, 1}, {4, 3, 3}, {4}, {4, 5, 5, 1}}); + } + // resize nan case + UniformFloatRNG rng_zero(0, 0); + checker.set_rng(1, &rng_zero); + { + param.bmode = BMode::CONSTANT; + param.border_val = 1.737; + checker.set_param(param); + UniformIntRNG rng(0, 999); + checker.set_rng(2, &rng); + checker.set_dtype(2, dtype::Int32()); + checker.exec({{1000, 2, 10, 3}, {2000, 3, 3}, {2000}, {2000, 2, 12, 3}}); + } + + // add linear test + param.imode = param::WarpPerspective::InterpolationMode::INTER_LINEAR; + for (auto mode : + {BMode::REFLECT_101, BMode::REPLICATE, BMode::REFLECT, BMode::WRAP, + BMode::CONSTANT}) { + param.bmode = mode; + param.border_val = 1.737; + checker.set_param(param); + UniformIntRNG rng(0, 9); + checker.set_rng(2, &rng); + checker.set_dtype(2, dtype::Int32()); + checker.exec({{10, 128, 108, 3}, {20, 3, 3}, {20}, {20, 56, 128, 3}}); + } + // resize nan case + checker.set_rng(1, &rng_zero); + { + param.bmode = BMode::CONSTANT; + param.border_val = 1.737; + checker.set_param(param); + UniformIntRNG rng(0, 999); + checker.set_rng(2, &rng); + checker.set_dtype(2, dtype::Int32()); + checker.exec({{1000, 2, 10, 3}, {2000, 3, 3}, {2000}, {2000, 2, 12, 3}}); + } + + auto args = warp_perspective::get_cv_args(); + for (auto&& arg : args) { + ConstValue rng(0.f); + checker.set_param(arg.param) + .set_rng(2, &rng) + .set_dtype(0, dtype::Uint8()) + .set_dtype(1, dtype::Float32()) + .set_dtype(2, dtype::Int32()) + .set_dtype(3, dtype::Uint8()) + .execs({arg.src, arg.trans, arg.mat_idx, arg.dst}); + } + + for (auto&& arg : args) { + ConstValue rng(0.f); + checker.set_param(arg.param) + .set_rng(2, &rng) + .set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .set_dtype(2, dtype::Int32()) + .set_dtype(3, dtype::Float32()) + .execs({arg.src, arg.trans, arg.mat_idx, arg.dst}); + } +} + #if MEGDNN_WITH_BENCHMARK TEST_F(AARCH64, BENCHMARK_WARP_PERSPECTIVE_FORWARD) { Benchmarker benchmarker(handle()); diff --git a/dnn/test/arm_common/conv_bias.cpp b/dnn/test/arm_common/conv_bias.cpp index a28781bf..aced356c 100644 --- a/dnn/test/arm_common/conv_bias.cpp +++ b/dnn/test/arm_common/conv_bias.cpp @@ -19,6 +19,7 @@ #include "test/common/checker.h" #include "test/common/conv_bias.h" #include "test/common/rng.h" +#include "test/common/task_record_check.h" #include "test/common/tensor.h" #include "test/common/workspace_wrapper.h" @@ -57,6 +58,29 @@ TEST_F(ARM_COMMON, CONV_BIAS_MATMUL) { } } +TEST_F(ARM_COMMON, CONV_BIAS_RECORD) { + using namespace conv_bias; + std::vector args = get_quantized_args(); + TaskRecordChecker checker(0); +#if MEGDNN_ARMV7 + checker.set_epsilon(1); +#endif + UniformIntRNG rng{-50, 50}; + for (auto&& arg : args) { + if (arg.bias.ndim == 4 && arg.bias[2] != 1 && arg.bias[3] != 1) + continue; + checker.set_dtype(0, dtype::QuantizedS8(0.41113496f)) + .set_dtype(1, dtype::QuantizedS8(0.01887994f)) + .set_dtype(2, dtype::QuantizedS32(0.41113496f * 0.01887994f)) + .set_dtype(4, 
dtype::QuantizedS8(0.49550694f)) + .set_rng(0, &rng) + .set_rng(1, &rng) + .set_rng(2, &rng) + .set_param(arg.param) + .execs({arg.src, arg.filter, arg.bias, {}, {}}); + } +} + TEST_F(ARM_COMMON, CONV_BIAS_WINOGRAD_F63_4) { using namespace conv_bias; std::vector args = get_winograd_mk_packed_args(); diff --git a/dnn/test/arm_common/elemwise.cpp b/dnn/test/arm_common/elemwise.cpp index aeca923d..95dba507 100644 --- a/dnn/test/arm_common/elemwise.cpp +++ b/dnn/test/arm_common/elemwise.cpp @@ -13,6 +13,7 @@ #include "test/arm_common/fixture.h" #include "test/common/benchmarker.h" #include "test/common/checker.h" +#include "test/common/task_record_check.h" #include "megdnn/opr_param_defs.h" #include "megdnn/oprs/general.h" @@ -380,6 +381,40 @@ TEST_F(ARM_COMMON, ELEMWISE_FORWARD_N1HW_FP32_BCAST) { run(Mode::SUB); } +TEST_F(ARM_COMMON, ELEMWISE_FORWARD_TERNARY_RECORD) { + using Mode = ElemwiseForward::Param::Mode; + TaskRecordChecker checker(0); + checker.set_param(Mode::FUSE_MUL_ADD3); + + auto run = [&] { + //! nchw44 + checker.execs({{1, 3, 1, 1, 4}, {1, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}}); + checker.execs({{1, 3, 1, 1, 4}, {2, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}}); + + //! nchw88 + checker.execs({{1, 3, 1, 1, 8}, {1, 3, 2, 2, 8}, {1, 3, 1, 1, 8}, {}}); + checker.execs({{1, 3, 1, 1, 8}, {2, 3, 2, 2, 8}, {1, 3, 1, 1, 8}, {}}); + + checker.execs({{3, 4, 7}, {3, 4, 7}, {3, 4, 7}, {}}); + checker.execs({{1, 4, 1, 1}, {3, 4, 5, 7}, {1, 4, 1, 1}, {}}); + }; + + // case int + checker.set_dtype(0, dtype::Int32()); + checker.set_dtype(1, dtype::Int32()); + checker.set_dtype(2, dtype::Int32()); + run(); + + // case float + UniformFloatRNG rng(1e-5, 7e1); + checker.set_rng(0, &rng); + checker.set_epsilon(1e-5); + checker.set_dtype(0, dtype::Float32()); + checker.set_dtype(1, dtype::Float32()); + checker.set_dtype(2, dtype::Float32()); + run(); +} + #if MEGDNN_WITH_BENCHMARK namespace { void run_elemwise_benchmark( diff --git a/dnn/test/arm_common/elemwise_multi_type.cpp b/dnn/test/arm_common/elemwise_multi_type.cpp index 72ea4d6b..b5c2a71c 100644 --- a/dnn/test/arm_common/elemwise_multi_type.cpp +++ b/dnn/test/arm_common/elemwise_multi_type.cpp @@ -14,6 +14,7 @@ #include "megdnn/oprs.h" #include "test/arm_common/fixture.h" #include "test/common/checker.h" +#include "test/common/task_record_check.h" #include "test/common/timer.h" #include "test/common/workspace_wrapper.h" @@ -80,6 +81,56 @@ TEST_F(ARM_COMMON, ELEMWISE_QUANTIZED_MODE_UNARY) { } } +TEST_F(ARM_COMMON, ELEMWISE_QUANTIZED_MODE_UNARY_RECORD) { + using Mode = ElemwiseMultiType::Param::Mode; + TaskRecordChecker checker(0); + + std::unique_ptr rng; + for (auto mode : + {Mode::QRELU, Mode::QABS, Mode::QSIGMOID, Mode::QEXP, Mode::QTANH, + Mode::QFAST_TANH, Mode::QH_SWISH}) { + checker.set_param({mode}); + + for (DType src_type : std::vector{ + dtype::QuantizedS8(1.4f), + dtype::Quantized8Asymm(1.3f, static_cast(4)), + dtype::QuantizedS32(1.3f)}) { + checker.set_dtype(0, src_type); + if (src_type.enumv() == DTypeEnum::QuantizedS8) { + rng = std::make_unique(-127, 127); + checker.set_dtype(1, dtype::QuantizedS8(1.7f)); + } else if (src_type.enumv() == DTypeEnum::Quantized8Asymm) { + rng = std::make_unique(0, 255); + checker.set_dtype( + 1, dtype::Quantized8Asymm(1.7f, static_cast(10))); + } else { + rng = std::make_unique(INT16_MIN >> 1, INT16_MAX >> 1); + } + + checker.set_rng(0, rng.get()); + auto run = [&]() { + checker.execs({{3, 4, 5, 6}, {}}); + + checker.execs({{3}, {}}); + checker.execs({{9}, {}}); + checker.execs({{17}, {}}); + }; + + if 
(src_type.enumv() == DTypeEnum::QuantizedS32) { + for (DType dst_type : std::vector{ + dtype::QuantizedS8(32718.6f), + dtype::Quantized8Asymm( + 32729.6f, static_cast(128))}) { + checker.set_dtype(1, dst_type); + run(); + } + } else { + run(); + } + } + } +} + TEST_F(ARM_COMMON, ELEMWISE_QUANTIZED_MODE_BINARY) { using Mode = ElemwiseMultiType::Param::Mode; Checker checker(handle()); @@ -198,6 +249,111 @@ TEST_F(ARM_COMMON, ELEMWISE_QUANTIZED_MODE_BINARY) { run(); } +TEST_F(ARM_COMMON, ELEMWISE_QUANTIZED_MODE_BINARY_RECORD) { + using Mode = ElemwiseMultiType::Param::Mode; + TaskRecordChecker checker(0); + auto run = [&]() { + //! nchw44 + checker.execs({{1, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}}); + //! VEC + SCALAR + checker.execs({{3, 4, 5, 6}, {1, 1, 1, 1}, {}}); + checker.execs({{1, 1, 1, 1}, {3, 4, 5, 6}, {}}); + + //! VEC + 1C11 + checker.execs({{3, 4, 5, 6}, {1, 4, 1, 1}, {}}); + checker.execs({{1, 4, 1, 1}, {3, 4, 5, 6}, {}}); + + //! VEC + VEC + checker.execs({{3}, {3}, {}}); + checker.execs({{3, 4, 5, 6}, {3, 4, 5, 6}, {}}); + }; + + // qint32 to qint8/quint8 + for (auto mode : {Mode::QADD, Mode::QFUSE_ADD_RELU, Mode::QFUSE_ADD_H_SWISH}) { + checker.set_param({mode}); + UniformIntRNG rng{INT16_MIN >> 1, INT16_MAX >> 1}; + checker.set_rng(0, &rng) + .set_rng(1, &rng) + .set_dtype(0, dtype::QuantizedS32(1.3f)) + .set_dtype(1, dtype::QuantizedS32(1.2f)); + + for (DType dst_type : std::vector{ + dtype::QuantizedS8(32718.6f), + dtype::Quantized8Asymm(32729.6f, static_cast(128))}) { + checker.set_dtype(2, dst_type); + run(); + } + } + + for (auto mode : + {Mode::QMUL, Mode::QADD, Mode::QMIN, Mode::QMAX, Mode::QSUB, + Mode::QFUSE_ADD_RELU, Mode::QFUSE_ADD_SIGMOID, Mode::QFUSE_ADD_H_SWISH}) { + checker.set_param({mode}); + + // qint8 to qint8 + UniformIntRNG rng_int8{-127, 127}; + checker.set_rng(0, &rng_int8) + .set_rng(1, &rng_int8) + .set_dtype(0, dtype::QuantizedS8(1.35f)) + .set_dtype(1, dtype::QuantizedS8(1.15f)) + .set_dtype(2, dtype::QuantizedS8(1.75f)); + + run(); + // quint8 to quint8 + UniformIntRNG rng_uint8{0, 255}; + checker.set_rng(0, &rng_uint8) + .set_rng(1, &rng_uint8) + .set_dtype(0, dtype::Quantized8Asymm(1.35f, static_cast(128))) + .set_dtype(1, dtype::Quantized8Asymm(1.15f, static_cast(128))) + .set_dtype(2, dtype::Quantized8Asymm(1.75f, static_cast(128))); + + run(); + } + + //! TRUE_DIV : 0.0 / 0.0 will fail + checker.set_param({Mode::QTRUE_DIV}); + UniformIntRNG rng_int8_1{-127, 127}; + UniformIntRNG rng_int8_2{-127, -1}; + checker.set_rng(0, &rng_int8_1) + .set_rng(1, &rng_int8_2) + .set_dtype(0, dtype::QuantizedS8(1.4f)) + .set_dtype(1, dtype::QuantizedS8(1.1f)) + .set_dtype(2, dtype::QuantizedS8(1.7f)); + + run(); + + // quint8 to quint8 + UniformIntRNG rng_uint8_1{0, 255}; + UniformIntRNG rng_uint8_2{0, 127}; + checker.set_rng(0, &rng_uint8_1) + .set_rng(1, &rng_uint8_2) + .set_dtype(0, dtype::Quantized8Asymm(1.35f, static_cast(128))) + .set_dtype(1, dtype::Quantized8Asymm(1.15f, static_cast(128))) + .set_dtype(2, dtype::Quantized8Asymm(1.75f, static_cast(128))); + + run(); + + //! 
TANH + checker.set_param({Mode::QFUSE_ADD_TANH}); + UniformIntRNG rng_int8{-5, 5}; + checker.set_rng(0, &rng_int8) + .set_rng(1, &rng_int8) + .set_dtype(0, dtype::QuantizedS8(1.1f)) + .set_dtype(1, dtype::QuantizedS8(1.4f)) + .set_dtype(2, dtype::QuantizedS8(1.7f)); + + run(); + + UniformIntRNG rng_uint8{123, 133}; + checker.set_rng(0, &rng_uint8) + .set_rng(1, &rng_uint8) + .set_dtype(0, dtype::Quantized8Asymm(1.1f, static_cast(128))) + .set_dtype(1, dtype::Quantized8Asymm(1.4f, static_cast(128))) + .set_dtype(2, dtype::Quantized8Asymm(1.7f, static_cast(128))); + + run(); +} + TEST_F(ARM_COMMON, ELEMWISE_QUANTIZED_MODE_TERNARY) { using Mode = ElemwiseMultiType::Param::Mode; Checker checker(handle()); @@ -253,4 +409,51 @@ TEST_F(ARM_COMMON, ELEMWISE_QUANTIZED_MODE_TERNARY) { } } +TEST_F(ARM_COMMON, ELEMWISE_QUANTIZED_MODE_TERNARY_RECORD) { + using Mode = ElemwiseMultiType::Param::Mode; + TaskRecordChecker checker(0); + + auto run = [&]() { + //! nchw44 + checker.execs({{1, 3, 1, 1, 4}, {1, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}}); + checker.execs({{1, 3, 1, 1, 4}, {2, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}}); + + //! nchw44 + checker.execs({{1, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {1, 3, 2, 2, 4}, {}}); + checker.execs({{2, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {2, 3, 2, 2, 4}, {}}); + + checker.execs({{3, 4, 5, 6}, {3, 4, 5, 6}, {1, 1, 1, 1}, {}}); + checker.execs({{1, 4, 1, 1}, {3, 4, 5, 6}, {1, 4, 1, 1}, {}}); + + checker.execs({{3}, {3}, {3}, {}}); + checker.execs({{3, 4, 5, 6}, {3, 4, 5, 6}, {3, 4, 5, 6}, {}}); + }; + + for (auto mode : {Mode::QFUSE_MUL_ADD3}) { + checker.set_param({mode}); + + // qint8 to qint8 + UniformIntRNG rng_int8{-127, 127}; + checker.set_rng(0, &rng_int8) + .set_rng(1, &rng_int8) + .set_rng(2, &rng_int8) + .set_dtype(0, dtype::QuantizedS8(1.45f)) + .set_dtype(1, dtype::QuantizedS8(1.15f)) + .set_dtype(2, dtype::QuantizedS8(1.75f)) + .set_dtype(3, dtype::QuantizedS8(1.35f)); + run(); + + // quint8 to quint8 + UniformIntRNG rng_uint8{0, 225}; + checker.set_rng(0, &rng_uint8) + .set_rng(1, &rng_uint8) + .set_rng(2, &rng_uint8) + .set_dtype(0, dtype::Quantized8Asymm(1.35f, static_cast(128))) + .set_dtype(1, dtype::Quantized8Asymm(1.15f, static_cast(128))) + .set_dtype(2, dtype::Quantized8Asymm(1.75f, static_cast(128))) + .set_dtype(3, dtype::Quantized8Asymm(1.45f, static_cast(128))); + run(); + } +} + // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/test/arm_common/group_local.cpp b/dnn/test/arm_common/group_local.cpp index eea8606c..503e75b9 100644 --- a/dnn/test/arm_common/group_local.cpp +++ b/dnn/test/arm_common/group_local.cpp @@ -13,6 +13,7 @@ #include "test/common/benchmarker.h" #include "test/common/checker.h" #include "test/common/group_local.h" +#include "test/common/task_record_check.h" #include "test/common/timer.h" namespace megdnn { @@ -40,6 +41,29 @@ TEST_F(ARM_COMMON, GROUP_LOCAL_FORWARD) { } #endif } + +TEST_F(ARM_COMMON, GROUP_LOCAL_FORWARD_RECORD) { + auto args = group_local::get_args(); + TaskRecordChecker checker(0); + for (auto&& arg : args) { + checker.set_param(arg.param).execs({arg.sshape(), arg.fshape(), arg.dshape()}); + } + + NormalRNG rng(10.f); + checker.set_rng(0, &rng).set_rng(1, &rng); + args = group_local::get_args_for_fp16(); + +#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + for (auto&& arg : args) { + checker.set_dtype(0, dtype::Float16()) + .set_dtype(1, dtype::Float16()) + .set_dtype(2, dtype::Float16()); + checker.set_epsilon(1e-2); + checker.set_param(arg.param).execs({arg.sshape(), arg.fshape(), arg.dshape()}); + } 
+#endif +} + } // namespace test } // namespace megdnn // vim: syntax=cpp.doxygen diff --git a/dnn/test/arm_common/local.cpp b/dnn/test/arm_common/local.cpp index 76d513e6..dfcb7576 100644 --- a/dnn/test/arm_common/local.cpp +++ b/dnn/test/arm_common/local.cpp @@ -13,6 +13,7 @@ #include "test/common/benchmarker.h" #include "test/common/checker.h" #include "test/common/local.h" +#include "test/common/task_record_check.h" #include "test/common/timer.h" namespace megdnn { @@ -41,6 +42,28 @@ TEST_F(ARM_COMMON, LOCAL_FORWARD) { #endif } +TEST_F(ARM_COMMON, LOCAL_FORWARD_RECORD) { + auto args = local::get_args(); + TaskRecordChecker checker(0); + for (auto&& arg : args) { + checker.set_param(arg.param).execs({arg.sshape(), arg.fshape(), arg.dshape()}); + } + + NormalRNG rng(10.f); + checker.set_rng(0, &rng).set_rng(1, &rng); + args = local::get_args_for_fp16(); + +#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + for (auto&& arg : args) { + checker.set_dtype(0, dtype::Float16()) + .set_dtype(1, dtype::Float16()) + .set_dtype(2, dtype::Float16()); + checker.set_epsilon(1e-2); + checker.set_param(arg.param).execs({arg.sshape(), arg.fshape(), arg.dshape()}); + } +#endif +} + #if MEGDNN_WITH_BENCHMARK TEST_F(ARM_COMMON, BENCHMARK_LOCAL_FORWARD) { auto run = [&](const TensorShapeArray& shapes, Param param) { diff --git a/dnn/test/arm_common/matrix_mul.cpp b/dnn/test/arm_common/matrix_mul.cpp index fd317c0f..8d4d2d0a 100644 --- a/dnn/test/arm_common/matrix_mul.cpp +++ b/dnn/test/arm_common/matrix_mul.cpp @@ -15,6 +15,7 @@ #include "test/common/checker.h" #include "test/common/matrix_mul.h" #include "test/common/rng.h" +#include "test/common/task_record_check.h" #if MGB_ENABLE_CPUINFO #include "cpuinfo.h" @@ -309,6 +310,31 @@ TEST_F(ARM_COMMON, FP32_GEMV_MK4) { run(M, K); } +TEST_F(ARM_COMMON, MATRIX_MUL_RECORD) { + TaskRecordChecker checker(0); + checker.set_epsilon(1e-2); + NormalRNG rng(2.f); + checker.set_rng(0, &rng).set_rng(1, &rng); + + using Param = MatrixMul::Param; + auto args = matrix_mul::get_matmul_args_no_mask(); + + for (auto& arg : args) { + size_t m = arg.m, n = arg.n, k = arg.k; + Param param; + param.transposeA = false; + param.transposeB = false; + TensorShape A, B; + A = TensorShape{m, k}; + B = TensorShape{k, n}; + checker.set_param(param) + .set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .set_dtype(2, dtype::Float32()) + .execs({A, B, {}}); + } +} + #if MEGDNN_WITH_BENCHMARK TEST_F(ARM_COMMON, BENCHMARK_SGEMV) { diff --git a/dnn/test/arm_common/pooling.cpp b/dnn/test/arm_common/pooling.cpp index 56520554..a199c18f 100644 --- a/dnn/test/arm_common/pooling.cpp +++ b/dnn/test/arm_common/pooling.cpp @@ -14,6 +14,7 @@ #include "test/common/checker.h" #include "test/common/pooling.h" #include "test/common/rng.h" +#include "test/common/task_record_check.h" namespace megdnn { namespace test { @@ -67,6 +68,55 @@ TEST_F(ARM_COMMON, POOLING) { // clang-format on } +TEST_F(ARM_COMMON, POOLING_RECORD) { + using Param = param::Pooling; + TaskRecordChecker checker(0); + // clang-format off + for (size_t ih: {2, 3, 5, 7, 11, 13, 17}) + for (size_t iw: {2, 3, 5, 7, 11, 13, 17}) + for (size_t p: {1, 2}) + { + Param param; + param.mode = Param::Mode::MAX; + param.window_h = param.window_w = 3; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = p; + checker.set_param(param).exec({{2, 3, ih, iw}, {}}); + + param.mode = Param::Mode::AVERAGE; + param.window_h = param.window_w = 3; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = p; + 
checker.set_param(param).exec({{2, 3, ih, iw}, {}}); + + param.mode = Param::Mode::MAX; + param.window_h = param.window_w = 4; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = p; + checker.set_param(param).exec({{2, 3, ih, iw}, {}}); + + param.mode = Param::Mode::MAX; + param.window_h = param.window_w = 5; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = p; + if (ih + p * 2 >= 5 && iw + p * 2 >= 5) + checker.set_param(param).exec({{2, 3, ih, iw}, {}}); + } + for (size_t ih: {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30}) + for (size_t iw: {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30}) + for (size_t p: {1, 2}) + { + Param param; + param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING; + param.window_h = param.window_w = 3; + param.stride_h = param.stride_w = 1; + param.pad_h = param.pad_w = p; + Checker checker(handle()); + checker.set_param(param).exec({{2, 3, ih, iw}, {}}); + } + // clang-format on +} + TEST_F(ARM_COMMON, POOLING_INT8_W2x2_S2x2) { // clang-format off for (size_t ih: {2, 3, 7, 13, 52, 53, 54, 55}) diff --git a/dnn/test/arm_common/pooling_multi_thread.cpp b/dnn/test/arm_common/pooling_multi_thread.cpp index c35c13e0..89ba9e69 100644 --- a/dnn/test/arm_common/pooling_multi_thread.cpp +++ b/dnn/test/arm_common/pooling_multi_thread.cpp @@ -17,6 +17,7 @@ #include "test/common/checker.h" #include "test/common/pooling.h" #include "test/common/rng.h" +#include "test/common/task_record_check.h" namespace megdnn { namespace test { @@ -56,6 +57,40 @@ TEST_F(ARM_COMMON_MULTI_THREADS, POOLING) { } } +TEST_F(ARM_COMMON_MULTI_THREADS, POOLING_RECORD) { + using Param = param::Pooling; + TaskRecordChecker checker(0); + for (size_t ih : {2, 3, 5, 7, 11, 13, 17}) + for (size_t iw : {2, 3, 5, 7, 11, 13, 17}) + for (size_t p : {1, 2}) { + Param param; + param.mode = Param::Mode::MAX; + param.window_h = param.window_w = 3; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = p; + checker.set_param(param).exec({{2, 3, ih, iw}, {}}); + + param.mode = Param::Mode::AVERAGE; + param.window_h = param.window_w = 3; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = p; + checker.set_param(param).exec({{2, 3, ih, iw}, {}}); + + param.mode = Param::Mode::MAX; + param.window_h = param.window_w = 4; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = p; + checker.set_param(param).exec({{2, 3, ih, iw}, {}}); + + param.mode = Param::Mode::MAX; + param.window_h = param.window_w = 5; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = p; + if (ih + p * 2 >= 5 && iw + p * 2 >= 5) + checker.set_param(param).exec({{2, 3, ih, iw}, {}}); + } +} + std::vector> get_nchw44_pool_args( size_t filter, size_t stride) { constexpr size_t ic_step = 4; diff --git a/dnn/test/arm_common/reduce.cpp b/dnn/test/arm_common/reduce.cpp index 772fedb8..f74abe24 100644 --- a/dnn/test/arm_common/reduce.cpp +++ b/dnn/test/arm_common/reduce.cpp @@ -13,6 +13,7 @@ #include "megdnn/oprs.h" #include "test/common/benchmarker.h" #include "test/common/checker.h" +#include "test/common/task_record_check.h" using namespace megdnn; using namespace test; @@ -86,6 +87,75 @@ TEST_F(ARM_COMMON, REDUCE) { } } +TEST_F(ARM_COMMON, REDUCE_RECORD) { + using Param = Reduce::Param; + using Mode = Param::Mode; + TaskRecordChecker checker(0); + UniformIntRNG rng{INT8_MIN >> 1, INT8_MAX >> 1}; + checker.set_rng(0, &rng); + struct Config { + Param param; + DType dtype; + TensorShape shape; + Config(Param param, DType dtype, 
TensorShape shape) + : param(param), dtype(dtype), shape(shape) {} + }; + std::vector configs; + for (auto mode : {Mode::MEAN, Mode::MAX, Mode::MIN}) + for (auto dtype : std::vector{ + dtype::Float32(), dtype::Float16(), dtype::QuantizedS8(1.3f), + dtype::Quantized8Asymm(1.3f, static_cast(3))}) + for (int32_t axis : {0, 1, 2}) { + for (size_t A : {1, 3, 5}) { + for (size_t B : {4, 6, 9, 16, 33, 45}) { + for (size_t C : {4, 6, 9, 16, 33, 45}) { + TensorShape shape{A, B, C}; + Param param(mode, axis); + Config config(param, dtype, shape); + configs.push_back(config); + } + } + } + } + for (auto&& config : configs) { + auto&& dtype = config.dtype; + auto&& param = config.param; + auto&& shape = config.shape; + + checker.set_dtype(0, dtype).set_param(param).execs({shape, {}}); + } + configs.clear(); + for (auto mode : {Mode::SUM, Mode::PRODUCT, Mode::SUM_SQR}) + for (auto dtype : std::vector{dtype::Float32(), dtype::Float16()}) + for (int32_t axis : {0, 1, 2}) { + for (size_t A : {1, 3, 5}) { + for (size_t B : {4, 6, 9, 16, 33, 45}) { + for (size_t C : {4, 6, 9, 16, 33, 45}) { + TensorShape shape{A, B, C}; + Param param(mode, axis); + Config config(param, dtype, shape); + configs.push_back(config); + } + } + } + } + + UniformFloatRNG rng_float(-2, 2); + checker.set_rng(0, &rng_float); + checker.set_epsilon(1e-1); + for (auto&& config : configs) { + auto&& dtype = config.dtype; + auto&& param = config.param; + auto&& shape = config.shape; + if (dtype == dtype::Float16()) + checker.set_epsilon(1e-1); + else + checker.set_epsilon(1e-3); + + checker.set_dtype(0, dtype).set_param(param).execs({shape, {}}); + } +} + #if MEGDNN_WITH_BENCHMARK TEST_F(ARM_COMMON, BENCHMARK_REDUCE) { auto run = [&](size_t A, size_t B, size_t C, size_t axis, diff --git a/dnn/test/arm_common/resize.cpp b/dnn/test/arm_common/resize.cpp index 2d004cfd..ca7ccfba 100644 --- a/dnn/test/arm_common/resize.cpp +++ b/dnn/test/arm_common/resize.cpp @@ -12,6 +12,7 @@ #include "test/common/resize.h" #include "test/arm_common/fixture.h" #include "test/common/checker.h" +#include "test/common/task_record_check.h" namespace megdnn { namespace test { @@ -52,6 +53,26 @@ TEST_F(ARM_COMMON, RESIZE_CV) { } } +TEST_F(ARM_COMMON, RESIZE_CV_RECORD) { + std::vector args = get_cv_args(); + TaskRecordChecker checker(0); + + for (auto&& arg : args) { + checker.set_param(arg.param) + .set_epsilon(1 + 1e-3) + .set_dtype(0, dtype::Uint8()) + .set_dtype(1, dtype::Uint8()) + .execs({arg.src, arg.dst}); + } + + for (auto&& arg : args) { + checker.set_param(arg.param) + .set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .execs({arg.src, arg.dst}); + } +} + #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC TEST_F(ARM_COMMON, RESIZE_NCHW_FP16) { std::vector args; diff --git a/dnn/test/arm_common/separable_filter.cpp b/dnn/test/arm_common/separable_filter.cpp index f7a56a86..5c6e416f 100644 --- a/dnn/test/arm_common/separable_filter.cpp +++ b/dnn/test/arm_common/separable_filter.cpp @@ -12,6 +12,7 @@ #include "test/arm_common/fixture.h" #include "test/common/checker.h" #include "test/common/rng.h" +#include "test/common/task_record_check.h" namespace megdnn { namespace test { @@ -34,6 +35,24 @@ TEST_F(ARM_COMMON, SEPARABLE_FILTER) { } } +TEST_F(ARM_COMMON, SEPARABLE_FILTER_RECORD) { + using namespace separable_filter; + std::vector args = get_args(); + TaskRecordChecker checker(0); + for (auto&& arg : args) { + checker.set_param(arg.param).execs({arg.src, arg.filter_x, arg.filter_y, {}}); + } + + checker.set_dtype(0, dtype::Uint8()) + 
.set_dtype(1, dtype::Float32()) + .set_dtype(2, dtype::Float32()) + .set_dtype(3, dtype::Uint8()) + .set_epsilon(1 + 1e-3); + for (auto&& arg : args) { + checker.set_param(arg.param).execs({arg.src, arg.filter_x, arg.filter_y, {}}); + } +} + } // namespace test } // namespace megdnn // vim: syntax=cpp.doxygen diff --git a/dnn/test/arm_common/type_cvt.cpp b/dnn/test/arm_common/type_cvt.cpp index 4bdf9e58..a06b46f1 100644 --- a/dnn/test/arm_common/type_cvt.cpp +++ b/dnn/test/arm_common/type_cvt.cpp @@ -12,6 +12,7 @@ #include "test/common/checker.h" #include "test/arm_common/fixture.h" +#include "test/common/task_record_check.h" namespace megdnn { namespace test { @@ -88,6 +89,78 @@ TEST_F(ARM_COMMON, TYPE_CVT) { .execs({{1, 32, 24, 128}, {1, 32, 24, 128}}); } +TEST_F(ARM_COMMON, TYPE_CVT_RECORD) { + TaskRecordChecker checker(0); + UniformIntRNG rng{INT32_MIN >> 1, INT32_MAX >> 1}; + UniformIntRNG rng8{INT8_MIN >> 1, INT8_MAX >> 1}; + + for (size_t size : {1, 7, 15, 33, 10000}) { + checker.set_rng(0, &rng); + checker.set_dtype(0, dtype::QuantizedS32(0.0000113264f)) + .set_dtype( + 1, dtype::Quantized8Asymm(0.018909f, static_cast(3))) + .execs({{size}, {size}}); + + checker.set_dtype(0, dtype::QuantizedS32(0.0003f)) + .set_dtype(1, dtype::Quantized8Asymm(0.1f, static_cast(3))) + .execs({{size}, {size}}); + + checker.set_dtype(0, dtype::QuantizedS32(0.000815917f)) + .set_dtype(1, dtype::QuantizedS8(0.245121f)) + .execs({{size}, {size}}); + + checker.set_dtype(0, dtype::QuantizedS32(0.0003f)) + .set_dtype(1, dtype::QuantizedS8(0.2f)) + .execs({{size}, {size}}); + + checker.set_rng(0, &rng8); + + //! we should not use so large random value, otherwise it may cause + //! compute error + checker.set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::QuantizedS8(0.245121f)) + .execs({{size}, {size}}); + + checker.set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Quantized8Asymm(0.1f, static_cast(3))) + .execs({{size}, {size}}); + + checker.set_dtype(0, dtype::QuantizedS32(0.0004f)) + .set_dtype(1, dtype::QuantizedS32(0.0002f)) + .execs({{size}, {size}}); + + checker.set_dtype(0, dtype::QuantizedS8(0.3f)) + .set_dtype(1, dtype::QuantizedS8(0.2f)) + .execs({{size}, {size}}); + + checker.set_dtype(0, dtype::Quantized8Asymm(0.3f, static_cast(8))) + .set_dtype(1, dtype::Quantized8Asymm(0.1f, static_cast(3))) + .execs({{size}, {size}}); + + checker.set_dtype(0, dtype::QuantizedS8(0.245121f)) + .set_dtype(1, dtype::QuantizedS32(0.000815917f)) + .execs({{size}, {size}}); + + checker.set_dtype(0, dtype::QuantizedS8(0.2f)) + .set_dtype(1, dtype::QuantizedS32(0.0003f)) + .execs({{size}, {size}}); + + checker.set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float16()) + .execs({{size}, {size}}); + + checker.set_dtype(0, dtype::Float16()) + .set_dtype(1, dtype::Float32()) + .execs({{size}, {size}}); + } + + UniformIntRNG narrow_rng{-40000, 40000}; + checker.set_rng(0, &narrow_rng); + checker.set_dtype(0, dtype::QuantizedS32(0.000163794f)) + .set_dtype(1, dtype::Quantized8Asymm(0.0479196f, static_cast(144))) + .execs({{1, 32, 24, 128}, {1, 32, 24, 128}}); +} + TEST_F(ARM_COMMON, TYPE_CVT_16_F32) { Checker checker(handle()); UniformIntRNG rng{INT16_MIN >> 1, INT16_MAX >> 1}; diff --git a/dnn/test/arm_common/warp_affine.cpp b/dnn/test/arm_common/warp_affine.cpp index 13708c52..785a4879 100644 --- a/dnn/test/arm_common/warp_affine.cpp +++ b/dnn/test/arm_common/warp_affine.cpp @@ -11,6 +11,7 @@ #include "test/common/warp_affine.h" #include "test/arm_common/fixture.h" #include "test/common/checker.h" +#include 
"test/common/task_record_check.h" namespace megdnn { namespace test { @@ -38,6 +39,29 @@ TEST_F(ARM_COMMON_MULTI_THREADS, WARP_AFFINE_CV) { } } +TEST_F(ARM_COMMON_MULTI_THREADS, WARP_AFFINE_CV_RECORD) { + using namespace warp_affine; + std::vector args = get_cv_args(); + TaskRecordChecker checker(0); + + for (auto&& arg : args) { + checker.set_param(arg.param) + .set_epsilon(1 + 1e-3) + .set_dtype(0, dtype::Uint8()) + .set_dtype(1, dtype::Float32()) + .set_dtype(2, dtype::Uint8()) + .execs({arg.src, arg.trans, arg.dst}); + } + + for (auto&& arg : args) { + checker.set_param(arg.param) + .set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .set_dtype(2, dtype::Float32()) + .execs({arg.src, arg.trans, arg.dst}); + } +} + } // namespace test } // namespace megdnn // vim: syntax=cpp.doxygen diff --git a/dnn/test/arm_common/warp_perspective.cpp b/dnn/test/arm_common/warp_perspective.cpp index 1afadf15..01160a42 100644 --- a/dnn/test/arm_common/warp_perspective.cpp +++ b/dnn/test/arm_common/warp_perspective.cpp @@ -17,7 +17,7 @@ #include "test/common/checker.h" #include "test/common/random_state.h" #include "test/common/rng.h" - +#include "test/common/task_record_check.h" #include "test/common/warp_perspective.h" namespace megdnn { @@ -150,6 +150,133 @@ TEST_F(ARM_COMMON, WARP_PERSPECTIVE_CV) { } } +TEST_F(ARM_COMMON, WARP_PERSPECTIVE_CV_RECORD) { + //! Just for the format NHWC + TaskRecordChecker checker(0); + param::WarpPerspective param; + class ResizeMatRNG : public RNG { + void gen(const TensorND& tensor_) override { + auto& gen = RandomState::generator(); + std::uniform_real_distribution pdist3(1.9f, 3.1f); + std::uniform_real_distribution pdist(0.9f, 1.1f); + std::uniform_real_distribution pdisth(0.4f, 0.6f); + std::uniform_real_distribution ndist(-1.1f, -0.9f); + std::uniform_real_distribution ndist3(-3.1f, -1.9f); + std::uniform_real_distribution ndisth(-0.6f, -0.4f); + std::uniform_int_distribution dice(0, 5); + float* ptr = tensor_.ptr(); + auto N = tensor_.layout.shape[0]; + for (size_t n = 0; n < N; ++n) { + for (size_t i = 0; i < 9; ++i) { + switch (dice(gen)) { + case 0: + ptr[i] = pdist3(gen); + break; + case 1: + ptr[i] = pdist(gen); + break; + case 2: + ptr[i] = pdisth(gen); + break; + case 3: + ptr[i] = ndist(gen); + break; + case 4: + ptr[i] = ndist3(gen); + break; + case 5: + ptr[i] = ndisth(gen); + break; + } + } + // is resize? 
+ if (n & 1) { + ptr[1] = 0; + ptr[3] = 0; + ptr[6] = ptr[7] = 0; + } + ptr += 9; + } + } + } rng; + + using BMode = param::WarpPerspective::BorderMode; + param.format = param::WarpPerspective::Format::NHWC; + // add for nearest test + param.imode = param::WarpPerspective::InterpolationMode::NEAREST; + for (auto mode : + {BMode::REFLECT_101, BMode::REPLICATE, BMode::REFLECT, BMode::WRAP, + BMode::CONSTANT}) { + param.bmode = mode; + param.border_val = 1.737; + checker.set_param(param); + UniformIntRNG rng(0, 9); + checker.set_rng(2, &rng); + checker.set_dtype(2, dtype::Int32()); + checker.exec({{10, 128, 108, 3}, {20, 3, 3}, {20}, {20, 56, 128, 3}}); + } + // resize nan case + UniformFloatRNG rng_zero(0, 0); + checker.set_rng(1, &rng_zero); + { + param.bmode = BMode::CONSTANT; + param.border_val = 1.737; + checker.set_param(param); + UniformIntRNG rng(0, 999); + checker.set_rng(2, &rng); + checker.set_dtype(2, dtype::Int32()); + checker.exec({{1000, 2, 10, 3}, {1000, 3, 3}, {1000}, {1000, 2, 12, 3}}); + } + + // add linear test + param.imode = param::WarpPerspective::InterpolationMode::INTER_LINEAR; + for (auto mode : + {BMode::REFLECT_101, BMode::REPLICATE, BMode::REFLECT, BMode::WRAP, + BMode::CONSTANT}) { + param.bmode = mode; + param.border_val = 1.737; + checker.set_param(param); + UniformIntRNG rng(0, 9); + checker.set_rng(2, &rng); + checker.set_dtype(2, dtype::Int32()); + checker.exec({{10, 128, 108, 3}, {20, 3, 3}, {20}, {20, 56, 128, 3}}); + } + // resize nan case + checker.set_rng(1, &rng_zero); + { + param.bmode = BMode::CONSTANT; + param.border_val = 1.737; + checker.set_param(param); + UniformIntRNG rng(0, 999); + checker.set_rng(2, &rng); + checker.set_dtype(2, dtype::Int32()); + checker.exec({{1000, 2, 10, 3}, {2000, 3, 3}, {2000}, {2000, 2, 12, 3}}); + } + + auto args = warp_perspective::get_cv_args(); + for (auto&& arg : args) { + ConstValue rng(0.f); + checker.set_param(arg.param) + .set_rng(2, &rng) + .set_dtype(0, dtype::Uint8()) + .set_dtype(1, dtype::Float32()) + .set_dtype(2, dtype::Int32()) + .set_dtype(3, dtype::Uint8()) + .execs({arg.src, arg.trans, arg.mat_idx, arg.dst}); + } + + for (auto&& arg : args) { + ConstValue rng(0.f); + checker.set_param(arg.param) + .set_rng(2, &rng) + .set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .set_dtype(2, dtype::Int32()) + .set_dtype(3, dtype::Float32()) + .execs({arg.src, arg.trans, arg.mat_idx, arg.dst}); + } +} + TEST_F(ARM_COMMON_MULTI_THREADS, WARP_PERSPECTIVE_CV) { //! 
Just for the format NHWC Checker checker(handle()); diff --git a/dnn/test/atlas/checksum.cpp b/dnn/test/atlas/checksum.cpp index aae96b78..7980cca3 100644 --- a/dnn/test/atlas/checksum.cpp +++ b/dnn/test/atlas/checksum.cpp @@ -25,7 +25,7 @@ TEST_F(ATLAS, CHECKSUM_FORWARD) { auto aligned_size = size + ((512 - size % 512) % 512); auto run = [&](megdnn::Checksum* opr, void* ptr, bool log_size) { TensorND tensor; - tensor.raw_ptr = ptr; + tensor.reset_ptr(ptr); tensor.layout.init_contiguous_stride({size}); tensor.layout.dtype = dtype::Byte(); WorkspaceWrapper workspace( diff --git a/dnn/test/cambricon/checksum.cpp b/dnn/test/cambricon/checksum.cpp index ad28f63c..63a208f0 100644 --- a/dnn/test/cambricon/checksum.cpp +++ b/dnn/test/cambricon/checksum.cpp @@ -24,7 +24,7 @@ TEST_F(CAMBRICON, CHECKSUM_FORWARD) { auto aligned_size = size + ((512 - size % 512) % 512); auto run = [&](megdnn::Checksum* opr, void* ptr, bool log_size) { TensorND tensor; - tensor.raw_ptr = ptr; + tensor.reset_ptr(ptr); tensor.layout.init_contiguous_stride({size}); tensor.layout.dtype = dtype::Byte(); WorkspaceWrapper workspace( diff --git a/dnn/test/common/accuracy_shake_checker.h b/dnn/test/common/accuracy_shake_checker.h index e22f445d..fc330b8f 100644 --- a/dnn/test/common/accuracy_shake_checker.h +++ b/dnn/test/common/accuracy_shake_checker.h @@ -357,10 +357,10 @@ void AccuracyShakeChecker::init_host_values() { rng = m_default_rng.get(); rng->gen(tensor_single_batch); - dt_byte* raw_storage_cur = static_cast(tensor_cur.raw_ptr) + + dt_byte* raw_storage_cur = static_cast(tensor_cur.raw_ptr()) + tensor_cur.layout.span().low_byte; dt_byte* raw_storage_single_batch = - static_cast(tensor_single_batch.raw_ptr) + + static_cast(tensor_single_batch.raw_ptr()) + tensor_single_batch.layout.span().low_byte; const size_t step = tensor_single_batch.layout.span().dist_byte(); if (tensor_cur.layout.eq_shape(tensor_single_batch.layout)) { diff --git a/dnn/test/common/benchmarker.h b/dnn/test/common/benchmarker.h index 9c8d5045..dfa60c3a 100644 --- a/dnn/test/common/benchmarker.h +++ b/dnn/test/common/benchmarker.h @@ -174,9 +174,9 @@ float BenchmarkerBase::exec(TensorLayoutArray layouts) { auto trans_func = [handle](const TensorLayout& layout) { auto span = layout.span(); TensorND res; - res.raw_ptr = + res.reset_ptr( static_cast(megdnn_malloc(handle, span.dist_byte())) + - span.low_byte; + span.low_byte); res.layout = layout; return res; }; @@ -201,7 +201,7 @@ float BenchmarkerBase::exec(TensorLayoutArray layouts) { if (tensor.layout.ndim == 0) continue; auto size = tensor.layout.span().high_byte; - megdnn_memcpy_H2D(m_handle, tensors_cur[i].raw_ptr, tensor.raw_ptr, size); + megdnn_memcpy_H2D(m_handle, tensors_cur[i].raw_ptr(), tensor.raw_ptr(), size); } if (m_before_exec_callback) { m_before_exec_callback(opr, tensors_cur); @@ -244,7 +244,7 @@ float BenchmarkerBase::exec(TensorLayoutArray layouts) { } auto free = [](Handle* handle, TensorNDArray& tensors) { std::for_each(tensors.begin(), tensors.end(), [handle](const TensorND& tensor) { - megdnn_free(handle, tensor.raw_ptr); + megdnn_free(handle, tensor.raw_ptr()); }); }; free(m_handle, tensors_cur); @@ -282,9 +282,9 @@ float BenchmarkerBase::exect(const TensorValueArray& testcase_in) { auto trans_func = [handle](const TensorLayout& layout) { auto span = layout.span(); TensorND res; - res.raw_ptr = + res.reset_ptr( static_cast(megdnn_malloc(handle, span.dist_byte())) + - span.low_byte; + span.low_byte); res.layout = layout; return res; }; @@ -298,7 +298,7 @@ float 
BenchmarkerBase::exect(const TensorValueArray& testcase_in) { auto size = tensor.layout.span().high_byte; if (tensor.layout.ndim == 0) continue; - megdnn_memcpy_H2D(m_handle, tensors_cur[i].raw_ptr, tensor.raw_ptr, size); + megdnn_memcpy_H2D(m_handle, tensors_cur[i].raw_ptr(), tensor.raw_ptr(), size); } if (m_before_exec_callback) { m_before_exec_callback(opr, tensors_cur); @@ -341,7 +341,7 @@ float BenchmarkerBase::exect(const TensorValueArray& testcase_in) { } auto free = [](Handle* handle, TensorNDArray& tensors) { std::for_each(tensors.begin(), tensors.end(), [handle](const TensorND& tensor) { - megdnn_free(handle, tensor.raw_ptr); + megdnn_free(handle, tensor.raw_ptr()); }); }; free(m_handle, tensors_cur); diff --git a/dnn/test/common/checker.cpp b/dnn/test/common/checker.cpp index db33bb53..ab519410 100644 --- a/dnn/test/common/checker.cpp +++ b/dnn/test/common/checker.cpp @@ -113,7 +113,7 @@ void copy_tensors( auto&& tensor = src[i]; if (tensor.layout.ndim == 0) continue; - memcpy_noncontig(dest[i].raw_ptr, tensor.raw_ptr, tensor.layout, copy_impl); + memcpy_noncontig(dest[i].raw_ptr(), tensor.raw_ptr(), tensor.layout, copy_impl); } } @@ -346,20 +346,19 @@ std::shared_ptr CheckerHelper::alloc_tensors( Handle* handle, const TensorLayoutArray& layouts, const size_t offset) { auto deleter = [handle, offset](TensorValueArray* ptr) { for (auto&& i : *ptr) { - auto pdata = static_cast(i.raw_ptr) + i.layout.span().low_byte - + auto pdata = static_cast(i.raw_ptr()) + i.layout.span().low_byte - offset; megdnn_free(handle, pdata); } delete ptr; }; std::shared_ptr ret{new TensorValueArray, deleter}; + for (size_t i = 0; i < layouts.size(); ++i) { auto span = layouts[i].span(); + auto ptr = megdnn_malloc(handle, span.dist_byte() + offset); ret->emplace_back( - static_cast( - megdnn_malloc(handle, span.dist_byte() + offset)) - - span.low_byte + offset, - layouts[i]); + static_cast(ptr) - span.low_byte + offset, layouts[i]); } return ret; } @@ -376,7 +375,7 @@ void CheckerHelper::init_naive_values() { auto&& src = load[i]; auto&& dst = tensors_naive[i]; megdnn_assert(src->layout.eq_layout(dst.layout)); - memcpy_noncontig(dst.raw_ptr, src->raw_ptr, dst.layout, memcpy); + memcpy_noncontig(dst.raw_ptr(), src->raw_ptr(), dst.layout, memcpy); } return; } diff --git a/dnn/test/common/checker.h b/dnn/test/common/checker.h index 72f25b28..a3fbee1d 100644 --- a/dnn/test/common/checker.h +++ b/dnn/test/common/checker.h @@ -55,6 +55,12 @@ public: Handle* handle() const { return m_handle_cur; } + CheckerHelper() { + auto tmp_handle = create_cpu_handle(2, false); + m_handle_naive = std::move(tmp_handle); + m_default_rng = std::unique_ptr(new NormalRNG()); + } + protected: //! whether to use physically contiguous (i.e. default layout) for naive //! 
impl @@ -111,12 +117,13 @@ protected: void copy_tensors_from_device( const TensorValueArray& dest, const TensorValueArray& src); + void check_tensors( + const TensorValueArray& expected, const TensorValueArray& computed); + private: std::shared_ptr m_tensors_naive; void init_naive_values(); - void check_tensors( - const TensorValueArray& expected, const TensorValueArray& computed); }; template > @@ -439,9 +446,9 @@ Checker& Checker::exect( template TensorND TensorValue( const TensorShape& shape, T dtype, std::initializer_list values) { - TensorND tensor; - tensor.layout = {shape, dtype}; - tensor.raw_ptr = static_cast(malloc(tensor.layout.span().dist_byte())); + TensorLayout layout{shape, dtype}; + auto buf = static_cast(malloc(layout.span().dist_byte())); + TensorND tensor{buf, layout}; megdnn_assert( values.size() == tensor.layout.total_nr_elems(), "%zu == %zu", values.size(), tensor.layout.total_nr_elems()); @@ -454,12 +461,11 @@ TensorND TensorValue( template TensorND TensorValueLowbit4(const TensorShape& shape, T dtype, std::vector values) { - TensorND tensor; - tensor.layout = {shape, dtype}; - tensor.raw_ptr = static_cast(malloc(tensor.layout.span().dist_byte())); + TensorLayout layout{shape, dtype}; + auto buf = static_cast(malloc(layout.span().dist_byte())); + TensorND tensor{buf, layout}; megdnn_assert(values.size() == tensor.layout.total_nr_elems()); auto ptr = tensor.ptr::ctype>(); - auto layout = tensor.layout; auto dim_in = shape[layout.ndim - 1]; auto elems = tensor.layout.total_nr_elems(); auto dim_out = elems / dim_in; @@ -489,8 +495,8 @@ public: ~Testcase() { // Suicide for (const auto& tensor : *this) { - if (tensor.raw_ptr) { - free(tensor.raw_ptr); + if (tensor.raw_ptr()) { + free(tensor.raw_ptr()); } } } diff --git a/dnn/test/common/cond_take.cpp b/dnn/test/common/cond_take.cpp index e284bb05..5f3f7bfa 100644 --- a/dnn/test/common/cond_take.cpp +++ b/dnn/test/common/cond_take.cpp @@ -43,7 +43,7 @@ std::vector CondTakeTestcase::make() { UniformIntRNG rng_byte(0, 255); auto fill_data = [&](TensorND data) { auto sz = data.layout.span().dist_byte(), szf = sz / sizeof(dt_float32); - auto pf = static_cast(data.raw_ptr); + auto pf = static_cast(data.raw_ptr()); data_rng.fill_fast_float32(pf, szf); auto prem = reinterpret_cast(pf + szf); @@ -57,8 +57,8 @@ std::vector CondTakeTestcase::make() { auto size0 = i.m_data.layout.span().dist_byte(), size1 = i.m_mask.layout.span().dist_byte(); i.m_mem.reset(new uint8_t[size0 + size1]); - i.m_data.raw_ptr = i.m_mem.get(); - i.m_mask.raw_ptr = i.m_mem.get() + size0; + i.m_data.reset_ptr(i.m_mem.get()); + i.m_mask.reset_ptr(i.m_mem.get() + size0); fill_data(i.m_data); auto mean = i.m_param.val; diff --git a/dnn/test/common/dct_ref.cpp b/dnn/test/common/dct_ref.cpp index 2dff8cd7..bbe2fe44 100644 --- a/dnn/test/common/dct_ref.cpp +++ b/dnn/test/common/dct_ref.cpp @@ -90,10 +90,10 @@ CheckerHelper::TensorsConstriant gen_dct_constriant( "tensors_orig[2].layout == mask_val.layout"); auto naive_handle = create_cpu_handle(2, false); megdnn_memcpy_D2D( - naive_handle.get(), tensors_orig[1].raw_ptr, mask_offset.raw_ptr, - mask_offset.layout.span().dist_byte()); + naive_handle.get(), tensors_orig[1].raw_ptr(), + mask_offset.raw_ptr(), mask_offset.layout.span().dist_byte()); megdnn_memcpy_D2D( - naive_handle.get(), tensors_orig[2].raw_ptr, mask_val.raw_ptr, + naive_handle.get(), tensors_orig[2].raw_ptr(), mask_val.raw_ptr(), mask_val.layout.span().dist_byte()); } }; diff --git a/dnn/test/common/elemwise.cpp b/dnn/test/common/elemwise.cpp index 
6f503b12..90dfbf86 100644 --- a/dnn/test/common/elemwise.cpp +++ b/dnn/test/common/elemwise.cpp @@ -37,7 +37,7 @@ void fma4_extra_opr_impl(const TensorNDArray& data) { megdnn_assert(data.size() == 5); std::vector tmp_storage(data[4].layout.span().dist_byte()); TensorND tmp; - tmp.raw_ptr = tmp_storage.data(); + tmp.reset_ptr(tmp_storage.data()); tmp.layout = data[4].layout; tmp.layout.init_contiguous_stride(); auto handle = create_cpu_handle(2); diff --git a/dnn/test/common/extra_impl_helper.cpp b/dnn/test/common/extra_impl_helper.cpp index 6ee95779..38a13859 100644 --- a/dnn/test/common/extra_impl_helper.cpp +++ b/dnn/test/common/extra_impl_helper.cpp @@ -35,7 +35,7 @@ std::function extra_impl_helper( type_cvt->exec(fp32_tensors[0], tensors[0]); for (size_t i = 0; i < tensors.size(); ++i) { - free(fp32_tensors[i].raw_ptr); + free(fp32_tensors[i].raw_ptr()); } }; return std::bind(impl, std::placeholders::_1, h, std::cref(p)); diff --git a/dnn/test/common/extra_impl_helper.h b/dnn/test/common/extra_impl_helper.h index 409d9bd6..7ea7513b 100644 --- a/dnn/test/common/extra_impl_helper.h +++ b/dnn/test/common/extra_impl_helper.h @@ -47,7 +47,7 @@ std::function extra_impl_helper( } for (size_t i = 0; i < tensors.size(); ++i) { - free(fp32_tensors[i].raw_ptr); + free(fp32_tensors[i].raw_ptr()); } }; return std::bind(impl, std::placeholders::_1, h, std::cref(p)); diff --git a/dnn/test/common/opr_proxy.h b/dnn/test/common/opr_proxy.h index dc62deb8..42c854ce 100644 --- a/dnn/test/common/opr_proxy.h +++ b/dnn/test/common/opr_proxy.h @@ -172,6 +172,7 @@ struct OprProxy { template <> struct OprProxy { + WorkspaceWrapper W; static void deduce_layout(ConcatForward* opr, TensorLayoutArray& layouts) { megdnn_assert(layouts.size() >= 2); auto inp = layouts; @@ -179,7 +180,10 @@ struct OprProxy { opr->deduce_layout(inp, layouts.back()); } - static void exec(ConcatForward* opr, const TensorNDArray& tensors) { + void exec(ConcatForward* opr, const TensorNDArray& tensors) { + if (!W.valid()) { + W = WorkspaceWrapper(opr->handle(), 0); + } megdnn_assert(tensors.size() >= 2); auto inp = tensors; inp.pop_back(); @@ -191,10 +195,7 @@ struct OprProxy { auto inp_layouts = layouts; inp_layouts.pop_back(); - WorkspaceWrapper W( - opr->handle(), - opr->get_workspace_in_bytes(inp_layouts, layouts.back())); - + W.update(opr->get_workspace_in_bytes(inp_layouts, layouts.back())); auto inp_tensors = tensors; inp_tensors.pop_back(); opr->exec(inp_tensors, tensors.back(), W.workspace()); @@ -203,8 +204,12 @@ struct OprProxy { template <> struct OprProxy : DeduceLayoutProxy { - static void exec(SplitForward* opr, const TensorNDArray& tensors) { + WorkspaceWrapper W; + void exec(SplitForward* opr, const TensorNDArray& tensors) { megdnn_assert(tensors.size() >= 2); + if (!W.valid()) { + W = WorkspaceWrapper(opr->handle(), 0); + } auto out = tensors; out.erase(out.begin()); @@ -215,9 +220,7 @@ struct OprProxy : DeduceLayoutProxy { auto out_layouts = layouts; out_layouts.erase(out_layouts.begin()); - WorkspaceWrapper W( - opr->handle(), - opr->get_workspace_in_bytes(layouts.front(), out_layouts)); + W.update(opr->get_workspace_in_bytes(layouts.front(), out_layouts)); auto out_tensors = tensors; out_tensors.erase(out_tensors.begin()); @@ -249,7 +252,7 @@ struct OprProxyProfilingBase auto deleter = [handle](TensorNDArray* ptr) { for (auto&& i : *ptr) { auto pdata = - static_cast(i.raw_ptr) + i.layout.span().low_byte; + static_cast(i.raw_ptr()) + i.layout.span().low_byte; megdnn_free(handle, pdata); } delete ptr; diff --git 
a/dnn/test/common/rng.cpp b/dnn/test/common/rng.cpp index 19362df9..bd4f54de 100644 --- a/dnn/test/common/rng.cpp +++ b/dnn/test/common/rng.cpp @@ -152,7 +152,7 @@ void IIDRNG::gen(const TensorND& tensor) { cb(::megdnn::dtype::QuantizedS16) #undef cb if (tensor.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) { - auto ptr = static_cast(tensor.raw_ptr); + auto ptr = static_cast(tensor.raw_ptr()); if (output_is_float()) { for (size_t i = 0; i < nr_elems; i += 2) { uint8_t val0 = tensor.layout.dtype.param() @@ -173,7 +173,7 @@ void IIDRNG::gen(const TensorND& tensor) { return; } if (tensor.layout.dtype.enumv() == DTypeEnum::QuantizedS4) { - auto ptr = static_cast(tensor.raw_ptr); + auto ptr = static_cast(tensor.raw_ptr()); if (output_is_float()) { for (size_t i = 0; i < nr_elems; i += 2) { int8_t val0 = tensor.layout.dtype.param() @@ -199,7 +199,7 @@ void IIDRNG::gen(const TensorND& tensor) { return; } if (tensor.layout.dtype.enumv() == DTypeEnum::Byte) { - memset(tensor.raw_ptr, 0, tensor.layout.access_bytes()); + memset(tensor.raw_ptr(), 0, tensor.layout.access_bytes()); return; } if (tensor.layout.dtype.enumv() == DTypeEnum::Uint16) { diff --git a/dnn/test/common/svd.cpp b/dnn/test/common/svd.cpp index b3f23d09..7364d8fb 100644 --- a/dnn/test/common/svd.cpp +++ b/dnn/test/common/svd.cpp @@ -93,13 +93,13 @@ std::vector SVDTestcase::make() { NormalRNG data_rng; auto fill_data = [&](TensorND& data) { auto sz = data.layout.span().dist_byte(), szf = sz / sizeof(dt_float32); - auto pf = static_cast(data.raw_ptr); + auto pf = static_cast(data.raw_ptr()); data_rng.fill_fast_float32(pf, szf); }; for (auto&& i : ret) { i.m_mem.reset(new dt_float32[i.m_mat.layout.span().dist_elem()]); - i.m_mat.raw_ptr = i.m_mem.get(); + i.m_mat.reset_ptr(i.m_mem.get()); fill_data(i.m_mat); } diff --git a/dnn/test/common/task_record_check.h b/dnn/test/common/task_record_check.h new file mode 100644 index 00000000..5373f1f3 --- /dev/null +++ b/dnn/test/common/task_record_check.h @@ -0,0 +1,287 @@ +/** + * \file dnn/test/common/task_record_check.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include +#include +#include "megdnn/oprs.h" +#include "src/common/conv_bias.h" +#include "src/common/utils.h" +#include "src/naive/handle.h" +#include "test/common/checker.h" +#include "test/common/index.h" + +namespace megdnn { +namespace test { + +//!
simulate the task dispatch process +class CpuRecordDispatcher : public MegcoreCPUDispatcher { + std::vector tasks; + bool execute_inplace = false; + +public: + void dispatch(MultiThreadingTask&& task, size_t parallelism) override { + if (execute_inplace) { + for (size_t i = 0; i < parallelism; i++) { + task(i, 0); + } + } else { + tasks.push_back([task, parallelism]() { + for (size_t i = 0; i < parallelism; i++) { + task(i, 0); + } + }); + } + } + + void dispatch(Task&& task) override { + // printf("dispatch one task with execute_inplace = %d\n", execute_inplace); + if (execute_inplace) { + task(); + } else { + tasks.push_back(task); + }; + } + + size_t nr_threads() override { return 1_z; } + + void sync() override {} + + void enable_execute_inplace() { execute_inplace = true; } + + void disable_execute_inplace() { execute_inplace = false; } + + void run_task() { + // printf("size of task : %zu\n", tasks.size()); + for (auto&& task : tasks) { + task(); + } + } + void clear_task() { tasks.clear(); } +}; + +template > +class TaskRecordChecker : public CheckerHelper { + std::shared_ptr m_dispatcher; + std::unique_ptr m_handle; + Proxy m_naive_proxy, m_cur_proxy; + +public: + using Param = typename Opr::Param; + using CheckerHelper::CheckerHelper; + + TaskRecordChecker(int debug_level = 0) { + m_dispatcher = std::make_shared(); + m_handle = create_cpu_handle_with_dispatcher(debug_level, m_dispatcher); + } + + TensorLayoutArray make_layouts(const TensorShapeArray& shapes) { + TensorLayoutArray layouts(shapes.size()); + for (size_t i = 0; i < shapes.size(); ++i) { + DType dt = + (m_dtype.find(i) != m_dtype.end() ? m_dtype[i] : dtype::Float32()); + TensorFormat fmt = + (m_fmt.find(i) != m_fmt.end() ? m_fmt[i] : TensorFormat{}); + layouts[i] = TensorLayout(shapes[i], dt, fmt); + } + return layouts; + } + + /*! + * \brief execute opr on current param/dtype/rng config + * \param shapes input/output shapes, which would be passed as + * arguments to Opr::deduce_layout + * + * Checker would construct TensorLayout vectors from shapes and dtypes, + * and call exec(TensorLayoutArray &). + */ + TaskRecordChecker& exec(const TensorShapeArray& shapes) { + exec(make_layouts(shapes)); + return *this; + } + + void exec(TensorLayoutArray layouts); + + //! explicitly require argument to be TensorShape + TaskRecordChecker& execs(const TensorShapeArray& shapes) { return exec(shapes); } + + //! explicitly require argument to be TensorLayout + TaskRecordChecker& execl(const TensorLayoutArray& layouts) { + exec(layouts); + return *this; + } + + TaskRecordChecker& set_param(Param p) { + m_param = p; + opr()->param() = p; + return *this; + } + TaskRecordChecker& set_dtype(size_t idx, DType dtype) { + m_dtype[idx] = dtype; + return *this; + } + TaskRecordChecker& set_rng(size_t idx, RNG* rng) { + m_rng[idx] = rng; + return *this; + } + + TaskRecordChecker& set_epsilon(dt_float32 epsilon) { + m_epsilon = epsilon; + m_max_avg_error = epsilon; + m_max_avg_biased_error = epsilon; + return *this; + } + + TaskRecordChecker& set_proxy(const Proxy& proxy) { + m_naive_proxy = proxy; + m_cur_proxy = proxy; + return *this; + } + + //!
get the opr impl so setting other than param() can be modified + Opr* opr() { + if (!m_opr_cur) { + m_opr_cur = m_handle->create_operator(); + } + return m_opr_cur.get(); + } + + void free_opr() { + if (m_opr_cur) { + m_opr_cur.reset(); + } + } + + Handle* get_handle() { + megdnn_assert(m_handle); + return m_handle.get(); + } + + void copy_tensors( + const CheckerHelper::TensorValueArray& dest, + const CheckerHelper::TensorValueArray& src) { + megdnn_assert(dest.size() == src.size()); + for (size_t i = 0; i < src.size(); i++) { + auto&& tensor = src[i]; + if (tensor.layout.ndim == 0) + continue; + auto layout = tensor.layout; + auto span = layout.span(); + auto dst_ptr = static_cast(dest[i].raw_ptr()) + span.low_byte; + auto src_ptr = + static_cast(src[i].raw_ptr()) + span.low_byte; + memcpy(dst_ptr, src_ptr, span.dist_byte()); + } + } + +private: + Param m_param; + Proxy m_proxy; + std::unique_ptr m_opr_cur; + std::shared_ptr m_tensors_first, m_tensors_second, + m_tensors_truth; + + std::vector m_recovery_ptrs; + + void init_host_values(); + + void change_tensor_ptr( + std::shared_ptr des, + std::shared_ptr src, std::vector&); + + void recovery_tensor_ptr( + std::shared_ptr src, const std::vector&); +}; + +template +void TaskRecordChecker::exec(TensorLayoutArray layouts) { + auto opr_cur = this->opr(); + opr_cur->param() = m_param; + + m_proxy.deduce_layout(opr_cur, layouts); + for (size_t i = 0; i < layouts.size(); ++i) { + if (layouts[i].dtype == dtype::Byte()) { + layouts[i] = TensorLayout(layouts[i], dtype::Int8()); + } + } + + // allocate input + m_tensors_truth = alloc_tensors(m_handle.get(), layouts, 0); + m_tensors_first = alloc_tensors(m_handle.get(), layouts, 0); + m_tensors_second = alloc_tensors(m_handle.get(), layouts, 0); + + init_host_values(); + + copy_tensors(*m_tensors_first, *m_tensors_truth); + copy_tensors(*m_tensors_second, *m_tensors_truth); + + m_dispatcher->enable_execute_inplace(); + m_proxy.exec(opr_cur, *m_tensors_truth); + + m_dispatcher->clear_task(); + m_dispatcher->disable_execute_inplace(); + //! record the task + m_proxy.exec(opr_cur, *m_tensors_first); + m_dispatcher->run_task(); + + //! if check record2, the opr should be free + // free_opr(); + check_tensors(*m_tensors_truth, *m_tensors_first); + + //! 
change the src and out ptr and run again + change_tensor_ptr(m_tensors_first, m_tensors_second, m_recovery_ptrs); + m_dispatcher->run_task(); + check_tensors(*m_tensors_truth, *m_tensors_second); + + m_dispatcher->clear_task(); + recovery_tensor_ptr(m_tensors_first, m_recovery_ptrs); + m_recovery_ptrs.clear(); +} + +template +void TaskRecordChecker::init_host_values() { + for (size_t i = 0; i < m_tensors_truth->size(); ++i) { + auto&& tensor = (*m_tensors_truth)[i]; + auto rng = m_rng[i]; + if (!rng) + rng = m_default_rng.get(); + rng->gen(tensor); + } +} +template +void TaskRecordChecker::change_tensor_ptr( + std::shared_ptr des, std::shared_ptr src, + std::vector& recovery_ptrs) { + for (size_t i = 0; i < des->size(); ++i) { + auto&& tensor_dest = (*des)[i]; + auto&& tensor_src = (*src)[i]; + megdnn_assert(tensor_dest.layout.eq_layout(tensor_src.layout)); + recovery_ptrs.push_back(tensor_dest.raw_ptr()); + tensor_dest.reset_ptr(tensor_src.raw_ptr()); + } +} + +template +void TaskRecordChecker::recovery_tensor_ptr( + std::shared_ptr src, + const std::vector& recovery_ptrs) { + megdnn_assert(src->size() == recovery_ptrs.size()); + for (size_t i = 0; i < src->size(); ++i) { + auto&& tensor_src = (*src)[i]; + tensor_src.reset_ptr(recovery_ptrs[i]); + } +} + +} // namespace test +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/test/common/tensor.cpp b/dnn/test/common/tensor.cpp index cab32604..462cc8b5 100644 --- a/dnn/test/common/tensor.cpp +++ b/dnn/test/common/tensor.cpp @@ -30,12 +30,11 @@ void test::init_gaussian( std::shared_ptr test::make_tensor_h2d( Handle* handle, const TensorND& htensor) { auto span = htensor.layout.span(); - TensorND ret{nullptr, htensor.layout}; uint8_t* mptr = static_cast(megdnn_malloc(handle, span.dist_byte())); megdnn_memcpy_H2D( - handle, mptr, static_cast(htensor.raw_ptr) + span.low_byte, + handle, mptr, static_cast(htensor.raw_ptr()) + span.low_byte, span.dist_byte()); - ret.raw_ptr = mptr + span.low_byte; + TensorND ret{mptr + span.low_byte, htensor.layout}; auto deleter = [handle, mptr](TensorND* p) { megdnn_free(handle, mptr); delete p; @@ -46,11 +45,10 @@ std::shared_ptr test::make_tensor_h2d( std::shared_ptr test::make_tensor_d2h( Handle* handle, const TensorND& dtensor) { auto span = dtensor.layout.span(); - TensorND ret{nullptr, dtensor.layout}; auto mptr = new uint8_t[span.dist_byte()]; - ret.raw_ptr = mptr + span.low_byte; + TensorND ret{mptr + span.low_byte, dtensor.layout}; megdnn_memcpy_D2H( - handle, mptr, static_cast(dtensor.raw_ptr) + span.low_byte, + handle, mptr, static_cast(dtensor.raw_ptr()) + span.low_byte, span.dist_byte()); auto deleter = [mptr](TensorND* p) { delete[] mptr; diff --git a/dnn/test/common/tensor.inl b/dnn/test/common/tensor.inl index d6a2d073..8023acb8 100644 --- a/dnn/test/common/tensor.inl +++ b/dnn/test/common/tensor.inl @@ -24,13 +24,13 @@ Tensor::Tensor(Handle* handle, TensorLayout layout) : m_handle(handle), m_comparator(C()) { if (!layout.dtype.valid()) layout.dtype = get_dtype_from_static_type(); - m_tensornd.raw_ptr = megdnn_malloc(m_handle, layout.span().dist_byte()); - m_tensornd.layout = layout; + auto raw_ptr = megdnn_malloc(m_handle, layout.span().dist_byte()); + m_tensornd = TensorND{raw_ptr, layout}; } template Tensor::~Tensor() { - megdnn_free(m_handle, m_tensornd.raw_ptr); + megdnn_free(m_handle, m_tensornd.raw_ptr()); } template diff --git a/dnn/test/common/topk.cpp b/dnn/test/common/topk.cpp index 1d4bcc53..bce05e67 100644 --- a/dnn/test/common/topk.cpp +++ 
b/dnn/test/common/topk.cpp @@ -23,7 +23,7 @@ class EqualValueRng final : public RNG { public: void gen(const TensorND& tensor) override { - memset(tensor.raw_ptr, 0, tensor.layout.span().dist_byte()); + memset(tensor.raw_ptr(), 0, tensor.layout.span().dist_byte()); ASSERT_EQ(2u, tensor.layout.ndim); size_t m = tensor.layout[0], n = tensor.layout[1]; for (size_t i = 0; i < m; ++i) { diff --git a/dnn/test/common/utils.cpp b/dnn/test/common/utils.cpp index fdfc5395..1f1facae 100644 --- a/dnn/test/common/utils.cpp +++ b/dnn/test/common/utils.cpp @@ -139,7 +139,7 @@ std::shared_ptr DynOutMallocPolicyImpl::make_output_refholder( const TensorND& out) { using namespace std::placeholders; auto deleter = std::bind(megdnn_free, m_handle, _1); - return {out.raw_ptr, deleter}; + return {out.raw_ptr(), deleter}; } NaivePitchAlignmentScope::NaivePitchAlignmentScope(size_t alignment) diff --git a/dnn/test/common/utils.h b/dnn/test/common/utils.h index 4612ff92..f70418ef 100644 --- a/dnn/test/common/utils.h +++ b/dnn/test/common/utils.h @@ -171,9 +171,6 @@ public: ~CpuDispatchChecker() { if (!std::uncaught_exception()) { megdnn_assert(!m_recursive_dispatch); -#if !MEGDNN_NO_THREAD - megdnn_assert(m_nr_call && "cpu dispatch must be called"); -#endif } else { if (m_recursive_dispatch) { fprintf(stderr, diff --git a/dnn/test/common/warp_perspective.cpp b/dnn/test/common/warp_perspective.cpp index ab3f1991..124e85ea 100644 --- a/dnn/test/common/warp_perspective.cpp +++ b/dnn/test/common/warp_perspective.cpp @@ -13,6 +13,7 @@ #include "test/common/warp_perspective.h" #include "test/common/benchmarker.h" #include "test/common/checker.h" +#include "test/common/task_record_check.h" using namespace megdnn; using namespace test; @@ -142,6 +143,46 @@ void warp_perspective::run_mat_idx_test(Handle* handle) { checker.execs({{N_SRC, 10, 11, 3}, {2, 3, 3}, {2}, {2, 11, 12, 3}}); } +void warp_perspective::run_int8_test_record(int debug_level) { + using Param = WarpPerspective::Param; + TaskRecordChecker checker(debug_level); + UniformIntRNG input_rng{-128, 127}; + WarpPerspectiveMatRNG mat_rng; + class ResizeBy2xMatRNG : public RNG { + void gen(const TensorND& tensor_) override { + float* ptr = tensor_.ptr(); + auto N = tensor_.layout.shape[0]; + megdnn_assert( + tensor_.layout.is_contiguous() && tensor_.layout.ndim == 3 && + tensor_.layout[1] == 3 && tensor_.layout[2] == 3); + for (size_t n = 0; n < N; ++n) { + // | 1 0 0 | + // mat = | 0 1 0 | + // | 0 0 2 | + // resize_2x + ptr[0] = ptr[4] = 1; + ptr[8] = 2; + ptr[1] = ptr[2] = ptr[3] = ptr[5] = ptr[6] = ptr[7] = 0; + ptr += 9; + } + } + } resize_2x_mat_rng; + checker.set_rng(0, &input_rng) + .set_rng(1, &mat_rng) + .set_dtype(0, dtype::Int8()) + .set_dtype(1, dtype::Float32()) + .set_dtype(2, dtype::Int8()) + .set_param( + {Param::InterpolationMode::LINEAR, Param::BorderMode::CONSTANT, + Param::Format::NCHW, 0.f}); + checker.execs({{99, 48, 17, 17}, {99, 3, 3}, {99, 48, 22, 22}}) + .execs({{12, 3, 224, 224}, {12, 3, 3}, {12, 3, 256, 256}}); + + checker.set_rng(1, &resize_2x_mat_rng); + checker.execs({{98, 48, 17, 17}, {98, 3, 3}, {98, 48, 34, 34}}) + .execs({{13, 3, 224, 224}, {13, 3, 3}, {13, 3, 448, 448}}); +} + void warp_perspective::run_int8_test(Handle* handle) { using Param = WarpPerspective::Param; Checker checker(handle); diff --git a/dnn/test/common/warp_perspective.h b/dnn/test/common/warp_perspective.h index 4baaba35..c4a9792f 100644 --- a/dnn/test/common/warp_perspective.h +++ b/dnn/test/common/warp_perspective.h @@ -130,6 +130,7 @@ std::vector 
get_cv_args(); void run_mat_idx_test(Handle* handle); void run_int8_test(Handle* handle); +void run_int8_test_record(int debug_level); void run_quint8_test(Handle* handle); } // namespace warp_perspective diff --git a/dnn/test/cuda/adaptive_pooling.cpp b/dnn/test/cuda/adaptive_pooling.cpp index 209d33a0..fe2d0b1b 100644 --- a/dnn/test/cuda/adaptive_pooling.cpp +++ b/dnn/test/cuda/adaptive_pooling.cpp @@ -57,8 +57,8 @@ TEST_F(CUDA, ADAPTIVE_POOLING_BACKWARD) { auto&& tensors_cuda = *tensors_cuda_storage; auto span = tensors_cuda[0].layout.span(); - auto dst = static_cast(tensors_cuda[0].raw_ptr) + span.low_byte; - auto src = static_cast(tensors_orig[0].raw_ptr) + + auto dst = static_cast(tensors_cuda[0].raw_ptr()) + span.low_byte; + auto src = static_cast(tensors_orig[0].raw_ptr()) + span.low_byte; megdnn_memcpy_H2D(handle_cuda(), dst, src, span.dist_byte()); @@ -70,8 +70,9 @@ TEST_F(CUDA, ADAPTIVE_POOLING_BACKWARD) { megdnn_free(handle_cuda(), workspace_cuda); span = tensors_cuda[1].layout.span(); - dst = static_cast(tensors_orig[1].raw_ptr) + span.low_byte; - src = static_cast(tensors_cuda[1].raw_ptr) + span.low_byte; + dst = static_cast(tensors_orig[1].raw_ptr()) + span.low_byte; + src = static_cast(tensors_cuda[1].raw_ptr()) + + span.low_byte; megdnn_memcpy_D2H(handle_cuda(), dst, src, span.dist_byte()); }; diff --git a/dnn/test/cuda/bn.cpp b/dnn/test/cuda/bn.cpp index baed2bca..92540aba 100644 --- a/dnn/test/cuda/bn.cpp +++ b/dnn/test/cuda/bn.cpp @@ -29,7 +29,7 @@ TEST_F(CUDA, BN_FORWARD_BACKWARD) { using cuda::cudnn_handle; using cuda::batch_normalization::BNTensorDescHolder; using cuda::batch_normalization::get_reserve_size; - std::vector args = get_args(); + std::vector args = batch_normalization::get_args(); Checker checker(handle_cuda()); Checker checker_bwd(handle_cuda()); for (auto&& arg : args) { diff --git a/dnn/test/cuda/checksum.cpp b/dnn/test/cuda/checksum.cpp index 9d6eda63..e3f5821b 100644 --- a/dnn/test/cuda/checksum.cpp +++ b/dnn/test/cuda/checksum.cpp @@ -24,7 +24,7 @@ TEST_F(CUDA, CHECKSUM_FORWARD) { auto aligned_size = size + ((512 - size % 512) % 512); auto run = [&](megdnn::Checksum* opr, void* ptr, bool log_size) { TensorND tensor; - tensor.raw_ptr = ptr; + tensor.reset_ptr(ptr); tensor.layout.init_contiguous_stride({size}); tensor.layout.dtype = dtype::Byte(); WorkspaceWrapper workspace( diff --git a/dnn/test/cuda/elemwise.cpp b/dnn/test/cuda/elemwise.cpp index 97027e2a..ae4b5fe5 100644 --- a/dnn/test/cuda/elemwise.cpp +++ b/dnn/test/cuda/elemwise.cpp @@ -52,7 +52,7 @@ void run_tensor_add( cudnn_check(cudnnCreate(&cudnn_handle)); cuda_check(cudaDeviceSynchronize()); cuda_check(cudaMemcpy( - c.raw_ptr, a.raw_ptr, a.layout.span().dist_byte(), + c.raw_ptr(), a.raw_ptr(), a.layout.span().dist_byte(), cudaMemcpyDeviceToDevice)); auto bdesc = make_cudnn_tensor_desc(b.layout), @@ -61,14 +61,14 @@ void run_tensor_add( float alpha = 1, beta = 1; cudaProfilerStart(); cudnn_check(cudnnAddTensor( - cudnn_handle, &alpha, bdesc, b.raw_ptr, &beta, cdesc, c.raw_ptr)); + cudnn_handle, &alpha, bdesc, b.raw_ptr(), &beta, cdesc, c.raw_ptr())); cudaProfilerStop(); cudnn_check(cudnnDestroyTensorDescriptor(cdesc)); cudnn_check(cudnnDestroyTensorDescriptor(bdesc)); cudnn_check(cudnnDestroy(cudnn_handle)); - cuda_check(cudaMemset(c.raw_ptr, 0, c.layout.span().dist_byte())); + cuda_check(cudaMemset(c.raw_ptr(), 0, c.layout.span().dist_byte())); cuda_check(cudaDeviceSynchronize()); #endif diff --git a/dnn/test/cuda/pooling.cpp b/dnn/test/cuda/pooling.cpp index 2c4187d6..4fb03048 100644 --- 
a/dnn/test/cuda/pooling.cpp +++ b/dnn/test/cuda/pooling.cpp @@ -108,8 +108,8 @@ TEST_F(CUDA, POOLING_BACKWARD) { auto&& tensors_cuda = *tensors_cuda_storage; auto span = tensors_cuda[0].layout.span(); - auto dst = static_cast(tensors_cuda[0].raw_ptr) + span.low_byte; - auto src = static_cast(tensors_orig[0].raw_ptr) + + auto dst = static_cast(tensors_cuda[0].raw_ptr()) + span.low_byte; + auto src = static_cast(tensors_orig[0].raw_ptr()) + span.low_byte; megdnn_memcpy_H2D(handle_cuda(), dst, src, span.dist_byte()); @@ -121,8 +121,9 @@ TEST_F(CUDA, POOLING_BACKWARD) { megdnn_free(handle_cuda(), workspace_cuda); span = tensors_cuda[1].layout.span(); - dst = static_cast(tensors_orig[1].raw_ptr) + span.low_byte; - src = static_cast(tensors_cuda[1].raw_ptr) + span.low_byte; + dst = static_cast(tensors_orig[1].raw_ptr()) + span.low_byte; + src = static_cast(tensors_cuda[1].raw_ptr()) + + span.low_byte; megdnn_memcpy_D2H(handle_cuda(), dst, src, span.dist_byte()); }; @@ -174,8 +175,8 @@ TEST_F(CUDA, POOLING_BACKWARD) { auto&& tensors_cuda = *tensors_cuda_storage; auto span = tensors_cuda[0].layout.span(); - auto dst = static_cast(tensors_cuda[0].raw_ptr) + span.low_byte; - auto src = static_cast(tensors_orig[0].raw_ptr) + + auto dst = static_cast(tensors_cuda[0].raw_ptr()) + span.low_byte; + auto src = static_cast(tensors_orig[0].raw_ptr()) + span.low_byte; megdnn_memcpy_H2D(handle_cuda(), dst, src, span.dist_byte()); @@ -187,8 +188,9 @@ TEST_F(CUDA, POOLING_BACKWARD) { megdnn_free(handle_cuda(), workspace_cuda); span = tensors_cuda[1].layout.span(); - dst = static_cast(tensors_orig[1].raw_ptr) + span.low_byte; - src = static_cast(tensors_cuda[1].raw_ptr) + span.low_byte; + dst = static_cast(tensors_orig[1].raw_ptr()) + span.low_byte; + src = static_cast(tensors_cuda[1].raw_ptr()) + + span.low_byte; megdnn_memcpy_D2H(handle_cuda(), dst, src, span.dist_byte()); }; diff --git a/dnn/test/fallback/concat.cpp b/dnn/test/fallback/concat.cpp index b565b842..69eede6e 100644 --- a/dnn/test/fallback/concat.cpp +++ b/dnn/test/fallback/concat.cpp @@ -11,7 +11,7 @@ #include "test/fallback/fixture.h" #include "test/common/checker.h" - +#include "test/common/task_record_check.h" namespace megdnn { namespace test { @@ -35,6 +35,21 @@ TEST_F(FALLBACK, CONCAT) { } } } +TEST_F(FALLBACK, CONCAT_RECORD) { + TaskRecordChecker checker(1); + using Param = Concat::Param; + + Param param; + param.axis = 0; + TensorShapeArray shapes(4, TensorShape({12, 13, 14, 15})); + for (size_t i = 0; i < 4; ++i) { + shapes[i].shape[0] = i + 1; + } + shapes.emplace_back(); + for (size_t i = 0; i < shapes.size(); ++i) + checker.set_dtype(i, dtype::Float32()); + checker.set_param(param).exec(shapes); +} } // namespace test } // namespace megdnn diff --git a/dnn/test/fallback/conv_bias.cpp b/dnn/test/fallback/conv_bias.cpp index 6c8480b6..7b20108a 100644 --- a/dnn/test/fallback/conv_bias.cpp +++ b/dnn/test/fallback/conv_bias.cpp @@ -15,9 +15,9 @@ #include "test/common/benchmarker.h" #include "test/common/checker.h" #include "test/common/rng.h" +#include "test/common/task_record_check.h" #include "test/common/tensor.h" #include "test/fallback/fixture.h" - #if MEGDNN_X86 #include "src/x86/utils.h" #endif @@ -77,6 +77,46 @@ TEST_F(FALLBACK, CONV_BIAS_FORWARD) { } } +TEST_F(FALLBACK, CONV_BIAS_FORWARD_RECORD) { + using namespace conv_bias; + TaskRecordChecker checker(1); + NormalRNG default_rng; + UniformIntRNG int_rng{-50, 50}; + param::ConvBias param; + { + param.format = param::ConvBias::Format::NHWC; + auto src_shape = TensorShape{2, 16, 
32, 24}; + auto filter_shape = TensorShape{4, 3, 3, 24}; + auto bias_shape_channel = TensorShape{1, 1, 1, 4}; + checker.set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .set_dtype(2, dtype::Float32()) + .set_rng(0, &default_rng) + .set_rng(1, &default_rng) + .set_rng(2, &default_rng) + .set_param(param) + .execs({src_shape, filter_shape, bias_shape_channel, {}, {}}); + } + + { + param.format = param::ConvBias::Format::NCHW; + param.sparse = ConvBias::Param::Sparse::GROUP; + auto src_shape = TensorShape{2, 16, 32, 24}; + auto filter_shape = TensorShape{4, 4, 4, 1, 1}; + auto bias_shape_channel = TensorShape{1, 16, 1, 1}; + auto bias_shape = TensorShape{2, 16, 32, 24}; + checker.set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .set_dtype(2, dtype::Float32()) + .set_rng(0, &default_rng) + .set_rng(1, &default_rng) + .set_rng(2, &default_rng) + .set_param(param) + .execs({src_shape, filter_shape, bias_shape, {}, {}}) + .execs({src_shape, filter_shape, bias_shape_channel, {}, {}}); + } +} + std::vector get_conv_bias_args( std::vector kernel, std::vector padv, std::vector nlmodev, std::vector stridev, diff --git a/dnn/test/fallback/convolution.cpp b/dnn/test/fallback/convolution.cpp index 06296556..84515f10 100644 --- a/dnn/test/fallback/convolution.cpp +++ b/dnn/test/fallback/convolution.cpp @@ -17,10 +17,32 @@ #include "test/common/convolution.h" #include "test/common/rng.h" - +#include "test/common/task_record_check.h" using namespace megdnn; using namespace test; - +namespace megdnn { +namespace test { +TEST_F(FALLBACK, CONVOLUTION_MATRIX_MUL_RECORD) { + using Param = Convolution::Param; + TaskRecordChecker checker(1); + NormalRNG default_rng; + UniformIntRNG int_rng{-50, 50}; + Param param; + param.stride_h = 2; + param.stride_w = 2; + param.pad_h = 3 / 2; + param.pad_w = 3 / 2; + param.pad_h = 0; + param.pad_w = 0; + checker.set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .set_rng(0, &default_rng) + .set_rng(1, &default_rng) + .set_param(param) + .execs({{1, 3, 20, 40}, {24, 3, 3, 3}, {}}); +} +} // namespace test +} // namespace megdnn #if MEGDNN_WITH_BENCHMARK TEST_F(FALLBACK, BENCHMARK_CONVOLUTION_MATRIX_MUL) { @@ -452,6 +474,53 @@ TEST_F(FALLBACK, CONVOLUTION_BACKWARD_DATA) { } } +TEST_F(FALLBACK, CONVOLUTION_BACKWARD_DATA_RECORD) { + TaskRecordChecker checker(1); + using Param = ConvolutionBackwardData::Param; + + Param param; + auto run = [&](size_t n, size_t ic, size_t oh, size_t ow, size_t oc, size_t fh, + size_t fw, size_t stride, size_t padding, size_t dilate = 1, + size_t group = 1) { + param.pad_h = param.pad_w = padding; + param.stride_h = param.stride_w = stride; + param.dilate_h = param.dilate_w = dilate; + + TensorLayout diff = TensorLayout{{n, oc * group, oh, ow}, dtype::Float32()}; + TensorLayout grad; + TensorLayout filter; + if (group == 1) { + param.sparse = Param::Sparse::DENSE; + filter = {{oc, ic, fh, fw}, dtype::Float32()}; + } else { + param.sparse = Param::Sparse::GROUP; + filter = {{group, oc, ic, fh, fw}, dtype::Float32()}; + } + // TensorLayout grad; + { + auto opr = handle()->create_operator(); + opr->param() = param; + opr->deduce_layout(filter, diff, grad); + } + checker.set_param(param) + .set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()); + checker.exec(TensorLayoutArray{filter, diff, grad}); + }; + + for (auto mode : {Param::Mode::CONVOLUTION, Param::Mode::CROSS_CORRELATION}) { + param.mode = mode; + run(4, 3, 10, 13, 5, 1, 1, 1, 0, 1, 1); + run(5, 5, 24, 43, 11, 9, 3, 3, 12, 1, 2); + 
run(4, 3, 10, 45, 2, 1, 1, 1, 0, 4, 3); + run(2, 3, 9, 12, 2, 4, 6, 1, 0, 1, 2); + run(3, 4, 17, 32, 2, 3, 2, 5, 4, 4, 3); + run(5, 5, 24, 43, 11, 9, 3, 3, 12, 2, 2); + run(2, 3, 20, 33, 3, 5, 7, 4, 15, 2, 3); + run(4, 4, 6, 7, 9, 3, 2, 2, 1, 3, 2); + } +} + TEST_F(FALLBACK, CONVOLUTION_BACKWARD_DATA_INT8_INT8_INT32) { Checker checker(handle()); using Param = ConvolutionBackwardData::Param; diff --git a/dnn/test/fallback/elemwise.cpp b/dnn/test/fallback/elemwise.cpp index 08541fe0..0f5e3c37 100644 --- a/dnn/test/fallback/elemwise.cpp +++ b/dnn/test/fallback/elemwise.cpp @@ -11,12 +11,12 @@ #include "test/fallback/fixture.h" +#include #include "test/common/checker.h" #include "test/common/elemwise.h" +#include "test/common/task_record_check.h" #include "test/common/tensor.h" -#include - using namespace megdnn; using namespace test; @@ -26,6 +26,18 @@ TYPED_TEST_CASE(FALLBACK_ELEMWISE, elemwise::test_types); TYPED_TEST(FALLBACK_ELEMWISE, run) { elemwise::run_test(this->handle()); } +TEST_F(FALLBACK, ELEMWISE_RECORD) { + TaskRecordChecker checker{1}; + checker.set_param({Elemwise::Mode::ADD}); + checker.set_dtype(0, dtype::Float32()); + checker.set_dtype(1, dtype::Float32()); + checker.set_dtype(2, dtype::Float32()); + UniformIntRNG rng{-100, 100}; + checker.set_rng(0, &rng); + checker.set_rng(1, &rng); + checker.set_rng(2, &rng); + checker.execs({{10, 10, 32}, {10, 10, 32}, {}}); +} #if MEGDNN_WITH_BENCHMARK TEST_F(FALLBACK, BENCHMARK_ELEMWISE) { auto naive_handle = create_cpu_handle(2); diff --git a/dnn/test/fallback/elemwise_multi_type.cpp b/dnn/test/fallback/elemwise_multi_type.cpp index aac87053..a51767ce 100644 --- a/dnn/test/fallback/elemwise_multi_type.cpp +++ b/dnn/test/fallback/elemwise_multi_type.cpp @@ -11,8 +11,8 @@ #include "test/common/elemwise_multi_type.h" #include "test/common/benchmarker.h" +#include "test/common/task_record_check.h" #include "test/fallback/fixture.h" - using namespace megdnn; using namespace test; @@ -25,6 +25,21 @@ TYPED_TEST_CASE(FALLBACK_ELEMWISE_MULTI_TYPE, elemwise_multi_type::test_types); TYPED_TEST(FALLBACK_ELEMWISE_MULTI_TYPE, run) { elemwise_multi_type::run_test(this->handle()); } + +TEST_F(FALLBACK, ELEMWISE_MULTI_TYPE_RECORD_FMA3_INT16x32x32x32) { + TaskRecordChecker checker{1}; + checker.set_param({ElemwiseMultiType::Mode::FUSE_MUL_ADD3_INT16x32x32x32}); + checker.set_dtype(0, dtype::Int16()); + checker.set_dtype(1, dtype::Int32()); + checker.set_dtype(2, dtype::Int32()); + UniformIntRNG rng{-10, 10}; + checker.set_rng(0, &rng); + checker.set_rng(1, &rng); + checker.set_rng(2, &rng); + constexpr size_t A = 32, B = 602, C = 103; + checker.execs({{A, B, C}, {1, B, 1}, {1, B, 1}, {}}); +} + #if MEGDNN_WITH_BENCHMARK TEST_F(FALLBACK, ELEMWISE_MULTI_TYPE_BENCHMARK_FMA3_INT16x32x32x32) { Benchmarker bench{handle()}; diff --git a/dnn/test/fallback/flip.cpp b/dnn/test/fallback/flip.cpp index c911fab2..78b26070 100644 --- a/dnn/test/fallback/flip.cpp +++ b/dnn/test/fallback/flip.cpp @@ -15,9 +15,9 @@ #include "megdnn/oprs.h" #include "test/common/checker.h" #include "test/common/flip.h" +#include "test/common/task_record_check.h" #include "test/common/tensor.h" #include "test/fallback/fixture.h" - namespace megdnn { namespace test { @@ -32,6 +32,17 @@ TEST_F(FALLBACK, FLIP) { checker.execs({arg.src, {}}); } } +TEST_F(FALLBACK, FLIP_RECORD) { + using namespace flip; + std::vector args = get_args(); + TaskRecordChecker checker(0); + checker.set_dtype(0, dtype::Int32()); + checker.set_dtype(1, dtype::Int32()); + + for (auto&& arg : args) { + 
checker.execs({arg.src, {}}); + } +} } // namespace test } // namespace megdnn diff --git a/dnn/test/fallback/gaussian_blur.cpp b/dnn/test/fallback/gaussian_blur.cpp index aec65830..f31d6bbc 100644 --- a/dnn/test/fallback/gaussian_blur.cpp +++ b/dnn/test/fallback/gaussian_blur.cpp @@ -10,8 +10,8 @@ */ #include "test/common/gaussian_blur.h" #include "test/common/checker.h" +#include "test/common/task_record_check.h" #include "test/fallback/fixture.h" - namespace megdnn { namespace test { @@ -35,7 +35,26 @@ TEST_F(FALLBACK, GAUSSIAN_BLUR) { .execs({arg.src, {}}); } } +TEST_F(FALLBACK, GAUSSIAN_BLUR_RECORD) { + using namespace gaussian_blur; + std::vector args = get_args(); + TaskRecordChecker checker(1); + + for (auto&& arg : args) { + checker.set_param(arg.param) + .set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .execs({arg.src, {}}); + } + for (auto&& arg : args) { + checker.set_param(arg.param) + .set_epsilon(1 + 1e-3) + .set_dtype(0, dtype::Uint8()) + .set_dtype(1, dtype::Uint8()) + .execs({arg.src, {}}); + } +} } // namespace test } // namespace megdnn // vim: syntax=cpp.doxygen diff --git a/dnn/test/fallback/matrix_mul.cpp b/dnn/test/fallback/matrix_mul.cpp index f1d65d88..63814628 100644 --- a/dnn/test/fallback/matrix_mul.cpp +++ b/dnn/test/fallback/matrix_mul.cpp @@ -11,8 +11,8 @@ #include "test/common/matrix_mul.h" #include "test/common/checker.h" #include "test/common/rng.h" +#include "test/common/task_record_check.h" #include "test/fallback/fixture.h" - namespace megdnn { namespace test { @@ -45,6 +45,35 @@ TEST_F(FALLBACK, MATRIX_MUL) { checker.execl({AL, BL, CL}); } } +TEST_F(FALLBACK, MATRIX_MUL_RECORD) { + TaskRecordChecker checker(1); + using Param = MatrixMul::Param; + auto args = matrix_mul::get_matmul_args(); + for (auto arg : args) { + auto m = arg.m, n = arg.n, k = arg.k; + auto mask = arg.mask; + Param param; + param.transposeA = mask & 1; + param.transposeB = mask & 2; + TensorShape AS, BS, CS; + + if (param.transposeA) + AS = TensorShape{k, m}; + else + AS = TensorShape{m, k}; + if (param.transposeB) + BS = TensorShape{n, k}; + else + BS = TensorShape{k, n}; + CS = TensorShape{m, n}; + TensorLayout AL, BL, CL; + AL = TensorLayout(AS, dtype::Float32()); + BL = TensorLayout(BS, dtype::Float32()); + CL = TensorLayout(CS, dtype::Float32()); + checker.set_param(param); + checker.execl({AL, BL, CL}); + } +} TEST_F(FALLBACK, MATRIX_MUL_NAIVE_MK4) { matrix_mul::check_matrix_mul( diff --git a/dnn/test/fallback/powc.cpp b/dnn/test/fallback/powc.cpp index 83ad26ac..4894871e 100644 --- a/dnn/test/fallback/powc.cpp +++ b/dnn/test/fallback/powc.cpp @@ -10,8 +10,8 @@ */ #include "test/common/powc.h" +#include "test/common/task_record_check.h" #include "test/fallback/fixture.h" - using namespace megdnn; using namespace test; @@ -19,6 +19,70 @@ TEST_F(FALLBACK, POW_C_F32) { run_powc_test(handle(), dtype::Float32{}); } +TEST_F(FALLBACK, POW_C_F32_RECORD) { + TaskRecordChecker checker(1); + auto dtype = dtype::Float32{}; + checker.set_dtype(0, dtype); + + float dt_val_max; + if (dtype == dtype::Float32{}) { + dt_val_max = DTypeTrait::max(); + } else { + megdnn_assert(dtype == dtype::Float16{}); + dt_val_max = DTypeTrait::max(); + checker.set_epsilon(1e-2); + } + + dt_val_max /= 4; + + for (float exp : + {0.f, 1.f / 3.f, 1.f / 3.f + 0.01f, .5f, 1.f, 1.2f, 2.f, 3.f, 4.f, 7.f, 8.f}) { + float rng_max = + exp ? 
std::pow(dt_val_max, std::min(1.f / exp, 1.f)) : dt_val_max; + bool allow_neg; + { + auto d = exp - std::floor(exp); + if (d >= .1f) { + allow_neg = false; + } else { + allow_neg = true; + } + } + UniformFloatRNG rng0{-rng_max, rng_max}, rng1{0.f, rng_max}; + checker.set_rng(0, allow_neg ? &rng0 : &rng1); + checker.set_param(exp); + checker.execs({TensorShape{23, 34}, {}}); + if (::testing::Test::HasFailure()) { + printf("failed for %g\n", exp); + return; + } + + UniformFloatNonZeroRNG rng2{1.f / rng_max, dt_val_max}; + UniformFloatRNG rng3{1.f / rng_max, dt_val_max}; + if (allow_neg) { + checker.set_rng(0, &rng2); + } else { + checker.set_rng(0, &rng3); + } + checker.set_param(-exp); + checker.execs({TensorShape{3, 7, 2}, {}}); + if (::testing::Test::HasFailure()) { + printf("failed for %g\n", -exp); + return; + } + + // non contig + TensorLayout layout{{4, 9}, dtype}; + layout.stride[0] *= 3; + layout.stride[1] *= 2; + checker.execl({layout, {}}); + if (::testing::Test::HasFailure()) { + printf("failed for %g noncontig\n", -exp); + return; + } + } +} + #if !MEGDNN_DISABLE_FLOAT16 TEST_F(FALLBACK, POW_C_F16) { run_powc_test(handle(), dtype::Float16{}); diff --git a/dnn/test/fallback/reduce.cpp b/dnn/test/fallback/reduce.cpp index 8b393411..0a8de3b6 100644 --- a/dnn/test/fallback/reduce.cpp +++ b/dnn/test/fallback/reduce.cpp @@ -12,9 +12,9 @@ #include "megdnn/oprs.h" #include "test/common/checker.h" +#include "test/common/task_record_check.h" #include "test/common/tensor.h" #include "test/common/workspace_wrapper.h" - using namespace megdnn; using namespace test; @@ -130,4 +130,116 @@ TEST_F(FALLBACK, REDUCE) { } } +TEST_F(FALLBACK, REDUCE_RECORD) { + using Param = Reduce::Param; + using Mode = Param::Mode; + using DataType = Param::DataType; + TaskRecordChecker checker(1); + struct Config { + Param param; + DType dtype; + TensorShape shape; + Config(Param param, DType dtype, TensorShape shape) + : param(param), dtype(dtype), shape(shape) {} + }; + std::vector configs; + // general + for (auto mode : + {Mode::SUM, Mode::MEAN, Mode::SUM_SQR, Mode::PRODUCT, Mode::MIN, Mode::MAX}) + for (auto dtype : std::vector{ + dtype::Float16(), dtype::Float32(), dtype::Int32(), dtype::Int16(), + dtype::Int8(), dtype::Uint8()}) + for (int32_t axis : {0, 1, 2, 3}) { + TensorShape shape{2, 3, 20, 5}; + Param param(mode, axis); + Config config(param, dtype, shape); + configs.push_back(config); + if (dtype.category() == DTypeCategory::FLOAT) { + Param param(mode, axis, DataType::FLOAT_O16xC32); + Config config(param, dtype, shape); + configs.push_back(config); + + param.data_type = DataType::FLOAT_O32xC32; + config = Config(param, dtype, shape); + configs.push_back(config); + } else if (dtype == dtype::Int32()) { + Param param(mode, axis, DataType::FLOAT_O32xC32); + Config config(param, dtype, shape); + configs.push_back(config); + } + } + // large (ABC) -> (A1C) case + for (auto mode : {Mode::SUM_SQR}) + for (auto dtype : std::vector{dtype::Int32()}) + for (int32_t axis : {0, 1, 2, 3}) { + TensorShape shape{2, 3, 10000, 5}; + Param param(mode, axis); + Config config(param, dtype, shape); + configs.push_back(config); + } + // large (AB) -> (A1) case + for (auto mode : {Mode::SUM_SQR}) + for (auto dtype : std::vector{dtype::Int32()}) + for (int32_t axis : {0, 1, 2, 3}) { + TensorShape shape{2, 3, 5, 10000}; + Param param(mode, axis); + Config config(param, dtype, shape); + configs.push_back(config); + } + + { + // large reduce_mean for O16C32 + TensorShape shape{1, 65536, 5}; + Param param(Mode::MEAN, 1, 
DataType::FLOAT_O16xC32); + Config config(param, dtype::Float16(), shape); + configs.push_back(config); + } + + for (auto&& config : configs) { + auto&& dtype = config.dtype; + auto&& param = config.param; + auto&& mode = config.param.mode; + auto&& shape = config.shape; + auto&& data_type = config.param.data_type; + // when input/output both float16, the internal compute is float16, mode + // is SUM or SUM_SQR, need set epsilon to 1e-2 to pass test + if (dtype == dtype::Float16() && data_type == DataType::DEFAULT && + (mode == Mode::SUM || mode == Mode::SUM_SQR)) { + checker.set_epsilon(1e-2); + } + + checker.set_dtype(0, dtype).set_param(param).execs({shape, {}}); + } + { + static size_t N = 1 << 26; + { + // cpu vs naive + TaskRecordChecker checker(1); + Reduce::Param param; + param.axis = 0; + UniformFloatRNG rng(1, 1); + checker.set_param(param); + checker.set_rng(0, &rng); + checker.execs({{N}, {}}); + } + { + // naive vs groundtruth + TensorLayout layoutN(TensorShape{N}, dtype::Float32()), + layout1(TensorShape{1}, dtype::Float32()); + auto handle = this->handle(); + Tensor src(handle, layoutN), dst(handle, layout1); + float* ptr = src.ptr(); + for (size_t i = 0; i < N; ++i) + ptr[i] = 1; + auto opr = handle->create_operator(); + opr->param().axis = 0; + auto wsize = opr->get_workspace_in_bytes(layoutN, layout1); + WorkspaceWrapper workspace(handle, wsize); + opr->exec(src.tensornd(), dst.tensornd(), workspace.workspace()); + megdnn_sync(handle); + ASSERT_EQ(N, dst.ptr()[0]); + } + } +} + // vim: syntax=cpp.doxygen diff --git a/dnn/test/fallback/relayout.cpp b/dnn/test/fallback/relayout.cpp index 0492407f..1f82a3e2 100644 --- a/dnn/test/fallback/relayout.cpp +++ b/dnn/test/fallback/relayout.cpp @@ -15,9 +15,9 @@ #include "test/common/relayout.h" #include "test/common/tensor.h" -#include "megdnn/basic_types.h" - #include +#include "megdnn/basic_types.h" +#include "test/common/task_record_check.h" using namespace megdnn; using namespace test; @@ -30,6 +30,21 @@ TYPED_TEST(FALLBACK_RELAYOUT, run) { relayout::run_test(this->handle()); } } // namespace + +TEST_F(FALLBACK, RELAYOUT_CONTINUE) { + Checker checker(handle()); + checker.set_dtype(0, dtype::Int32()); + checker.set_dtype(1, dtype::Int32()); + checker.exec({{2, 2, 2}, {2, 2, 2}}); +} + +TEST_F(FALLBACK, RELAYOUT_RECORD) { + TaskRecordChecker checker(1); + checker.set_dtype(0, dtype::Int32()); + checker.set_dtype(1, dtype::Int32()); + checker.exec({{2, 2, 2}, {2, 2, 2}}); +} + #if MEGDNN_WITH_BENCHMARK TEST_F(FALLBACK, BENCHMARK_RELAYOUT_CV) { relayout::run_cv_benchmark(handle()); diff --git a/dnn/test/fallback/repeat.cpp b/dnn/test/fallback/repeat.cpp index 81525452..ad62fc5a 100644 --- a/dnn/test/fallback/repeat.cpp +++ b/dnn/test/fallback/repeat.cpp @@ -11,8 +11,8 @@ #include "test/fallback/fixture.h" #include "test/common/checker.h" +#include "test/common/task_record_check.h" #include "test/common/tile_repeat.h" - namespace megdnn { namespace test { @@ -23,6 +23,13 @@ TEST_F(FALLBACK, REPEAT) { checker.set_param(arg.param()).execs({arg.src, {}}); } } +TEST_F(FALLBACK, REPEAT_RECORD) { + TaskRecordChecker checker(1); + auto args = tile_repeat::get_args(); + for (auto&& arg : args) { + checker.set_param(arg.param()).execs({arg.src, {}}); + } +} } // namespace test } // namespace megdnn diff --git a/dnn/test/fallback/resize.cpp b/dnn/test/fallback/resize.cpp index c65fc1a0..f77d410f 100644 --- a/dnn/test/fallback/resize.cpp +++ b/dnn/test/fallback/resize.cpp @@ -10,8 +10,8 @@ */ #include "test/common/resize.h" #include 
"test/common/checker.h" +#include "test/common/task_record_check.h" #include "test/fallback/fixture.h" - namespace megdnn { namespace test { @@ -35,6 +35,26 @@ TEST_F(FALLBACK, RESIZE_CV) { .execs({arg.src, arg.dst}); } } +TEST_F(FALLBACK, RESIZE_CV_RECORD) { + using namespace resize; + std::vector args = get_cv_args(); + TaskRecordChecker checker(1); + + for (auto&& arg : args) { + checker.set_param(arg.param) + .set_dtype(0, dtype::Uint8()) + .set_dtype(1, dtype::Uint8()) + .set_epsilon(1 + 1e-3) + .execs({arg.src, arg.dst}); + } + + for (auto&& arg : args) { + checker.set_param(arg.param) + .set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .execs({arg.src, arg.dst}); + } +} TEST_F(FALLBACK, RESIZE) { using namespace resize; @@ -56,6 +76,26 @@ TEST_F(FALLBACK, RESIZE) { .execs({arg.src, arg.dst}); } } +TEST_F(FALLBACK, RESIZE_RECORD) { + using namespace resize; + std::vector args = get_args(); + TaskRecordChecker checker(1); + + for (auto&& arg : args) { + checker.set_param(arg.param) + .set_dtype(0, dtype::Uint8()) + .set_dtype(1, dtype::Uint8()) + .set_epsilon(1 + 1e-3) + .execs({arg.src, arg.dst}); + } + + for (auto&& arg : args) { + checker.set_param(arg.param) + .set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .execs({arg.src, arg.dst}); + } +} TEST_F(FALLBACK, RESIZE_NCHW_WITH_STRIDE) { param::Resize param; @@ -78,6 +118,27 @@ TEST_F(FALLBACK, RESIZE_NCHW_WITH_STRIDE) { run({2, 3, 4, 4}, {-256, -32, -8, -1}, {2, 3, 3, 3}, dtype); } } +TEST_F(FALLBACK, RESIZE_NCHW_WITH_STRIDE_RECORD) { + param::Resize param; + param.format = param::Resize::Format::NCHW; + param.imode = param::Resize::InterpolationMode::LINEAR; + TaskRecordChecker checker(1); + checker.set_epsilon(1 + 1e-3).set_param(param); + + auto run = [&](TensorShape src_shape, std::vector src_layout, + TensorShape dst_shape, DType dtype) { + checker.set_dtype(0, dtype).set_dtype(1, dtype).execl( + {{src_shape, src_layout, dtype}, {dst_shape, dtype}}); + }; + + for (DType& dtype : std::vector{dtype::Float32(), dtype::Uint8()}) { + run({2, 3, 4, 4}, {256, 32, 8, 1}, {2, 3, 3, 3}, dtype); + run({1, 3, 4, 3}, {105, 35, 7, 2}, {1, 3, 5, 5}, dtype); + run({2, 3, 4, 4}, {-256, 32, -8, 1}, {2, 3, 3, 3}, dtype); + run({2, 3, 4, 4}, {256, -32, 8, -1}, {2, 3, 3, 3}, dtype); + run({2, 3, 4, 4}, {-256, -32, -8, -1}, {2, 3, 3, 3}, dtype); + } +} TEST_F(FALLBACK, RESIZE_NCHW4) { using namespace resize; @@ -92,6 +153,19 @@ TEST_F(FALLBACK, RESIZE_NCHW4) { .execs({arg.src, arg.dst}); } } +TEST_F(FALLBACK, RESIZE_NCHW4_RECORD) { + using namespace resize; + auto args = get_nchw4_args(); + TaskRecordChecker checker(1); + + for (auto&& arg : args) { + checker.set_param(arg.param) + .set_dtype(0, dtype::QuantizedS8(1.0f)) + .set_dtype(1, dtype::QuantizedS8(1.0f)) + .set_epsilon(1 + 1e-3) + .execs({arg.src, arg.dst}); + } +} } // namespace test } // namespace megdnn diff --git a/dnn/test/fallback/roi_copy.cpp b/dnn/test/fallback/roi_copy.cpp index 994a2dda..37f98901 100644 --- a/dnn/test/fallback/roi_copy.cpp +++ b/dnn/test/fallback/roi_copy.cpp @@ -16,9 +16,9 @@ #include "test/common/benchmarker.h" #include "test/common/checker.h" #include "test/common/roi_copy.h" +#include "test/common/task_record_check.h" #include "test/common/tensor.h" #include "test/fallback/fixture.h" - namespace megdnn { namespace test { @@ -33,6 +33,17 @@ TEST_F(FALLBACK, ROICOPY) { checker.set_param(arg.param).execs({arg.src, {}}); } } +TEST_F(FALLBACK, ROICOPY_RECORD) { + using namespace roi_copy; + std::vector args = get_args(); + 
TaskRecordChecker checker(1); + checker.set_dtype(0, dtype::Int32()); + checker.set_dtype(1, dtype::Int32()); + + for (auto&& arg : args) { + checker.set_param(arg.param).execs({arg.src, {}}); + } +} #if MEGDNN_WITH_BENCHMARK TEST_F(FALLBACK, BENCHMARK_ROICOPY) { auto run = [&](const TensorShapeArray& shapes) { diff --git a/dnn/test/fallback/rotate.cpp b/dnn/test/fallback/rotate.cpp index 3aae9295..fc34aeac 100644 --- a/dnn/test/fallback/rotate.cpp +++ b/dnn/test/fallback/rotate.cpp @@ -15,9 +15,9 @@ #include "megdnn/oprs.h" #include "test/common/checker.h" #include "test/common/rotate.h" +#include "test/common/task_record_check.h" #include "test/common/tensor.h" #include "test/fallback/fixture.h" - namespace megdnn { namespace test { @@ -33,6 +33,18 @@ TEST_F(FALLBACK, ROTATE) { } } +TEST_F(FALLBACK, ROTATE_RECORD) { + using namespace rotate; + std::vector args = get_args(); + TaskRecordChecker checker(1); + checker.set_dtype(0, dtype::Int32()); + checker.set_dtype(1, dtype::Int32()); + + for (auto&& arg : args) { + checker.set_dtype(0, arg.dtype).set_dtype(1, arg.dtype).execs({arg.src, {}}); + } +} + } // namespace test } // namespace megdnn diff --git a/dnn/test/fallback/split.cpp b/dnn/test/fallback/split.cpp index 0c82d701..efac9c2e 100644 --- a/dnn/test/fallback/split.cpp +++ b/dnn/test/fallback/split.cpp @@ -11,7 +11,7 @@ #include "test/fallback/fixture.h" #include "test/common/checker.h" - +#include "test/common/task_record_check.h" namespace megdnn { namespace test { @@ -29,6 +29,20 @@ TEST_F(FALLBACK, SPLIT) { checker.set_param(param).exec(shapes); } } +TEST_F(FALLBACK, SPLIT_RECORD) { + TaskRecordChecker checker(1); + using Param = Split::Param; + for (size_t axis = 0; axis < 4; ++axis) { + Param param; + param.axis = axis; + TensorShapeArray shapes(5, TensorShape({2, 3, 4, 5})); + shapes[0].shape[axis] = 10; + for (size_t i = 1; i < 5; ++i) { + shapes[i].shape[axis] = i; + } + checker.set_param(param).exec(shapes); + } +} } // namespace test } // namespace megdnn diff --git a/dnn/test/fallback/tile.cpp b/dnn/test/fallback/tile.cpp index 2045e0cb..0c63966b 100644 --- a/dnn/test/fallback/tile.cpp +++ b/dnn/test/fallback/tile.cpp @@ -11,8 +11,8 @@ #include "test/fallback/fixture.h" #include "test/common/checker.h" +#include "test/common/task_record_check.h" #include "test/common/tile_repeat.h" - namespace megdnn { namespace test { @@ -23,6 +23,13 @@ TEST_F(FALLBACK, TILE) { checker.set_param(arg.param()).execs({arg.src, {}}); } } +TEST_F(FALLBACK, TILE_RECORD) { + TaskRecordChecker checker(1); + auto args = tile_repeat::get_args(); + for (auto&& arg : args) { + checker.set_param(arg.param()).execs({arg.src, {}}); + } +} } // namespace test } // namespace megdnn diff --git a/dnn/test/fallback/type_cvt.cpp b/dnn/test/fallback/type_cvt.cpp index 217f3244..67084dad 100644 --- a/dnn/test/fallback/type_cvt.cpp +++ b/dnn/test/fallback/type_cvt.cpp @@ -11,8 +11,8 @@ #include "test/common/benchmarker.h" #include "test/common/checker.h" +#include "test/common/task_record_check.h" #include "test/fallback/fixture.h" - namespace megdnn { namespace test { @@ -41,6 +41,31 @@ TEST_F(FALLBACK, TYPE_CVT) { } } +TEST_F(FALLBACK, TYPE_CVT_RECORD) { + TaskRecordChecker checker(1); + NormalRNG rng(128); + checker.set_rng(0, &rng); + + std::vector dtypes = { + dtype::Float32(), + dtype::Float16(), + dtype::Int32(), + dtype::Int16(), + dtype::Int8(), + dtype::Uint8(), + dtype::QuantizedS8(0.5f), + dtype::QuantizedS32(0.5f), + dtype::Quantized8Asymm(2.0f, static_cast(3))}; + + for (size_t size : {1, 7,
15, 33}) { + for (auto sdtype : dtypes) + for (auto ddtype : dtypes) { + checker.set_dtype(0, sdtype).set_dtype(1, ddtype).execs( + {{size}, {size}}); + } + } +} + #if MEGDNN_WITH_BENCHMARK TEST_F(FALLBACK, BENCHMARK_TYPE_CVT) { auto handle_naive = create_cpu_handle(2); diff --git a/dnn/test/fallback/warp_perspective.cpp b/dnn/test/fallback/warp_perspective.cpp index 96e7267d..34c4c813 100644 --- a/dnn/test/fallback/warp_perspective.cpp +++ b/dnn/test/fallback/warp_perspective.cpp @@ -13,8 +13,8 @@ #include "test/common/checker.h" #include "test/common/random_state.h" #include "test/common/rng.h" +#include "test/common/task_record_check.h" #include "test/common/warp_perspective.h" - namespace megdnn { namespace test { @@ -92,6 +92,80 @@ TEST_F(FALLBACK, WARP_PERSPECTIVE) { #endif } +TEST_F(FALLBACK, WARP_PERSPECTIVE_RECORD) { + TaskRecordChecker checker(1); + param::WarpPerspective param; + class ResizeMatRNG : public RNG { + void gen(const TensorND& tensor_) override { + auto& gen = RandomState::generator(); + std::uniform_real_distribution pdist3(1.9f, 3.1f); + std::uniform_real_distribution pdist(0.9f, 1.1f); + std::uniform_real_distribution pdisth(0.4f, 0.6f); + std::uniform_real_distribution ndist(-1.1f, -0.9f); + std::uniform_real_distribution ndist3(-3.1f, -1.9f); + std::uniform_real_distribution ndisth(-0.6f, -0.4f); + std::uniform_int_distribution dice(0, 5); + float* ptr = tensor_.ptr(); + auto N = tensor_.layout.shape[0]; + for (size_t n = 0; n < N; ++n) { + for (size_t i = 0; i < 9; ++i) { + switch (dice(gen)) { + case 0: + ptr[i] = pdist3(gen); + break; + case 1: + ptr[i] = pdist(gen); + break; + case 2: + ptr[i] = pdisth(gen); + break; + case 3: + ptr[i] = ndist(gen); + break; + case 4: + ptr[i] = ndist3(gen); + break; + case 5: + ptr[i] = ndisth(gen); + break; + } + } + // is resize? + if (n & 1) { + ptr[1] = 0; + ptr[3] = 0; + ptr[6] = ptr[7] = 0; + } + ptr += 9; + } + } + } rng; + checker.set_rng(1, &rng); + using BMode = param::WarpPerspective::BorderMode; + param.imode = param::WarpPerspective::InterpolationMode::LINEAR; + // for (auto mode : + // {BMode::REFLECT_101, BMode::REPLICATE, BMode::REFLECT, BMode::WRAP, + // BMode::CONSTANT}) { + param.bmode = BMode::REFLECT_101; + param.border_val = 1.737; + checker.set_param(param); + checker.exec({{1, 2, 10, 11}, {1, 3, 3}, {1, 2, 12, 13}}); + // } +#if MEGDNN_TEST_ASAN +//! 
asan detect nan will make test failed +#else + // resize nan case + UniformFloatRNG rng_zero(0, 0); + checker.set_rng(1, &rng_zero); + { + param.bmode = BMode::CONSTANT; + param.border_val = 1.737; + checker.set_param(param); + checker.exec({{1000, 2, 10, 11}, {1000, 3, 3}, {1000, 2, 12, 13}}); + } +#endif +} + TEST_F(FALLBACK, WARP_PERSPECTIVE_MAT_IDX) { warp_perspective::run_mat_idx_test(handle()); } @@ -100,6 +174,10 @@ TEST_F(FALLBACK, WARP_PERSPECTIFVE_NCHW_INT8) { warp_perspective::run_int8_test(handle()); } +TEST_F(FALLBACK, WARP_PERSPECTIFVE_NCHW_INT8_RECORD) { + warp_perspective::run_int8_test_record(1); +} + TEST_F(FALLBACK, WARP_PERSPECTIFVE_NCHW_QUINT8) { warp_perspective::run_quint8_test(handle()); } diff --git a/dnn/test/naive/conv_bias.cpp b/dnn/test/naive/conv_bias.cpp index 5bde5775..eb7f517a 100644 --- a/dnn/test/naive/conv_bias.cpp +++ b/dnn/test/naive/conv_bias.cpp @@ -22,10 +22,10 @@ namespace { class TensorWrapper { public: TensorWrapper(Handle* handle, TensorLayout layout) : m_handle(handle) { - m_tensornd.raw_ptr = megdnn_malloc(m_handle, layout.span().dist_byte()); - m_tensornd.layout = layout; + auto raw_ptr = megdnn_malloc(m_handle, layout.span().dist_byte()); + m_tensornd = TensorND{raw_ptr, layout}; } - ~TensorWrapper() { megdnn_free(m_handle, m_tensornd.raw_ptr); } + ~TensorWrapper() { megdnn_free(m_handle, m_tensornd.raw_ptr()); } TensorND tensornd() const { return m_tensornd; } @@ -86,9 +86,9 @@ TEST_F(NAIVE, CONV_BIAS_QUANTIZED4x4x32) { const std::vector& values) { TensorND tensor; tensor.layout = {shape, dtype}; - tensor.raw_ptr = - static_cast(malloc(tensor.layout.span().dist_byte())); - uint8_t* ptr = static_cast(tensor.raw_ptr); + tensor.reset_ptr( + static_cast(malloc(tensor.layout.span().dist_byte()))); + uint8_t* ptr = static_cast(tensor.raw_ptr()); megdnn_assert(values.size() == tensor.layout.span().dist_elem()); for (size_t i = 0; i < tensor.layout.span().dist_elem(); i += 2) { int val0 = values[i], val1 = values[i + 1]; @@ -670,10 +670,9 @@ TEST_F(NAIVE, CONV_BIAS_QUANTIZED4) { auto GenTensorValue = [](const TensorShape& shape, dtype::QuantizedS32 dtype, std::vector values) { - TensorND tensor; - tensor.layout = {shape, dtype}; - tensor.raw_ptr = - static_cast(malloc(tensor.layout.span().dist_byte())); + TensorLayout layout = {shape, dtype}; + auto raw_ptr = static_cast(malloc(layout.span().dist_byte())); + TensorND tensor{raw_ptr, layout}; megdnn_assert( values.size() == tensor.layout.total_nr_elems(), "%zu == %zu", values.size(), tensor.layout.total_nr_elems()); @@ -3015,10 +3014,9 @@ TEST_F(NAIVE, CONV_BIAS_NCHW64_Q4) { auto GenTensorValue = [](const TensorShape& shape, dtype::QuantizedS32 dtype, std::vector values) { - TensorND tensor; - tensor.layout = {shape, dtype}; - tensor.raw_ptr = - static_cast(malloc(tensor.layout.span().dist_byte())); + TensorLayout layout = {shape, dtype}; + auto raw_ptr = static_cast(malloc(layout.span().dist_byte())); + TensorND tensor{raw_ptr, layout}; megdnn_assert( values.size() == tensor.layout.total_nr_elems(), "%zu == %zu", values.size(), tensor.layout.total_nr_elems()); diff --git a/dnn/test/naive/convolution.cpp b/dnn/test/naive/convolution.cpp index 572a1e36..2798418c 100644 --- a/dnn/test/naive/convolution.cpp +++ b/dnn/test/naive/convolution.cpp @@ -189,7 +189,7 @@ TEST_F(NAIVE, CONVOLUTION_WITH_NCHW4) { TensorNDArray nchw4_tensors; for (size_t i = 0; i < tensors.size(); ++i) { auto layout = convert_true_format(nchw_tensors[i].layout); - nchw4_tensors.emplace_back(tensors[i].raw_ptr, std::move(layout)); 
+ nchw4_tensors.emplace_back(tensors[i].raw_ptr(), std::move(layout)); } auto workspace_size = conv->get_workspace_in_bytes( @@ -208,7 +208,7 @@ TEST_F(NAIVE, CONVOLUTION_WITH_NCHW4) { free(workspace_ptr); for (auto&& tensor : nchw_tensors) { - free(tensor.raw_ptr); + free(tensor.raw_ptr()); } }; diff --git a/dnn/test/naive/elemwise_multi_type.cpp b/dnn/test/naive/elemwise_multi_type.cpp index 614fd4e7..423efdac 100644 --- a/dnn/test/naive/elemwise_multi_type.cpp +++ b/dnn/test/naive/elemwise_multi_type.cpp @@ -116,7 +116,7 @@ TEST_F(NAIVE, ELEMWISE_QUANTIZED_MODE_UNARY) { typecvt->exec(float_tensors[1], tensors[1]); for (auto&& tensor : float_tensors) { - free(tensor.raw_ptr); + free(tensor.raw_ptr()); } }; @@ -208,7 +208,7 @@ TEST_F(NAIVE, ELEMWISE_QUANTIZED_MODE_BINARY) { typecvt->exec(float_tensors[2], tensors[2]); for (auto&& tensor : float_tensors) { - free(tensor.raw_ptr); + free(tensor.raw_ptr()); } }; @@ -268,7 +268,7 @@ TEST_F(NAIVE, ELEMWISE_QUANTIZED_MODE_TERNARY) { typecvt->exec(float_tensors[3], tensors[3]); for (auto&& tensor : float_tensors) { - free(tensor.raw_ptr); + free(tensor.raw_ptr()); } }; diff --git a/dnn/test/naive/matrix_mul.cpp b/dnn/test/naive/matrix_mul.cpp index e5cc1d8d..df8d41b8 100644 --- a/dnn/test/naive/matrix_mul.cpp +++ b/dnn/test/naive/matrix_mul.cpp @@ -105,7 +105,7 @@ void run_matmul_mk_format( TensorNDArray default_tensors, mk4_tensors; for (size_t i = 0; i < 3; i++) { default_tensors.emplace_back(wb.get(i), default_layouts[i]); - mk4_tensors.emplace_back(tensors[i].raw_ptr, mk4_layouts[i]); + mk4_tensors.emplace_back(tensors[i].raw_ptr(), mk4_layouts[i]); } relayout_opr->exec(mk4_tensors[0], default_tensors[0]); relayout_opr->exec(mk4_tensors[1], default_tensors[1]); @@ -158,9 +158,9 @@ TEST_F(NAIVE, MATRIX_MUL_QUANTIZED4x4x32) { const std::vector& values) { TensorND tensor; tensor.layout = {shape, dtype}; - tensor.raw_ptr = - static_cast(malloc(tensor.layout.span().dist_byte())); - uint8_t* ptr = static_cast(tensor.raw_ptr); + tensor.reset_ptr( + static_cast(malloc(tensor.layout.span().dist_byte()))); + uint8_t* ptr = static_cast(tensor.raw_ptr()); megdnn_assert(values.size() == tensor.layout.span().dist_elem()); for (size_t i = 0; i < tensor.layout.span().dist_elem(); i += 2) { int val0 = values[i], val1 = values[i + 1]; @@ -209,9 +209,9 @@ TEST_F(NAIVE, MATRIX_MUL_QUANTIZEDS4_4x4x16) { const std::vector& values) { TensorND tensor; tensor.layout = {shape, dtype}; - tensor.raw_ptr = - static_cast(malloc(tensor.layout.span().dist_byte())); - uint8_t* ptr = static_cast(tensor.raw_ptr); + tensor.reset_ptr( + static_cast(malloc(tensor.layout.span().dist_byte()))); + uint8_t* ptr = static_cast(tensor.raw_ptr()); megdnn_assert(values.size() == tensor.layout.span().dist_elem()); for (size_t i = 0; i < tensor.layout.span().dist_elem(); i += 2) { int val0 = values[i], val1 = values[i + 1]; diff --git a/dnn/test/naive/mesh_indexing.cpp b/dnn/test/naive/mesh_indexing.cpp index b1f5f6ce..7e39c351 100644 --- a/dnn/test/naive/mesh_indexing.cpp +++ b/dnn/test/naive/mesh_indexing.cpp @@ -67,7 +67,7 @@ TEST_F(NAIVE, BATCHED_MESH_INDEXING) { if (i < 2) { layout.add_axis_cont_inplace(0); } - void* ptr = static_cast(tensor.raw_ptr) + + void* ptr = static_cast(tensor.raw_ptr()) + tensor.layout.stride[0] * n * tensor.layout.dtype.size(); new_tensors.emplace_back(ptr, layout); } @@ -161,7 +161,7 @@ TEST_F(NAIVE, BATCHED_MESH_MODIFY_INCREMENT) { if (i < 2) { layout.add_axis_cont_inplace(0); } - void* ptr = static_cast(tensor.raw_ptr) + + void* ptr = 
static_cast(tensor.raw_ptr()) + tensor.layout.dtype.size(tensor.layout.stride[0] * n); new_tensors.emplace_back(ptr, layout); } diff --git a/dnn/test/naive/record1.cpp b/dnn/test/naive/record1.cpp new file mode 100644 index 00000000..33f7d1b0 --- /dev/null +++ b/dnn/test/naive/record1.cpp @@ -0,0 +1,1315 @@ +/** + * \file test/naive/record1.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#include "test/naive/fixture.h" + +#include "megdnn/oprs.h" +#include "test/common/task_record_check.h" + +#include "test/common/adaptive_pooling.h" +#include "test/common/cond_take.h" +#include "test/common/convolution3d.h" +#include "test/common/local.h" +#include "test/common/matrix_mul.h" +#include "test/common/rng.h" +#include "test/common/separable_conv.h" +#include "test/common/warp_affine.h" +#include "test/common/warp_perspective.h" + +namespace { +using namespace megdnn; +using namespace test; + +class ArgmxxRNG final : public RNG { +public: + void gen(const TensorND& tensor) override { + auto offset = tensor.layout.span().low_elem; + auto nr_elems = tensor.layout.span().dist_elem(); + +#define cb(DType) \ + if (tensor.layout.dtype == DType()) { \ + using ctype = typename DTypeTrait::ctype; \ + auto ptr = tensor.ptr(); \ + for (size_t i = 0; i < nr_elems; ++i) { \ + ptr[offset + i] = i; \ + } \ + COMPAT_RANDOM(ptr + offset, ptr + offset + nr_elems); \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE(cb); +#undef cb + } +}; + +template +void test_argmxx() { + TaskRecordChecker checker(2); + checker.set_dtype(1, dtype::Int32()); + using Param = typename Argmxx::Param; + ArgmxxRNG rng; + checker.set_rng(0, &rng); + for (size_t axis = 0; axis < 4; ++axis) { + Param param; + param.axis = axis; + checker.set_param(param) + .set_dtype(0, dtype::Float32()) + .execs({{2, 3, 4, 5}, {}}); + checker.set_param(param) + .set_dtype(0, dtype::Float16()) + .execs({{2, 3, 4, 5}, {}}); + checker.set_param(param).set_dtype(0, dtype::Int32()).execs({{2, 3, 4, 5}, {}}); + checker.set_param(param).set_dtype(0, dtype::Int16()).execs({{2, 3, 4, 5}, {}}); + checker.set_param(param).set_dtype(0, dtype::Int8()).execs({{2, 3, 4, 5}, {}}); + checker.set_param(param).set_dtype(0, dtype::Uint8()).execs({{2, 3, 4, 5}, {}}); + } + checker.set_dtype(0, dtype::Float32()); + Param param; + param.axis = 1; + checker.set_param(param); + // 1-step + checker.execs({{2, 64, 32}, {}}); + // 2-step + checker.execs({{2, 192, 32}, {}}); + // 3-step + checker.execs({{2, 4333, 32}, {}}); + // single reduce + checker.execs({{2, 1, 1}, {}}); + checker.execs({{2, 1 + 1, 1}, {}}); + checker.execs({{2, 2048 + 1, 1}, {}}); + checker.execs({{2, 2048 * 2048 + 1, 1}, {}}); + checker.execs({{2, 1 + 1, 31}, {}}); + checker.execs({{2, 16 + 1, 31}, {}}); + checker.execs({{2, 16 * 16 + 1, 31}, {}}); + checker.execs({{2, 16 * 16 * 16 + 1, 31}, {}}); + checker.execs({{2, 16 * 16 * 16 * 16 + 1, 31}, {}}); + checker.execs({{3, 256 * 256 + 1, 2}, {}}); + checker.execs({{3, 128 * 128 + 1, 3}, {}}); + checker.execs({{3, 64 * 64 + 1, 7}, {}}); + checker.execs({{3, 32 * 32 + 1, 15}, {}}); + checker.execs({{3, 512, 500}, {}}); + // very large reduce + checker.execs({{1, 4194304, 1}, {}}); +} + +class ArgsortRNG final : public RNG { + bool m_rev_order 
= false; + DType m_dtype; + + template + void fill(T* ptr, int n) { + if (m_rev_order) { + for (int i = 0; i < n; ++i) + ptr[i] = static_cast(n / 2 - i); + } else { + for (int i = 0; i < n; ++i) + ptr[i] = static_cast(i - n / 2); + COMPAT_RANDOM(ptr, ptr + n); + } + } + + void gen(const TensorND& tensor) override { + auto n = tensor.layout.total_nr_elems(); + if (m_dtype == dtype::Float32{}) { + fill(tensor.ptr(), n); + } else { + megdnn_assert(m_dtype == dtype::Int32{}); + fill(tensor.ptr(), n); + } + } + +public: + ArgsortRNG(DType dt) : m_dtype{dt} {} + + void set_rev_order(bool flag) { m_rev_order = flag; } +}; + +void run_forward_test(DType dtype) { + TaskRecordChecker checker(2); + using Param = Argsort::Param; + using Order = Param::Order; + ArgsortRNG rng{dtype}; + checker.set_dtype(2, dtype::Int32()); + checker.set_dtype(0, dtype).set_rng(0, &rng); + for (size_t i = 3; i < 10240; i *= 2) { + Param param; + + param.order = Order::ASCENDING; + checker.set_param(param).execs({{3, i + 1}, {}, {}}); + param.order = Order::DESCENDING; + checker.set_param(param).execs({{3, i - 1}, {}, {}}); + checker.set_param(param).execs({{13, i + 3}, {}, {}}); + } + { + // reverse sort large array + constexpr size_t N = 200003; + rng.set_rev_order(true); + Param param; + param.order = Order::ASCENDING; + checker.set_param(param).execs({{1, N}, {}, {}}); + } +} + +class IdxRng final : public RNG { + void gen(const TensorND& tensor) override { + auto ptr = tensor.ptr(); + auto m = tensor.layout[0], n = tensor.layout[1]; + for (size_t i = 0; i < m; ++i) { + for (size_t j = 0; j < n; ++j) { + ptr[j] = j; + } + COMPAT_RANDOM(ptr, ptr + n); + ptr += n; + } + } +}; + +void run_backward_test(DType dtype) { + IdxRng rng; + TaskRecordChecker checker(2); + checker.set_dtype(1, dtype::Int32()).set_rng(1, &rng); + checker.set_dtype(0, dtype); + checker.set_dtype(2, dtype); + for (size_t i = 16; i < 4096; i *= 2) { + checker.execs({{3, i}, {3, i}, {3, i}}); + checker.execs({{3, i + 3}, {3, i + 3}, {3, i + 3}}); + checker.execs({{3, i + 3}, {3, i + 3}, {3, i + 7}}); + } +} + +} // anonymous namespace + +namespace megdnn { +namespace test { + +//! adaptive pooling +TEST_F(NAIVE, ADAPTIVE_POOLING_FORWARD_RECORD) { + TaskRecordChecker checker(2); + auto args = adaptive_pooling::get_args(); + using Format = param::AdaptivePooling::Format; + DType dtype = dtype::Float32(); + for (auto&& arg : args) { + auto param = arg.param; + auto src = arg.ishape; + auto dst = arg.oshape; + param.format = Format::NCHW; + checker.set_epsilon(1e-2); + checker.set_param(param).set_dtype(0, dtype).set_dtype(1, dtype).exec( + TensorShapeArray{src, dst, {}}); + break; + } +} + +TEST_F(NAIVE, ADAPTIVE_POOLING_BACKWARD_RECORD) { + TaskRecordChecker checker(2); + auto args = adaptive_pooling::get_args(); + for (auto&& arg : args) { + TensorLayout ilayout = TensorLayout(arg.ishape, dtype::Float32()); + TensorLayout olayout = TensorLayout(arg.oshape, dtype::Float32()); + DType dtype = dtype::Float32(); + checker.set_dtype(0, dtype) + .set_dtype(1, dtype) + .set_dtype(2, dtype) + .set_dtype(3, dtype) + .set_param(arg.param) + .exec(TensorShapeArray{ilayout, olayout, olayout, ilayout}); + break; + } +} + +//! add update +TEST_F(NAIVE, ADD_UPDATE_RECORD) { + TaskRecordChecker checker(2); + param::AddUpdate p{2, -1, 3}; + checker.set_param(p) + .set_dtype(0, dtype::BFloat16()) + .set_dtype(1, dtype::BFloat16()) + .execs({{2, 2, 3}, {2, 2, 3}}); +} + +//! argxx +TEST_F(NAIVE, ARGXX_RECORD) { + test_argmxx(); + test_argmxx(); +} + +//! 
argsort +TEST_F(NAIVE, ARGSORT_FORWARD_RECORD) { + run_forward_test(dtype::Float32{}); + run_forward_test(dtype::Int32{}); +} + +TEST_F(NAIVE, ARGSORT_BACKWARD_RECORD) { + run_backward_test(dtype::Float32{}); + run_backward_test(dtype::Int32{}); +} + +TEST_F(NAIVE, BATCH_CONV_BIAS_QS8_RECORD) { + TaskRecordChecker checker(2); + UniformIntRNG const_rng{1, 1}; + UniformIntRNG rng{-5, 5}; + UniformIntRNG bias_rng{-50, 50}; + checker.set_rng(0, &rng) + .set_rng(1, &rng) + .set_rng(2, &rng) + .set_rng(3, &rng) + .set_dtype(0, dtype::QuantizedS8{1.2f}) + .set_dtype(1, dtype::QuantizedS8{1.3f}) + .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f}) + .set_dtype(3, dtype::QuantizedS8{1.1f}) + .set_dtype(4, dtype::QuantizedS8{1.1f}) + .set_epsilon(1 + 1e-3); + param::BatchConvBias param; + param.pad_h = 2, param.pad_w = 1; + param.stride_h = 1, param.stride_w = 2; + param.format = param::BatchConvBias::Format::NCHW4; + checker.set_param(param).execs( + {{32, 4, 24, 24, 4}, {32, 32, 4, 1, 1, 4}, {1, 8, 1, 1, 4}, {}, {}}); +} + +//! batched_matmul +TEST_F(NAIVE, BATCH_MAT_MUL_RECORD) { + TaskRecordChecker checker(2); + using TestArg = matrix_mul::TestArg; + //! return expect if stride == -1, stride otherwise + auto stride_val = [](size_t stride, size_t expect) -> size_t { + if (stride == TestArg::UNSET_STRIDE_VAL) { + return expect; + } else { + return stride; + } + }; + + using Param = MatrixMul::Param; + std::vector args; + args = matrix_mul::get_batched_matmul_args(); + + for (auto& arg : args) { + if (arg.b == 1) { + continue; + } + size_t m = arg.m, n = arg.n, k = arg.k; + + Param param; + param.transposeA = arg.mask & 0x1; + param.transposeB = arg.mask & 0x2; + size_t A0 = m, A1 = k, B0 = k, B1 = n; + TensorShape A, B; + if (param.transposeA) { + std::swap(A0, A1); + } + if (param.transposeB) { + std::swap(B0, B1); + } + ptrdiff_t A_stride = arg.A_stride, B_stride = arg.B_stride, + C_stride = arg.C_stride, A_batch_stride = arg.A_batch_stride, + B_batch_stride = arg.B_batch_stride, + C_batch_stride = arg.C_batch_stride; + A_stride = stride_val(A_stride, A1); + B_stride = stride_val(B_stride, B1); + C_stride = stride_val(C_stride, n); + A_batch_stride = stride_val(A_batch_stride, A0 * A_stride); + B_batch_stride = stride_val(B_batch_stride, B0 * B_stride); + C_batch_stride = stride_val(C_batch_stride, m * C_stride); + + checker.set_param(param); + checker.execl( + {TensorLayout{ + {arg.b, A0, A1}, + {A_batch_stride, A_stride, 1}, + dtype::Float32()}, + TensorLayout{ + {arg.b, B0, B1}, + {B_batch_stride, B_stride, 1}, + dtype::Float32()}, + TensorLayout{ + {arg.b, m, n}, + {C_batch_stride, C_stride, 1}, + dtype::Float32()}}); + break; + } +} + +//! BN +TEST_F(NAIVE, BN_FORWARD_RECORD) { + TaskRecordChecker checker(2); + checker.set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .set_dtype(2, dtype::Float32()) + .set_epsilon(1e-3); + + param::BN param; + param.fwd_mode = param::BN::FwdMode::TRAINING; + param.param_dim = param::BN::ParamDim::DIM_1C11; + param.epsilon = 1e-3; + + for (size_t n : {1, 2}) { + for (size_t c : {1, 128}) { + for (size_t i : {2, 14}) { + for (float f : {0.5, 1.0}) { + param.avg_factor = f; + checker.set_param(param); + TensorShape src{n, c, i, i}; + TensorShape inp{1, c, 1, 1}; + checker.execs( + {src, //! src -> input + inp, //! bn_scale -> input + inp, //! bn_bias -> input + inp, //! mean -> output + inp, //! variance -> output + inp, //! batch_mean -> output + inp, //! batch_inv_variance -> output + {}, //! 
reserve -> output + {}}); + } + } + } + } + + UniformFloatRNG rng(1.0f, 2.0f); + checker.set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .set_dtype(2, dtype::Float32()) + .set_dtype(3, dtype::Float32()) + .set_dtype(4, dtype::Float32()) + .set_rng(3, &rng) + .set_rng(4, &rng) + .set_epsilon(1e-3); + + param.fwd_mode = param::BN::FwdMode::INFERENCE; + param.param_dim = param::BN::ParamDim::DIM_1C11; + param.epsilon = 1e-3; + checker.set_param(param); + + for (size_t n : {1, 2}) { + for (size_t c : {1, 128}) { + for (size_t i : {2, 14}) { + TensorShape src{n, c, i, i}; + TensorShape inp{1, c, 1, 1}; + checker.exec({ + src, //! src -> input + inp, //! bn_scale -> input + inp, //! bn_bias -> input + inp, //! mean -> input + inp, //! variance -> input + {}, //! batch_mean -> output[unused] + {}, //! batch_inv_variance -> output[unused] + {}, //! reserve -> output + {} //! dst -> output[shape got by + //! deduced] + }); + } + } + } +} + +TEST_F(NAIVE, BN_BACKWARD_RECORD) { + TaskRecordChecker checker(2); + UniformFloatRNG rng(1.0f, 2.0f); + checker.set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .set_dtype(2, dtype::Float32()) + .set_dtype(3, dtype::Float32()) + .set_dtype(4, dtype::Float32()) + .set_rng(3, &rng); + + param::BN param; + param.fwd_mode = param::BN::FwdMode::TRAINING; + param.epsilon = 0.0f; + checker.set_param(param); + + for (size_t n : {1, 2}) { + for (size_t c : {3, 128}) { + for (size_t i : {2, 14}) { + TensorShape src{n, c, i, i}; + TensorShape inp{1, c, 1, 1}; + checker.exec({ + src, //! x -> input + src, //! dy -> input + inp, //! bn_mean -> input + inp, //! bn_ivar -> input + inp, //! bn_scale -> input + {}, //! reserve -> input + inp, //! d_bn_scale -> output + inp, //! d_bn_bias -> output + src //! dx -> output + }); + } + } + } +} + +//! concat +TEST_F(NAIVE, CONCAT_RECORD) { + TaskRecordChecker checker(2); + using Param = Concat::Param; + for (auto dtype : std::vector{dtype::Float32(), dtype::Float16()}) + for (size_t axis = 0; axis < 4; ++axis) { + Param param; + param.axis = axis; + TensorShapeArray shapes(4, TensorShape({2, 3, 4, 5})); + for (size_t i = 0; i < 4; ++i) { + shapes[i].shape[axis] = i + 1; + } + shapes.emplace_back(); + for (size_t i = 0; i < shapes.size(); ++i) + checker.set_dtype(i, dtype); + checker.set_param(param).execs(shapes); + } +} + +//! ConvBias +TEST_F(NAIVE, CONV_BIAS_RECORD) { + TaskRecordChecker checker(2); + ConvBias::Param param; + param.format = ConvBias::Param::Format::NCHW; + checker.set_dtype(0, dtype::QuantizedS8(0.1f)) + .set_dtype(1, dtype::QuantizedS8(0.2f)) + .set_dtype(2, dtype::QuantizedS32(0.02f)) + .set_dtype(3, dtype::QuantizedS32(0.3f)) + .set_dtype(4, dtype::QuantizedS32(0.02f)); + checker.set_param(param).execs( + {{1, 1, 4, 4}, {3, 1, 3, 3}, {1, 3, 1, 1}, {1, 3, 2, 2}, {}}); +} + +//! Convolution +TEST_F(NAIVE, CONV_RECORD) { + TaskRecordChecker checker(2); + Convolution::Param param; + param.format = Convolution::Param::Format::NCHW; + checker.set_param(param).execs({{1, 1, 4, 4}, {3, 1, 3, 3}, {}}); +} + +//! 
Conv3D +TEST_F(NAIVE, CONV3D_RECORD) { + using TestArg = convolution3d::TestArg; + std::vector args = convolution3d::get_args(); + TaskRecordChecker checker(2); + NormalRNG default_rng; + for (auto&& arg : args) { + float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3] * + arg.filter[4]); + UniformFloatRNG rng(scale, 2 * scale); + checker.set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .set_rng(0, &default_rng) + .set_rng(1, &default_rng) + .set_param(arg.param) + .execs({arg.src, arg.filter, {}}); + } +} + +//! cumsum +TEST_F(NAIVE, CUMSUM_RECORD) { + TaskRecordChecker checker(2); + struct TestArg { + param::Cumsum param; + TensorShape shape; + TestArg(param::Cumsum param, TensorShape shape) : param(param), shape(shape) {} + }; + std::vector args, args_int32; + for (auto shape : TensorShapeArray{{1000}, {330, 33}, {10, 10, 10}, {5, 5, 5, 5}}) { + for (size_t axis = 0; axis < shape.ndim; ++axis) { + args.emplace_back(param::Cumsum(axis, true, true), shape); + args.emplace_back(param::Cumsum(axis, true, false), shape); + args.emplace_back(param::Cumsum(axis, false, true), shape); + args.emplace_back(param::Cumsum(axis, false, false), shape); + } + } + for (auto shape : TensorShapeArray{{1}, {10}, {100}, {1000}, {10000}}) { + args.emplace_back(param::Cumsum(0, true, true), shape); + args.emplace_back(param::Cumsum(0, true, false), shape); + args.emplace_back(param::Cumsum(0, false, true), shape); + args.emplace_back(param::Cumsum(0, false, false), shape); + } + for (auto shape : TensorShapeArray{{1}, {10}, {100}, {1000}, {10000}}) { + args_int32.emplace_back(param::Cumsum(0, true, true), shape); + args_int32.emplace_back(param::Cumsum(0, true, false), shape); + args_int32.emplace_back(param::Cumsum(0, false, true), shape); + args_int32.emplace_back(param::Cumsum(0, false, false), shape); + } + for (auto arg : args) { + checker.set_param(arg.param); + checker.set_epsilon(1e-2); + checker.set_dtype(0, dtype::Float32()).execs({{arg.shape}, {}}); + checker.set_dtype(0, dtype::Int16()).execs({{arg.shape}, {}}); + checker.set_dtype(0, dtype::Int32()).execs({{arg.shape}, {}}); + } + for (auto arg : args_int32) { + checker.set_param(arg.param); + checker.set_epsilon(1e-2); + checker.set_dtype(0, dtype::Int32()).execs({{arg.shape}, {}}); + } +} + +//! dct +TEST_F(NAIVE, DCT_RECORD) { + TaskRecordChecker checker(2); + DctChannelSelectForward::Param param; + param.format = DctChannelSelectForward::Param::Format::NCHW4; + checker.set_dtype(0, dtype::Uint8()).set_dtype(3, dtype::QuantizedS8(10.f)); + checker.set_param(param).execs({{1, 1, 16, 16}, {}, {}, {}}); +} + +//! 
deformable_conv +TEST_F(NAIVE, DEFORMABLE_CONV_FWD_RECORD) { + TaskRecordChecker checker(2); + DeformableConv::Param param; + + UniformIntRNG im_rng{0, 4}; + UniformIntRNG filter_rng{0, 4}; + UniformIntRNG offset_rng{-2, 2}; + UniformIntRNG mask_rng{0, 1}; + + checker.set_rng(0, &im_rng) + .set_rng(1, &filter_rng) + .set_rng(2, &offset_rng) + .set_rng(3, &mask_rng); + + param.pad_h = 1; + param.pad_w = 1; + param.stride_h = 1; + param.stride_w = 1; + param.dilate_h = 1; + param.dilate_w = 1; + param.format = DeformableConv::Param::Format::NCHW; + param.sparse = DeformableConv::Param::Sparse::GROUP; + + checker.set_param(param).execs( + {{1, 2, 5, 5}, + {2, 1, 1, 3, 3}, + {1, 2 * 2 * 3 * 3, 5, 5}, + {1, 2 * 3 * 3, 5, 5}, + {}}); + + checker.set_param(param).execs( + {{1, 2, 5, 5}, + {2, 1, 1, 3, 3}, + {1, 2 * 2 * 3 * 3, 5, 5}, + {1, 2 * 3 * 3, 5, 5}, + {}}); + + param.sparse = DeformableConv::Param::Sparse::DENSE; + checker.set_param(param).execs( + {{1, 2, 5, 5}, + {2, 2, 3, 3}, + {1, 2 * 2 * 3 * 3, 5, 5}, + {1, 2 * 3 * 3, 5, 5}, + {}}); +} + +TEST_F(NAIVE, DEFORMABLE_CONV_BWD_FILTER_RECORD) { + TaskRecordChecker checker(2); + DeformableConv::Param param; + + UniformIntRNG im_rng{0, 4}; + UniformIntRNG offset_rng{-2, 2}; + UniformIntRNG mask_rng{0, 1}; + UniformIntRNG out_grad_rng{0, 1}; + + checker.set_rng(0, &im_rng) + .set_rng(1, &offset_rng) + .set_rng(2, &mask_rng) + .set_rng(3, &out_grad_rng); + param.pad_h = 1; + param.pad_w = 1; + param.stride_h = 1; + param.stride_w = 1; + param.dilate_h = 1; + param.dilate_w = 1; + param.format = DeformableConv::Param::Format::NCHW; + param.sparse = DeformableConv::Param::Sparse::GROUP; + + checker.set_param(param).execs( + {{1, 2, 5, 5}, + {1, 2 * 2 * 3 * 3, 5, 5}, + {1, 2 * 3 * 3, 5, 5}, + {1, 2, 5, 5}, + {2, 1, 1, 3, 3}}); +} + +TEST_F(NAIVE, DEFORMABLE_CONV_BWD_DATA_RECORD) { + TaskRecordChecker checker(2); + DeformableConv::Param param; + + ConstValue im_rng{1}; + ConstValue filter_rng{0.99}; + ConstValue offset_rng{1.1}; + ConstValue mask_rng{1}; + ConstValue out_grad_rng{1}; + + checker.set_rng(0, &im_rng) + .set_rng(1, &filter_rng) + .set_rng(2, &offset_rng) + .set_rng(3, &mask_rng) + .set_rng(4, &out_grad_rng); + + param.pad_h = 1; + param.pad_w = 1; + param.stride_h = 1; + param.stride_w = 1; + param.dilate_h = 1; + param.dilate_w = 1; + param.format = DeformableConv::Param::Format::NCHW; + param.sparse = DeformableConv::Param::Sparse::GROUP; + + checker.set_param(param).execs( + {{1, 2, 5, 5}, + {2, 1, 1, 3, 3}, + {1, 1 * 2 * 3 * 3, 5, 5}, + {1, 1 * 3 * 3, 5, 5}, + {1, 2, 5, 5}, + {1, 2, 5, 5}, + {1, 1 * 2 * 3 * 3, 5, 5}, + {1, 1 * 3 * 3, 5, 5}}); +} + +//! 
elemwise +TEST_F(NAIVE, ELEMWISE_COMMON_RECORD) { + TaskRecordChecker checker(2); + using Mode = ElemwiseForward::Param::Mode; + auto run_activate = [&](size_t N, size_t C, size_t H, size_t W, Mode mode, + DType dtype) { + checker.set_param(mode).set_dtype(0, dtype).set_dtype(1, dtype); + checker.execs({{N, C, H, W}, {}}); + }; + auto run_binary = [&](size_t N, size_t C, size_t H, size_t W, Mode mode, + DType dtype) { + checker.set_param(mode).set_dtype(0, dtype).set_dtype(1, dtype).set_dtype( + 2, dtype); + checker.execs({{N, C, H, W}, {N, C, H, W}, {}}); + }; + auto run_unary = [&](size_t N, size_t C, size_t H, size_t W, Mode mode, + DType dtype) { + checker.set_param(mode).set_dtype(0, dtype).set_dtype(1, dtype); + checker.execs({{N, C, H, W}, {}}); + }; + +#define RUN_ACTIVATE(_dt) \ + run_activate(4, 32, 10, 10, Mode::RELU, _dt); \ + run_activate(4, 32, 10, 10, Mode::SIGMOID, _dt); + RUN_ACTIVATE(dtype::Float32()); + RUN_ACTIVATE(dtype::Float16()); + checker.set_epsilon(1e-2); + RUN_ACTIVATE(dtype::BFloat16()); +#undef RUN_ACTIVATE + + checker.set_epsilon(1e-3); + +#define RUN_BINARY(_dt) \ + run_binary(4, 32, 10, 10, Mode::ADD, _dt); \ + run_binary(4, 32, 10, 10, Mode::SUB, _dt); \ + run_binary(4, 32, 10, 10, Mode::MUL, _dt); \ + run_binary(4, 32, 10, 10, Mode::MIN, _dt); \ + run_binary(4, 32, 10, 10, Mode::MAX, _dt); + RUN_BINARY(dtype::Float32()); + RUN_BINARY(dtype::Float16()); + RUN_BINARY(dtype::BFloat16()); + RUN_BINARY(dtype::Int32()); + RUN_BINARY(dtype::Int16()); + + //! true_div + run_binary(4, 32, 10, 10, Mode::TRUE_DIV, dtype::Float32()); + RUN_BINARY(dtype::Float16()); + checker.set_epsilon(1e-2); + run_binary(4, 32, 10, 10, Mode::TRUE_DIV, dtype::Float16()); + RUN_BINARY(dtype::BFloat16()); + //! FIXME: precision is especially low + checker.set_epsilon(1e-1); + run_binary(4, 32, 10, 10, Mode::TRUE_DIV, dtype::BFloat16()); +#undef RUN_BINARY + +#define RUN_UNARY(_dt) \ + run_unary(4, 32, 10, 10, Mode::ABS, _dt); \ + run_unary(4, 32, 10, 10, Mode::SIN, _dt); \ + run_unary(4, 32, 10, 10, Mode::COS, _dt); \ + run_unary(4, 32, 10, 10, Mode::EXP, _dt); \ + run_unary(4, 32, 10, 10, Mode::CEIL, _dt); \ + run_unary(4, 32, 10, 10, Mode::TANH, _dt); + RUN_UNARY(dtype::Float32()); + RUN_UNARY(dtype::BFloat16()); + checker.set_epsilon(1e-2); + RUN_UNARY(dtype::Float16()); + + //! FLOOR + run_unary(4, 32, 10, 10, Mode::FLOOR, dtype::Float32()); + run_unary(4, 32, 10, 10, Mode::FLOOR, dtype::Float16()); + + //! INT TEST + run_unary(4, 32, 10, 10, Mode::ABS, dtype::Int16()); + run_unary(4, 32, 10, 10, Mode::ABS, dtype::Int32()); +#undef RUN_UNARY + + //! 
naive impl + run_binary(4, 32, 10, 10, Mode::LT, dtype::Float32()); + run_binary(4, 32, 10, 10, Mode::LT, dtype::Int32()); + + run_binary(4, 32, 10, 10, Mode::LEQ, dtype::Float32()); + run_binary(4, 32, 10, 10, Mode::LEQ, dtype::Int32()); + + run_binary(4, 32, 10, 10, Mode::EQ, dtype::Float32()); + run_binary(4, 32, 10, 10, Mode::EQ, dtype::Int32()); + + auto rng = UniformFloatRNG(0.01, 2.0); + checker.set_rng(0, &rng); + + run_unary(4, 32, 10, 10, Mode::LOG, dtype::Float32()); + run_unary(4, 32, 10, 10, Mode::LOG, dtype::BFloat16()); + checker.set_epsilon(1e-2); + run_unary(4, 32, 10, 10, Mode::LOG, dtype::Float16()); + + run_unary(4, 32, 10, 10, Mode::NEGATE, dtype::Float32()); + run_unary(4, 32, 10, 10, Mode::NEGATE, dtype::BFloat16()); + run_unary(4, 32, 10, 10, Mode::NEGATE, dtype::Float16()); + + auto rng_int = UniformIntNonZeroRNG(1, 65535); + checker.set_rng(0, &rng_int); + run_unary(4, 32, 10, 10, Mode::NEGATE, dtype::Int32()); + run_unary(4, 32, 10, 10, Mode::NEGATE, dtype::Int16()); +} + +TEST_F(NAIVE, ELEMWISE_BROADCAST_RECORD) { + TaskRecordChecker checker(2); + using Mode = ElemwiseForward::Param::Mode; + //! do broadcast test + auto run_binary_broadcast = [&](size_t N, size_t C, size_t H, size_t W, Mode mode, + DType dtype) { + checker.set_param(mode).set_dtype(0, dtype).set_dtype(1, dtype); + checker.execs({{N, C, H, W}, {N, C, 1, 1}, {}}); + checker.execs({{N, C, 1, 1}, {N, C, H, W}, {}}); + checker.execs({{N, C, H, W}, {1}, {}}); + checker.execs({{1}, {N, C, H, W}, {}}); + checker.execs({{N, C, H, W}, {1, C, H, W}, {}}); + checker.execs({{1, C, H, W}, {N, C, H, W}, {}}); + }; +#define RUN_BINARY(_dt) \ + run_binary_broadcast(4, 32, 10, 10, Mode::ADD, _dt); \ + run_binary_broadcast(4, 32, 10, 10, Mode::SUB, _dt); \ + run_binary_broadcast(4, 32, 10, 10, Mode::MUL, _dt); \ + run_binary_broadcast(4, 32, 10, 10, Mode::MIN, _dt); \ + run_binary_broadcast(4, 32, 10, 10, Mode::MAX, _dt); + RUN_BINARY(dtype::Float32()); + run_binary_broadcast(4, 32, 10, 10, Mode::TRUE_DIV, dtype::Float32()); + RUN_BINARY(dtype::Float16()); + checker.set_epsilon(1e-2); + run_binary_broadcast(4, 32, 10, 10, Mode::TRUE_DIV, dtype::Float16()); + RUN_BINARY(dtype::BFloat16()); + //! FIXME: precision is especially low + checker.set_epsilon(1e-1); + run_binary_broadcast(4, 32, 10, 10, Mode::TRUE_DIV, dtype::BFloat16()); + RUN_BINARY(dtype::Int16()); + RUN_BINARY(dtype::Int32()); +#undef RUN_BINARY +} + +TEST_F(NAIVE, ELEMWISE_FUSE_MUL_ADD3_RECORD) { + TaskRecordChecker checker(2); + using Mode = ElemwiseForward::Param::Mode; + auto run_mul_add = [&](size_t N, size_t C, size_t H, size_t W, DType dtype) { + checker.set_param(Mode::FUSE_MUL_ADD3) + .set_dtype(0, dtype) + .set_dtype(1, dtype) + .set_dtype(2, dtype); + checker.execs({{1}, {N, C, H, W}, {1}, {}}); + checker.execs({{N, C, 1, 1}, {N, C, H, W}, {1}, {}}); + checker.execs({{N, C, H, W}, {N, C, H, W}, {1}, {}}); + checker.execs({{N, C, 1, 1}, {N, C, H, W}, {N, C, 1, 1}, {}}); + }; + run_mul_add(4, 32, 10, 10, dtype::Float32()); + checker.set_epsilon(1e-2); + run_mul_add(4, 32, 10, 10, dtype::Float16()); + //! 
FIXME: precision is especially low + checker.set_epsilon(1e-1); + run_mul_add(4, 32, 10, 10, dtype::BFloat16()); + run_mul_add(4, 32, 10, 10, dtype::Int16()); + run_mul_add(4, 32, 10, 10, dtype::Int32()); +} + +TEST_F(NAIVE, ELEMWISE_FUSE_MUL_ADD4_RECORD) { + TaskRecordChecker checker(2); + using Mode = ElemwiseForward::Param::Mode; + auto run_mul_add = [&](size_t N, size_t C, size_t H, size_t W, DType dtype) { + checker.set_param(Mode::FUSE_MUL_ADD4) + .set_dtype(0, dtype) + .set_dtype(1, dtype) + .set_dtype(2, dtype) + .set_dtype(3, dtype) + .set_dtype(4, dtype); + checker.execs({{1}, {N, C, H, W}, {1}, {N, C, H, W}, {}}); + checker.execs({{1}, {N, C, H, W}, {N, C, H, W}, {1}, {}}); + checker.execs({{N, C, 1, 1}, {N, C, H, W}, {N, C, 1, 1}, {N, C, H, W}, {}}); + checker.execs({{N, C, H, W}, {N, C, H, W}, {N, C, H, W}, {N, C, H, W}, {}}); + }; + run_mul_add(4, 32, 10, 10, dtype::Float32()); + checker.set_epsilon(1e-2); + run_mul_add(4, 32, 10, 10, dtype::Float16()); + //! FIXME: precision is especially low + checker.set_epsilon(1e-1); + run_mul_add(4, 32, 10, 10, dtype::BFloat16()); + run_mul_add(4, 32, 10, 10, dtype::Int16()); + run_mul_add(4, 32, 10, 10, dtype::Int32()); +} + +TEST_F(NAIVE, ELEMWISE_FUSE_ADD_RELU_RECORD) { + TaskRecordChecker checker(2); + using Mode = ElemwiseForward::Param::Mode; + auto run_mul_add = [&](size_t N, size_t C, size_t H, size_t W, DType dtype) { + checker.set_param(Mode::FUSE_ADD_RELU) + .set_dtype(0, dtype) + .set_dtype(1, dtype) + .set_dtype(2, dtype); + checker.execs({{N, C, H, W}, {N, C, H, W}, {}}); + }; + run_mul_add(4, 32, 10, 10, dtype::Float32()); + checker.set_epsilon(1e-2); + run_mul_add(4, 32, 10, 10, dtype::Float16()); + //! FIXME: precision is especially low + checker.set_epsilon(1e-1); + run_mul_add(4, 32, 10, 10, dtype::BFloat16()); +} + +TEST_F(NAIVE, ELEMWISE_FUSE_ADD_SIGMOID_RECORD) { + TaskRecordChecker checker(2); + using Mode = ElemwiseForward::Param::Mode; + auto run_mul_add = [&](size_t N, size_t C, size_t H, size_t W, DType dtype) { + checker.set_param(Mode::FUSE_ADD_SIGMOID) + .set_dtype(0, dtype) + .set_dtype(1, dtype) + .set_dtype(2, dtype); + checker.execs({{N, C, H, W}, {N, C, H, W}, {}}); + }; + run_mul_add(4, 32, 10, 10, dtype::Float32()); + checker.set_epsilon(1e-2); + run_mul_add(4, 32, 10, 10, dtype::Float16()); + //! FIXME: precision is especially low + checker.set_epsilon(1e-1); + run_mul_add(4, 32, 10, 10, dtype::BFloat16()); +} + +TEST_F(NAIVE, ELEMWISE_FUSE_ADD_TANH_RECORD) { + TaskRecordChecker checker(2); + using Mode = ElemwiseForward::Param::Mode; + auto run_mul_add = [&](size_t N, size_t C, size_t H, size_t W, DType dtype) { + checker.set_param(Mode::FUSE_ADD_TANH) + .set_dtype(0, dtype) + .set_dtype(1, dtype) + .set_dtype(2, dtype); + checker.execs({{N, C, H, W}, {N, C, H, W}, {}}); + }; + run_mul_add(4, 32, 10, 10, dtype::Float32()); + checker.set_epsilon(1e-2); + run_mul_add(4, 32, 10, 10, dtype::Float16()); + //! 
FIXME: precision is especially low + checker.set_epsilon(1e-1); + run_mul_add(4, 32, 10, 10, dtype::BFloat16()); +} + +TEST_F(NAIVE, ELEMWISE_VECTOR_RECORD) { + TaskRecordChecker checker(2); + using Mode = ElemwiseForward::Param::Mode; + auto run_vector = [&](size_t N, DType dtype, Mode mode) { + checker.set_param(mode).set_dtype(0, dtype).set_dtype(1, dtype).set_dtype( + 2, dtype); + checker.execs({{N}, {1, N}, {}}); + checker.execs({{1, N}, {N}, {}}); + checker.execs({{N}, {1}, {}}); + checker.execs({{1}, {N}, {}}); + checker.execs({{1}, {1, 1}, {}}); + checker.execs({{1, 1, 1}, {1}, {}}); + }; + run_vector(1000, dtype::Float32(), Mode::ADD); + run_vector(1000, dtype::Float32(), Mode::MUL); + checker.set_epsilon(1e-2); + run_vector(1000, dtype::Float16(), Mode::ADD); + run_vector(1000, dtype::Float16(), Mode::MUL); + //! FIXME: precision is especially low + checker.set_epsilon(1e-1); + run_vector(1000, dtype::BFloat16(), Mode::ADD); + run_vector(1000, dtype::BFloat16(), Mode::MUL); +} + +//! EYE +TEST_F(NAIVE, EYE_RECORD) { + TaskRecordChecker checker(2); + for (DType dtype : + std::vector{dtype::Float16(), dtype::Int32(), dtype::Float32()}) + for (int k = -20; k < 20; ++k) { + checker.set_param({k, dtype.enumv()}); + checker.set_dtype(0, dtype); + checker.execs(TensorShapeArray{{3, 4}}); + checker.execs(TensorShapeArray{{4, 3}}); + } +} + +//! FILL +TEST_F(NAIVE, FILL_RECORD) { + TaskRecordChecker checker(2); + for (DType dtype : + std::vector{dtype::Float16(), dtype::Int32(), dtype::Float32()}) + for (float value : {-1.23, 0.0, 0.001, 234.0, 2021.072}) { + checker.set_param({value}); + checker.set_dtype(0, dtype); + checker.exec(TensorShapeArray{{1, 1}}); + checker.exec(TensorShapeArray{{2, 3, 4}}); + } +} + +//! LINSPACE +TEST_F(NAIVE, LINSPACE_RECORD) { + TaskRecordChecker checker(2); + Linspace::Param param; + param.start = 0.5; + param.stop = 1.5; + param.endpoint = true; + for (DType dtype : + std::vector{dtype::Float16(), dtype::Int32(), dtype::Float32()}) { + checker.set_dtype(0, dtype).set_param(param).exec(TensorShapeArray{{11}}); + } + param.endpoint = false; + for (DType dtype : + std::vector{dtype::Float16(), dtype::Int32(), dtype::Float32()}) { + checker.set_dtype(0, dtype).set_param(param).exec(TensorShapeArray{{11}}); + } +} + +//! LOCAL +TEST_F(NAIVE, LOCAL_FORWARD_RECORD) { + auto args = local::get_args_for_cuda(); + for (size_t i = 0; i < 2; ++i) { + auto&& arg = args[i]; + TaskRecordChecker checker(2); + checker.set_param(arg.param).exec( + TensorShapeArray{arg.sshape(), arg.fshape(), arg.dshape()}); + } +} + +TEST_F(NAIVE, LOCAL_BACKWARD_DATA_RECORD) { + using namespace local; + auto args = local::get_args_bwd_data_for_cuda(); + for (size_t i = 0; i < 2; ++i) { + auto&& arg = args[i]; + TaskRecordChecker checker(2); + checker.set_param(arg.param).exec( + TensorShapeArray{arg.fshape(), arg.dshape(), arg.sshape()}); + } +} + +TEST_F(NAIVE, LOCAL_BACKWARD_FILTER_RECORD) { + using namespace local; + auto args = local::get_args_bwd_filter_for_cuda(); + for (size_t i = 0; i < 2; ++i) { + auto&& arg = args[i]; + TaskRecordChecker checker(2); + checker.set_param(arg.param).exec( + TensorShapeArray{arg.sshape(), arg.dshape(), arg.fshape()}); + } +} + +//! matrix inverse +TEST_F(NAIVE, MATRIX_INVERSE_RECORD) { + TaskRecordChecker checker(2); + checker.exec({{10, 20, 20}, {}}); +} + +//! 
matmul +TEST_F(NAIVE, MATRIX_MUL_RECORD) { + TaskRecordChecker checker(2); + MatrixMul::Param param; + param.transposeA = false; + param.transposeB = false; + + checker.set_dtype(0, dtype::Quantized8Asymm(0.1f, (uint8_t)128)) + .set_dtype(1, dtype::Quantized8Asymm(0.2f, (uint8_t)233)) + .set_dtype(2, dtype::QuantizedS32(0.1f * 0.2f)); + checker.set_param(param).exec({{4, 7}, {7, 5}, {}}); + + param.transposeA = true; + checker.set_dtype(0, dtype::Quantized8Asymm(0.7f, (uint8_t)128)) + .set_dtype(1, dtype::Quantized8Asymm(0.4f, (uint8_t)128)) + .set_dtype(2, dtype::QuantizedS32(0.7f * 0.4f)); + checker.set_param(param).exec({{2, 1}, {2, 1}, {}}); +} + +//! pooling +TEST_F(NAIVE, POOLING_QUANTIZED_RECORD) { + using Mode = Pooling::Param::Mode; + + TaskRecordChecker checker(2); + Pooling::Param param{Mode::MAX, 1, 1, 2, 2, 2, 2}; + auto dt = dtype::Quantized8Asymm(0.1f, (uint8_t)128); + checker.set_dtype(0, dt).set_dtype(1, dt); + checker.set_param(param).exec({{1, 1, 3, 3}, {}}); + + param = {Mode::AVERAGE, 1, 1, 2, 2, 2, 2}; + checker.set_param(param).exec({{1, 1, 3, 3}, {}}); + + param = {Mode::AVERAGE_COUNT_EXCLUDE_PADDING, 1, 1, 2, 2, 2, 2}; + checker.set_param(param).exec({{1, 1, 3, 3}, {}}); + + auto dt32 = dtype::QuantizedS32(0.233f); + checker.set_dtype(0, dt32).set_dtype(1, dt32); + param = {Mode::MAX, 1, 1, 2, 2, 2, 2}; + checker.set_param(param).exec({{1, 1, 3, 3}, {}}); +} + +TEST_F(NAIVE, REDUCE_QUANTIZED_RECORD) { + using Mode = Reduce::Param::Mode; + + TaskRecordChecker checker(2); + + Reduce::Param param; + param.mode = Mode::SUM; + param.data_type = param::Reduce::DataType::QUINT_I8xO32; + param.axis = 0; + + checker.set_dtype(0, dtype::Quantized8Asymm(0.1f, (uint8_t)128)) + .set_dtype(1, dtype::QuantizedS32(0.1f)); + checker.set_param(param).exec({{3, 4}, {}}); + + param.data_type = param::Reduce::DataType::DEFAULT; + param.mode = Mode::MEAN; + checker.set_dtype(0, dtype::Quantized8Asymm(1.f, (uint8_t)128)) + .set_dtype(1, dtype::Quantized8Asymm(1.f, (uint8_t)128)); + checker.set_param(param).exec({{3, 4}, {}}); + + checker.set_dtype(0, dtype::Quantized8Asymm(0.00233f, (uint8_t)128)) + .set_dtype(1, dtype::Quantized8Asymm(0.00233f, (uint8_t)128)); + checker.set_param(param).exec({{3, 4}, {}}); + + checker.set_dtype(0, dtype::Quantized8Asymm(7e-10f, (uint8_t)45)) + .set_dtype(1, dtype::Quantized8Asymm(7e-10f, (uint8_t)45)); + checker.set_param(param).exec({{3, 4}, {}}); +} + +//! 
relayout format +TEST_F(NAIVE, RELAYOUT_FORMAT_NCHW4_NCHW_RECORD) { + TaskRecordChecker checker(2); + RelayoutFormat::Param param{RelayoutFormat::Param::Mode::NCHW4_NCHW}; + checker.set_param(param).exec({{1, 2, 1, 2, 4}, {}}); + + param.oc = 7; + checker.set_param(param).exec({{1, 2, 1, 2, 4}, {}}); + + param.oc = 6; + param.group = 2; + checker.set_param(param).exec({{1, 2, 1, 2, 4}, {}}); +} + +TEST_F(NAIVE, RELAYOUT_FORMAT_NCHW_NCHW4_WEIGHT_RECORD) { + TaskRecordChecker checker(2); + RelayoutFormat::Param param{RelayoutFormat::Param::Mode::NCHW_NCHW4_WEIGHT}; + checker.set_param(param); + checker.exec({{2, 2, 2, 2}, {}}); + checker.exec({{2, 2, 1, 2, 2}, {}}); +} + +TEST_F(NAIVE, RELAYOUT_FORMAT_NCHW_NCHW4_RECORD) { + TaskRecordChecker checker(2); + RelayoutFormat::Param param{RelayoutFormat::Param::Mode::NCHW_NCHW4}; + checker.set_param(param).exec({{1, 8, 1, 2}, {}}); + + param.group = 4; + checker.set_param(param).exec({{1, 8, 1, 2}, {}}); + + param.group = 2; + checker.set_param(param).exec({{1, 6, 1, 2}, {}}); +} + +TEST_F(NAIVE, RELAYOUT_FORMAT_NCHW88_RECORD) { + TaskRecordChecker checker(2); + { + RelayoutFormat::Param param{RelayoutFormat::Param::Mode::NCHW_NCHW88}; + checker.set_param(param); + checker.exec({{1, 8, 1, 2}, {}}); + checker.exec({{2, 8, 1, 2}, {}}); + checker.exec({{2, 4, 1, 2}, {}}); + checker.exec({{1, 3, 64, 64}, {}}); + } + { + RelayoutFormat::Param param{RelayoutFormat::Param::Mode::NCHW88_NCHW}; + checker.set_param(param).exec({{1, 1, 1, 2, 8}, {}}); + } +} + +TEST_F(NAIVE, RELAYOUT_FORMAT_NCHW88_DENSE_RECORD) { + TaskRecordChecker checker(2); + RelayoutFormat::Param param{ + RelayoutFormat::Param::Mode::NCHW_NCHW88_CONV_DENSE_WEIGHT}; + checker.set_param(param); + checker.exec({{8, 8, 1, 1}, {}}); + checker.exec({{8, 2, 1, 1}, {}}); +} + +TEST_F(NAIVE, RELAYOUT_FORMAT_NCHW88_CHAIN_RECORD) { + TaskRecordChecker checker(2); + RelayoutFormat::Param param{ + RelayoutFormat::Param::Mode::NCHW_NCHW88_CONV_CHAN_WEIGHT}; + checker.set_param(param); + checker.exec({{8, 1, 1, 1, 2}, {}}); + checker.exec({{2, 1, 1, 1, 2}, {}}); +} + +TEST_F(NAIVE, RELAYOUT_FORMAT_NCHW88_GROUP_RECORD) { + TaskRecordChecker checker(2); + { + RelayoutFormat::Param param{ + RelayoutFormat::Param::Mode::NCHW_NCHW88_CONV_GROUP_WEIGHT}; + checker.set_param(param); + checker.exec({{1, 8, 8, 1, 1}, {}}); + checker.exec({{1, 8, 2, 1, 1}, {}}); + } + { + RelayoutFormat::Param param{RelayoutFormat::Param::Mode::NCHW88_NCHW}; + checker.set_param(param).exec({TensorShape{1, 8, 64, 64, 8}, {}}); + } +} + +//! separable conv +TEST_F(NAIVE, SEPARABLE_CONV_RECORD) { + using TestArg = megdnn::test::separable_conv::TestArg; + std::vector args = separable_conv::get_args(); + TaskRecordChecker checker(2); + for (auto&& arg : args) { + checker.set_param(arg.param).execs({arg.src, arg.filter_x, arg.filter_y, {}}); + } +} + +//! 
warp affine +TEST_F(NAIVE, WARP_AFFINE_RECORD) { + TaskRecordChecker checker(2); + WarpAffine::Param param; + param.border_mode = WarpAffine::Param::BorderMode::BORDER_REFLECT; + param.imode = WarpAffine::Param::InterpolationMode::LINEAR; + param.format = WarpAffine::Param::Format::NCHW; + + checker.set_dtype(0, dtype::Uint8{}) + .set_dtype(1, dtype::Float32{}) + .set_dtype(2, dtype::Uint8{}); + checker.set_param(param).exec({{1, 1, 3, 3}, {1, 2, 3}, {1, 1, 2, 2}}); + + checker.set_dtype(0, dtype::Quantized8Asymm{1.4f, static_cast(127)}) + .set_dtype(1, dtype::Float32{}) + .set_dtype(2, dtype::Quantized8Asymm{1.4f, static_cast(127)}); + checker.set_param(param).exec({{1, 1, 3, 3}, {1, 2, 3}, {1, 1, 2, 2}}); +} + +TEST_F(NAIVE, WARP_AFFINE_CV_RECORD) { + using TestArg = warp_affine::TestArg; + std::vector args = warp_affine::get_cv_args(); + TaskRecordChecker checker(2); + + for (auto&& arg : args) { + checker.set_param(arg.param) + .set_dtype(0, dtype::Uint8()) + .set_dtype(1, dtype::Float32()) + .set_dtype(2, dtype::Uint8()) + .execs({arg.src, arg.trans, arg.dst}); + } + + for (auto&& arg : args) { + checker.set_param(arg.param) + .set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .set_dtype(2, dtype::Float32()) + .execs({arg.src, arg.trans, arg.dst}); + } +} + +//! warp perspective +TEST_F(NAIVE, WARP_PERSPECTIVE_RECORD) { + TaskRecordChecker checker(2); + WarpPerspective::Param param; + param.bmode = WarpPerspective::Param::BorderMode::BORDER_REFLECT; + param.imode = WarpPerspective::Param::InterpolationMode::LINEAR; + param.format = WarpPerspective::Param::Format::NCHW; + + checker.set_dtype(0, dtype::Uint8{}) + .set_dtype(1, dtype::Float32{}) + .set_dtype(2, dtype::Uint8{}); + checker.set_param(param).exec({{1, 1, 3, 3}, {1, 3, 3}, {1, 1, 2, 2}}); + + checker.set_dtype(0, dtype::Quantized8Asymm{1.4f, static_cast(127)}) + .set_dtype(1, dtype::Float32{}) + .set_dtype(2, dtype::Quantized8Asymm{1.4f, static_cast(127)}); + checker.set_param(param).exec({{1, 1, 3, 3}, {1, 3, 3}, {1, 1, 2, 2}}); +} + +TEST_F(NAIVE, WARP_PERSPECTIVE_NCHW4_RECORD) { + using Param = WarpPerspective::Param; + WarpPerspective::Param param; + TaskRecordChecker checker(2); + WarpPerspectiveMatRNG rng; + checker.set_rng(1, &rng); + checker.set_dtype(0, dtype::QuantizedS8(0.1f)); + checker.set_dtype(2, dtype::QuantizedS8(0.1f)); + for (auto bmode : + {WarpPerspective::BorderMode::WRAP, WarpPerspective::BorderMode::REFLECT, + WarpPerspective::BorderMode::REPLICATE, + WarpPerspective::BorderMode::CONSTANT}) { + param.border_val = 0.3f; + param.bmode = bmode; + param.imode = Param::InterpolationMode::LINEAR; + + param.format = Param::Format::NCHW4; + checker.set_param(param); + checker.execs({{2, 1, 10, 11, 4}, {2, 3, 3}, {2, 1, 11, 12, 4}}); + checker.execs({{1, 25, 25, 25, 4}, {1, 3, 3}, {1, 25, 25, 510, 4}}); + checker.execs({{1, 25, 25, 25, 4}, {1, 3, 3}, {1, 25, 51, 51, 4}}); + checker.execs({{1, 25, 51, 51, 4}, {1, 3, 3}, {1, 25, 25, 25, 4}}); + } +} + +TEST_F(NAIVE_MULTI_THREADS, WARP_PERSPECTIVE_RECORD) { + TaskRecordChecker checker(2); + WarpPerspective::Param param; + param.bmode = WarpPerspective::Param::BorderMode::BORDER_REFLECT; + param.imode = WarpPerspective::Param::InterpolationMode::LINEAR; + param.format = WarpPerspective::Param::Format::NCHW; + + checker.set_dtype(0, dtype::Uint8{}) + .set_dtype(1, dtype::Float32{}) + .set_dtype(2, dtype::Uint8{}); + checker.set_param(param).exec({{1, 1, 3, 3}, {1, 3, 3}, {1, 1, 2, 2}}); + + checker.set_dtype(0, dtype::Quantized8Asymm{1.4f, 
static_cast(127)}) + .set_dtype(1, dtype::Float32{}) + .set_dtype(2, dtype::Quantized8Asymm{1.4f, static_cast(127)}); + checker.set_param(param).exec({{1, 1, 3, 3}, {1, 3, 3}, {1, 1, 2, 2}}); +} + +TEST_F(NAIVE_MULTI_THREADS, WARP_PERSPECTIVE_NCHW4_RECORD) { + using Param = WarpPerspective::Param; + WarpPerspective::Param param; + TaskRecordChecker checker(2); + WarpPerspectiveMatRNG rng; + checker.set_rng(1, &rng); + checker.set_dtype(0, dtype::QuantizedS8(0.1f)); + checker.set_dtype(2, dtype::QuantizedS8(0.1f)); + for (auto bmode : + {WarpPerspective::BorderMode::WRAP, WarpPerspective::BorderMode::REFLECT, + WarpPerspective::BorderMode::REPLICATE, + WarpPerspective::BorderMode::CONSTANT}) { + param.border_val = 0.3f; + param.bmode = bmode; + param.imode = Param::InterpolationMode::LINEAR; + + param.format = Param::Format::NCHW4; + checker.set_param(param); + checker.execs({{2, 1, 10, 11, 4}, {2, 3, 3}, {2, 1, 11, 12, 4}}); + checker.execs({{1, 25, 25, 25, 4}, {1, 3, 3}, {1, 25, 25, 510, 4}}); + checker.execs({{1, 25, 25, 25, 4}, {1, 3, 3}, {1, 25, 51, 51, 4}}); + checker.execs({{1, 25, 51, 51, 4}, {1, 3, 3}, {1, 25, 25, 25, 4}}); + } +} + +} // namespace test +} // namespace megdnn + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/test/naive/resize.cpp b/dnn/test/naive/resize.cpp index cc18bb72..1291b1c6 100644 --- a/dnn/test/naive/resize.cpp +++ b/dnn/test/naive/resize.cpp @@ -44,7 +44,7 @@ TEST_F(NAIVE, RESIZE_NCHW4) { TensorNDArray nchw4_tensors; for (size_t i = 0; i < tensors.size(); ++i) { auto layout = convert_true_format(nchw_tensors[i].layout); - nchw4_tensors.emplace_back(tensors[i].raw_ptr, std::move(layout)); + nchw4_tensors.emplace_back(tensors[i].raw_ptr(), std::move(layout)); } auto relayout = handle()->create_operator(); @@ -61,7 +61,7 @@ TEST_F(NAIVE, RESIZE_NCHW4) { free(workspace_ptr); for (auto&& tensor : nchw_tensors) { - free(tensor.raw_ptr); + free(tensor.raw_ptr()); } }; checker.set_extra_opr_impl(extra_impl); diff --git a/dnn/test/naive/warp_perspective.cpp b/dnn/test/naive/warp_perspective.cpp index 8fa4e0bc..a5902456 100644 --- a/dnn/test/naive/warp_perspective.cpp +++ b/dnn/test/naive/warp_perspective.cpp @@ -97,7 +97,7 @@ TEST_F(NAIVE, WARP_PERSPECTIVE_NCHW4) { TensorNDArray nchw4_tensors; for (size_t i = 0; i < tensors.size(); ++i) { auto layout = convert_true_format(nchw_tensors[i].layout); - nchw4_tensors.emplace_back(tensors[i].raw_ptr, std::move(layout)); + nchw4_tensors.emplace_back(tensors[i].raw_ptr(), std::move(layout)); } auto workspace_size = warp_perspective->get_workspace_in_bytes( @@ -116,7 +116,7 @@ TEST_F(NAIVE, WARP_PERSPECTIVE_NCHW4) { free(workspace_ptr); for (auto&& tensor : nchw_tensors) { - free(tensor.raw_ptr); + free(tensor.raw_ptr()); } }; @@ -143,7 +143,6 @@ TEST_F(NAIVE, WARP_PERSPECTIVE_NCHW4) { checker.execs({{1, 25, 25, 510, 4}, {1, 3, 3}, {1, 25, 25, 25, 4}}); checker.execs({{1, 25, 25, 25, 4}, {1, 3, 3}, {1, 25, 51, 51, 4}}); checker.execs({{1, 25, 51, 51, 4}, {1, 3, 3}, {1, 25, 25, 25, 4}}); - break; } } @@ -272,7 +271,7 @@ TEST_F(NAIVE_MULTI_THREADS, WARP_PERSPECTIVE_NCHW4) { TensorNDArray nchw4_tensors; for (size_t i = 0; i < tensors.size(); ++i) { auto layout = convert_true_format(nchw_tensors[i].layout); - nchw4_tensors.emplace_back(tensors[i].raw_ptr, std::move(layout)); + nchw4_tensors.emplace_back(tensors[i].raw_ptr(), std::move(layout)); } auto workspace_size = warp_perspective->get_workspace_in_bytes( @@ -291,7 +290,7 @@ TEST_F(NAIVE_MULTI_THREADS, WARP_PERSPECTIVE_NCHW4) { 
free(workspace_ptr); for (auto&& tensor : nchw_tensors) { - free(tensor.raw_ptr); + free(tensor.raw_ptr()); } }; @@ -318,7 +317,6 @@ TEST_F(NAIVE_MULTI_THREADS, WARP_PERSPECTIVE_NCHW4) { checker.execs({{1, 25, 25, 510, 4}, {1, 3, 3}, {1, 25, 25, 25, 4}}); checker.execs({{1, 25, 25, 25, 4}, {1, 3, 3}, {1, 25, 51, 51, 4}}); checker.execs({{1, 25, 51, 51, 4}, {1, 3, 3}, {1, 25, 25, 25, 4}}); - break; } } @@ -593,7 +591,7 @@ TEST_F(NAIVE, WARP_PERSPECTIVE_NCHW64) { TensorNDArray nchw64_tensors; for (size_t i = 0; i < tensors.size(); ++i) { auto layout = convert_true_format(nchw_tensors[i].layout); - nchw64_tensors.emplace_back(tensors[i].raw_ptr, std::move(layout)); + nchw64_tensors.emplace_back(tensors[i].raw_ptr(), std::move(layout)); } auto workspace_size = warp_perspective->get_workspace_in_bytes( @@ -612,7 +610,7 @@ TEST_F(NAIVE, WARP_PERSPECTIVE_NCHW64) { free(workspace_ptr); for (auto&& tensor : nchw_tensors) { - free(tensor.raw_ptr); + free(tensor.raw_ptr()); } }; @@ -675,7 +673,7 @@ TEST_F(NAIVE, WARP_PERSPECTIVE_NHWC) { TensorNDArray nhwc_tensors; for (size_t i = 0; i < tensors.size(); ++i) { auto layout = convert_true_format(nchw_tensors[i].layout); - nhwc_tensors.emplace_back(tensors[i].raw_ptr, std::move(layout)); + nhwc_tensors.emplace_back(tensors[i].raw_ptr(), std::move(layout)); } auto workspace_size = warp_perspective->get_workspace_in_bytes( @@ -693,7 +691,7 @@ TEST_F(NAIVE, WARP_PERSPECTIVE_NHWC) { relayout->exec(nchw_tensors[2], nhwc_tensors[2]); free(workspace_ptr); for (auto&& tensor : nchw_tensors) { - free(tensor.raw_ptr); + free(tensor.raw_ptr()); } }; diff --git a/dnn/test/rocm/adaptive_pooling.cpp b/dnn/test/rocm/adaptive_pooling.cpp index 478dcfd8..57eeca1a 100644 --- a/dnn/test/rocm/adaptive_pooling.cpp +++ b/dnn/test/rocm/adaptive_pooling.cpp @@ -58,8 +58,8 @@ TEST_F(ROCM, ADAPTIVE_POOLING_BACKWARD) { auto&& tensors_rocm = *tensors_rocm_storage; auto span = tensors_rocm[0].layout.span(); - auto dst = static_cast(tensors_rocm[0].raw_ptr) + span.low_byte; - auto src = static_cast(tensors_orig[0].raw_ptr) + + auto dst = static_cast(tensors_rocm[0].raw_ptr()) + span.low_byte; + auto src = static_cast(tensors_orig[0].raw_ptr()) + span.low_byte; megdnn_memcpy_H2D(handle_rocm(), dst, src, span.dist_byte()); @@ -71,8 +71,9 @@ TEST_F(ROCM, ADAPTIVE_POOLING_BACKWARD) { megdnn_free(handle_rocm(), workspace_rocm); span = tensors_rocm[1].layout.span(); - dst = static_cast(tensors_orig[1].raw_ptr) + span.low_byte; - src = static_cast(tensors_rocm[1].raw_ptr) + span.low_byte; + dst = static_cast(tensors_orig[1].raw_ptr()) + span.low_byte; + src = static_cast(tensors_rocm[1].raw_ptr()) + + span.low_byte; megdnn_memcpy_D2H(handle_rocm(), dst, src, span.dist_byte()); }; diff --git a/dnn/test/rocm/benchmarker.inl b/dnn/test/rocm/benchmarker.inl index 2cd6424b..819e2dfa 100644 --- a/dnn/test/rocm/benchmarker.inl +++ b/dnn/test/rocm/benchmarker.inl @@ -52,9 +52,9 @@ float ROCMBenchmarker::exec(TensorLayoutArray layouts) { auto trans_func = [handle](const TensorLayout& layout) { auto span = layout.span(); TensorND res; - res.raw_ptr = - static_cast(megdnn_malloc(handle, span.dist_byte())) + - span.low_byte; + res.reset_ptr( + (static_cast(megdnn_malloc(handle, span.dist_byte())) + + span.low_byte)); res.layout = layout; return res; }; @@ -71,7 +71,8 @@ float ROCMBenchmarker::exec(TensorLayoutArray layouts) { rng = m_default_rng.get(); auto size = tensor.layout.span().high_byte; rng->gen(tensor); - megdnn_memcpy_H2D(m_handle_rocm, tensors_cur[i].raw_ptr, tensor.raw_ptr, size); + 
megdnn_memcpy_H2D( + m_handle_rocm, tensors_cur[i].raw_ptr(), tensor.raw_ptr(), size); } m_device_timer.reset(); m_device_timer.start(); @@ -83,7 +84,7 @@ float ROCMBenchmarker::exec(TensorLayoutArray layouts) { } auto free = [](Handle* handle, TensorNDArray& tensors) { std::for_each(tensors.begin(), tensors.end(), [handle](const TensorND& tensor) { - megdnn_free(handle, tensor.raw_ptr); + megdnn_free(handle, tensor.raw_ptr()); }); }; free(m_handle_rocm, tensors_cur); diff --git a/dnn/test/rocm/checksum.cpp b/dnn/test/rocm/checksum.cpp index 45aa7a44..fb5967f1 100644 --- a/dnn/test/rocm/checksum.cpp +++ b/dnn/test/rocm/checksum.cpp @@ -25,7 +25,7 @@ TEST_F(ROCM, CHECKSUM_FORWARD) { auto aligned_size = size + ((512 - size % 512) % 512); auto run = [&](megdnn::Checksum* opr, void* ptr, bool log_size) { TensorND tensor; - tensor.raw_ptr = ptr; + tensor.reset_ptr(ptr); tensor.layout.init_contiguous_stride({size}); tensor.layout.dtype = dtype::Byte(); WorkspaceWrapper workspace( diff --git a/dnn/test/rocm/pooling.cpp b/dnn/test/rocm/pooling.cpp index 4275def6..11382831 100644 --- a/dnn/test/rocm/pooling.cpp +++ b/dnn/test/rocm/pooling.cpp @@ -57,8 +57,8 @@ TEST_F(ROCM, POOLING_BACKWARD) { auto&& tensors_rocm = *tensors_rocm_storage; auto span = tensors_rocm[0].layout.span(); - auto dst = static_cast(tensors_rocm[0].raw_ptr) + span.low_byte; - auto src = static_cast(tensors_orig[0].raw_ptr) + + auto dst = static_cast(tensors_rocm[0].raw_ptr()) + span.low_byte; + auto src = static_cast(tensors_orig[0].raw_ptr()) + span.low_byte; megdnn_memcpy_H2D(handle_rocm(), dst, src, span.dist_byte()); @@ -70,8 +70,9 @@ TEST_F(ROCM, POOLING_BACKWARD) { megdnn_free(handle_rocm(), workspace_rocm); span = tensors_rocm[1].layout.span(); - dst = static_cast(tensors_orig[1].raw_ptr) + span.low_byte; - src = static_cast(tensors_rocm[1].raw_ptr) + span.low_byte; + dst = static_cast(tensors_orig[1].raw_ptr()) + span.low_byte; + src = static_cast(tensors_rocm[1].raw_ptr()) + + span.low_byte; megdnn_memcpy_D2H(handle_rocm(), dst, src, span.dist_byte()); }; diff --git a/dnn/test/x86/add_update.cpp b/dnn/test/x86/add_update.cpp index 17e1a50e..f6f3a4ff 100644 --- a/dnn/test/x86/add_update.cpp +++ b/dnn/test/x86/add_update.cpp @@ -11,8 +11,8 @@ #include "test/common/checker.h" #include "test/common/resize.h" #include "test/common/rng.h" +#include "test/common/task_record_check.h" #include "test/x86/fixture.h" - namespace megdnn { namespace test { @@ -37,6 +37,27 @@ TEST_F(X86, ADD_UPDATE) { .execs({{2, 3, 4}, {1}}); } +TEST_F(X86, ADD_UPDATE_RECORD) { + TaskRecordChecker checker(0); + + checker.set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .execs({{1, 3, 5, 5}, {1, 3, 5, 5}}); + checker.set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .execs({{2, 3, 4}, {2, 3, 4}}); + checker.set_param({2, -1, 3}) + .set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .execs({{2, 3, 2}, {2, 3, 2}}); + checker.set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .execs({{1, 3, 5, 5}, {1, 3, 1, 1}}); + checker.set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .execs({{2, 3, 4}, {1}}); +} + } // namespace test } // namespace megdnn // vim: syntax=cpp.doxygen diff --git a/dnn/test/x86/conv_bias.cpp b/dnn/test/x86/conv_bias.cpp index 333ac6e8..f65e9100 100644 --- a/dnn/test/x86/conv_bias.cpp +++ b/dnn/test/x86/conv_bias.cpp @@ -18,6 +18,7 @@ #include "test/common/checker.h" #include "test/common/conv_bias.h" #include "test/common/rng.h" +#include 
"test/common/task_record_check.h" #include "test/common/tensor.h" #include "test/common/workspace_wrapper.h" namespace megdnn { @@ -42,6 +43,25 @@ TEST_F(X86, CONV_BIAS_FORWARD) { } } +TEST_F(X86, CONV_BIAS_FORWARD_RECORD) { + using namespace conv_bias; + std::vector args = get_args(); + TaskRecordChecker checker(0); + NormalRNG default_rng; + ConstValue const_val; + for (auto&& arg : args) { + checker.set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .set_dtype(2, dtype::Float32()) + .set_rng(0, &default_rng) + .set_rng(1, &default_rng) + .set_rng(2, &default_rng) + .set_epsilon(1e-3) + .set_param(arg.param) + .execs({arg.src, arg.filter, arg.bias, {}, {}}); + } +} + static void avx2_chanwise_direct_int8x8x32( Handle* handle, uint32_t stride, const char* algo) { using namespace conv_bias; @@ -99,6 +119,49 @@ TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_INT8x8x32) { handle(), 1, "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1"); } +TEST_F(X86, AVX2_CHANWISE_DIRECT_STRIDE1_INT8x8x32_RECORD) { + using namespace conv_bias; + std::vector args; + size_t stride = 1; + auto run = [&](size_t ic, size_t w, size_t h, size_t kernel, size_t p, + NonlineMode nonline_mode) { + if (w + 2 * p < kernel || h + 2 * p < kernel) + return; + param::ConvBias param; + param.stride_h = stride; + param.stride_w = stride; + param.pad_h = p; + param.pad_w = p; + param.nonlineMode = nonline_mode; + + param.sparse = param::ConvBias::Sparse::GROUP; + //! no bias + args.emplace_back( + param, TensorShape{2, ic, h, w}, TensorShape{ic, 1, 1, kernel, kernel}, + TensorShape{}); + //! bias channel + args.emplace_back( + param, TensorShape{2, ic, h, w}, TensorShape{ic, 1, 1, kernel, kernel}, + TensorShape{1, ic, 1, 1}); + }; + + run(5, 16, 7, 2, 1, NonlineMode::IDENTITY); + + TaskRecordChecker checker(0); + UniformIntRNG rng{-50, 50}; + checker.set_dtype(0, dtype::Int8()) + .set_dtype(1, dtype::Int8()) + .set_dtype(2, dtype::Int32()) + .set_dtype(4, dtype::Int32()) + .set_rng(0, &rng) + .set_rng(1, &rng) + .set_rng(2, &rng) + .set_epsilon(1e-3); + for (auto&& arg : args) { + checker.set_param(arg.param).exec({arg.src, arg.filter, arg.bias, {}, {}}); + } +} + TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE2_INT8x8x32) { avx2_chanwise_direct_int8x8x32( handle(), 2, "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE2"); @@ -1094,6 +1157,44 @@ TEST_F(X86, CONV_BIAS_IM2COLMATMUL_FP32) { #undef cb } +TEST_F(X86, CONV_BIAS_IM2COLMATMUL_FP32_RECORD) { + using namespace conv_bias; + std::vector args; + + auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p, + NonlineMode nonline_mode) { + if (w + 2 * p < kernel || h + 2 * p < kernel) + return; + param::ConvBias param; + param.stride_h = 1; + param.stride_w = 1; + param.pad_h = p; + param.pad_w = p; + param.nonlineMode = nonline_mode; + + //! 
no bias + args.emplace_back( + param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel}, + TensorShape{}); + args.emplace_back( + param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel}, + TensorShape{1, oc, 1, 1}); + args.emplace_back( + param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel}, + TensorShape{ + 1, oc, (h + 2 * p - kernel) / param.stride_h + 1, + (w + 2 * p - kernel) / param.stride_w + 1}); + }; + for (NonlineMode nonline_mode : {NonlineMode::IDENTITY, NonlineMode::RELU}) { + run(1, 1, 24, 24, 2, 2, nonline_mode); + } + + TaskRecordChecker checker(0); + for (auto&& arg : args) { + checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}}); + } +} + TEST_F(X86, CONV_BIAS_IM2COLMATMUL_FP32_NOPACK_PREPROCESS) { using namespace conv_bias; std::vector args; diff --git a/dnn/test/x86/convolution.cpp b/dnn/test/x86/convolution.cpp index 226f0857..c4b79b48 100644 --- a/dnn/test/x86/convolution.cpp +++ b/dnn/test/x86/convolution.cpp @@ -18,9 +18,9 @@ #include "test/common/checker.h" #include "test/common/convolution.h" #include "test/common/rng.h" +#include "test/common/task_record_check.h" #include "test/common/tensor.h" #include "test/common/workspace_wrapper.h" - namespace { #if MEGDNN_X86_WITH_MKL_DNN struct ConvArg { @@ -140,6 +140,38 @@ TEST_F(X86, DEFAULT_CONV_DIRECT_STRIDE1) { } } +TEST_F(X86, DEFAULT_CONV_DIRECT_STRIDE1_RECORD) { + using namespace convolution; + std::vector args; + auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p) { + if (w + 2 * p < kernel || h + 2 * p < kernel) + return; + param::Convolution param; + param.stride_h = 1; + param.stride_w = 1; + param.pad_h = p; + param.pad_w = p; + + args.emplace_back( + param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel}); + }; + run(1, 1, 20, 20, 3, 2); + + TaskRecordChecker checker(0); + checker.set_epsilon(1); + UniformIntRNG rng{-50, 50}; + checker.set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .set_dtype(2, dtype::Float32()) + .set_rng(0, &rng) + .set_rng(1, &rng) + .set_rng(2, &rng); + + for (auto&& arg : args) { + checker.set_param(arg.param).exec({arg.src, arg.filter, {}}); + } +} + TEST_F(X86, DEFAULT_CONV_DIRECT_STRIDE2) { using namespace convolution; std::vector args; diff --git a/dnn/test/x86/cvt_color.cpp b/dnn/test/x86/cvt_color.cpp index 7855bc71..8ff37f9c 100644 --- a/dnn/test/x86/cvt_color.cpp +++ b/dnn/test/x86/cvt_color.cpp @@ -15,6 +15,7 @@ #include "test/x86/fixture.h" +#include "test/common/task_record_check.h" namespace megdnn { namespace test { @@ -33,6 +34,16 @@ TEST_F(X86, CVTCOLOR) { } } +TEST_F(X86, CVTCOLOR_RECORD) { + using namespace cvt_color; + std::vector args = get_args(); + TaskRecordChecker checker(0); + checker.set_param(args[0].param) + .set_dtype(0, args[0].dtype) + .set_dtype(1, args[0].dtype) + .execs({args[0].src, {}}); +} + #ifdef MEGDNN_WITH_BENCHMARK TEST_F(X86, BENCHMARK_CVTCOLOR_RGB2GRAY) { using namespace cvt_color; diff --git a/dnn/test/x86/elemwise.cpp b/dnn/test/x86/elemwise.cpp index 77e4ceee..dffb89f5 100644 --- a/dnn/test/x86/elemwise.cpp +++ b/dnn/test/x86/elemwise.cpp @@ -12,6 +12,7 @@ #include "megdnn/oprs.h" #include "test/common/checker.h" #include "test/common/rng.h" +#include "test/common/task_record_check.h" #include "test/x86/fixture.h" using namespace megdnn; @@ -225,5 +226,60 @@ TYPED_TEST_CASE(X86_ELEMWISE, elemwise::test_types); TYPED_TEST(X86_ELEMWISE, run) { elemwise::run_test(this->handle()); } +#undef UNARY_TEST_CASE +#undef 
BUILD_UNARY_TEST_CASE_FLOAT + +#define UNARY_TEST_CASE(_optr) checker.set_param(Mode::_optr).execs({{1, 155}, {}}); + +#define BUILD_UNARY_TEST_CASE_FLOAT UNARY_TEST_CASE(ABS) + +TEST_F(X86, ELEMWISE_UNARY_RECORD) { + using Mode = ElemwiseForward::Param::Mode; + TaskRecordChecker checker(0); + // case float + UniformFloatRNG rng(1e-2, 6e1); + checker.set_rng(0, &rng); + checker.set_epsilon(1e-6); + checker.set_dtype(0, dtype::Float32()); + BUILD_UNARY_TEST_CASE_FLOAT +} + +#undef BINARY_COMPLATE_TEST_CASE +#undef BUILD_BINARY_COMPLATE_TEST_CASE_FLOAT32 + +#define BINARY_COMPLATE_TEST_CASE(_optr) \ + checker.set_param(Mode::_optr).execs({{3, 4, 7}, {3, 4, 7}, {}}); + +#define BUILD_BINARY_COMPLATE_TEST_CASE_FLOAT32 BINARY_COMPLATE_TEST_CASE(ADD) + +TEST_F(X86, ELEMWISE_BINARY_RECORD) { + using Mode = ElemwiseForward::Param::Mode; + TaskRecordChecker checker(0); + + // case float + UniformFloatRNG rng(1e-5, 7e1); + checker.set_rng(0, &rng); + checker.set_epsilon(1e-5); + checker.set_dtype(0, dtype::Float32()); + checker.set_dtype(1, dtype::Float32()); + BUILD_BINARY_COMPLATE_TEST_CASE_FLOAT32 +} +#undef TERNARY_COMPLATE_TEST_CASE +#undef BUILD_TERNARY_COMPLATE_TEST_CASE +#define TERNARY_COMPLATE_TEST_CASE(_optr) \ + checker.set_param(Mode::_optr).execs({{3, 4, 7}, {3, 4, 7}, {3, 4, 7}, {}}); + +#define BUILD_TERNARY_COMPLATE_TEST_CASE TERNARY_COMPLATE_TEST_CASE(FUSE_MUL_ADD3) + +TEST_F(X86, ELEMWISE_TERNARY_RECORD) { + using Mode = ElemwiseForward::Param::Mode; + TaskRecordChecker checker(0); + // case int + checker.set_dtype(0, dtype::Int8()); + checker.set_dtype(1, dtype::Int8()); + checker.set_dtype(2, dtype::Int8()); + // BUILD_TERNARY_TEST_CASE + BUILD_TERNARY_COMPLATE_TEST_CASE +} // vim: syntax=cpp.doxygen diff --git a/dnn/test/x86/elemwise_multi_type.cpp b/dnn/test/x86/elemwise_multi_type.cpp index c1ac430d..8af2bc73 100644 --- a/dnn/test/x86/elemwise_multi_type.cpp +++ b/dnn/test/x86/elemwise_multi_type.cpp @@ -12,10 +12,10 @@ #include "test/common/elemwise_multi_type.h" #include "megdnn/oprs.h" #include "test/common/checker.h" +#include "test/common/task_record_check.h" #include "test/common/timer.h" #include "test/common/workspace_wrapper.h" #include "test/x86/fixture.h" - using namespace megdnn; using namespace test; @@ -80,6 +80,53 @@ TEST_F(X86, ELEMWISE_QUANTIZED_MODE_UNARY) { } } +TEST_F(X86, ELEMWISE_QUANTIZED_MODE_UNARY_RECORD) { + using Mode = ElemwiseMultiType::Param::Mode; + TaskRecordChecker checker(0); + + std::unique_ptr rng; + for (auto mode : + {Mode::QRELU, Mode::QABS, Mode::QSIGMOID, Mode::QEXP, Mode::QTANH, + Mode::QFAST_TANH, Mode::QH_SWISH}) { + checker.set_param({mode}); + + for (DType src_type : std::vector{ + dtype::QuantizedS8(1.4f), + dtype::Quantized8Asymm(1.3f, static_cast(4)), + dtype::QuantizedS32(1.3f)}) { + checker.set_dtype(0, src_type); + if (src_type.enumv() == DTypeEnum::QuantizedS8) { + rng = std::make_unique(-127, 127); + checker.set_dtype(1, dtype::QuantizedS8(1.7f)); + } else if (src_type.enumv() == DTypeEnum::Quantized8Asymm) { + rng = std::make_unique(0, 255); + checker.set_dtype( + 1, dtype::Quantized8Asymm(1.7f, static_cast(10))); + } else { + rng = std::make_unique(INT16_MIN >> 1, INT16_MAX >> 1); + } + + checker.set_rng(0, rng.get()); + auto run = [&]() { + checker.execs({{3, 4, 5, 6}, {}}); + checker.execs({{3}, {}}); + }; + + if (src_type.enumv() == DTypeEnum::QuantizedS32) { + for (DType dst_type : std::vector{ + dtype::QuantizedS8(32718.6f), + dtype::Quantized8Asymm( + 32729.6f, static_cast(128))}) { + checker.set_dtype(1, 
diff --git a/dnn/test/x86/elemwise_multi_type.cpp b/dnn/test/x86/elemwise_multi_type.cpp
index c1ac430d..8af2bc73 100644
--- a/dnn/test/x86/elemwise_multi_type.cpp
+++ b/dnn/test/x86/elemwise_multi_type.cpp
@@ -12,10 +12,10 @@
 #include "test/common/elemwise_multi_type.h"
 #include "megdnn/oprs.h"
 #include "test/common/checker.h"
+#include "test/common/task_record_check.h"
 #include "test/common/timer.h"
 #include "test/common/workspace_wrapper.h"
 #include "test/x86/fixture.h"
-
 using namespace megdnn;
 using namespace test;
@@ -80,6 +80,53 @@ TEST_F(X86, ELEMWISE_QUANTIZED_MODE_UNARY) {
     }
 }
 
+TEST_F(X86, ELEMWISE_QUANTIZED_MODE_UNARY_RECORD) {
+    using Mode = ElemwiseMultiType::Param::Mode;
+    TaskRecordChecker checker(0);
+
+    std::unique_ptr rng;
+    for (auto mode :
+         {Mode::QRELU, Mode::QABS, Mode::QSIGMOID, Mode::QEXP, Mode::QTANH,
+          Mode::QFAST_TANH, Mode::QH_SWISH}) {
+        checker.set_param({mode});
+
+        for (DType src_type : std::vector{
+                     dtype::QuantizedS8(1.4f),
+                     dtype::Quantized8Asymm(1.3f, static_cast(4)),
+                     dtype::QuantizedS32(1.3f)}) {
+            checker.set_dtype(0, src_type);
+            if (src_type.enumv() == DTypeEnum::QuantizedS8) {
+                rng = std::make_unique(-127, 127);
+                checker.set_dtype(1, dtype::QuantizedS8(1.7f));
+            } else if (src_type.enumv() == DTypeEnum::Quantized8Asymm) {
+                rng = std::make_unique(0, 255);
+                checker.set_dtype(
+                        1, dtype::Quantized8Asymm(1.7f, static_cast(10)));
+            } else {
+                rng = std::make_unique(INT16_MIN >> 1, INT16_MAX >> 1);
+            }
+
+            checker.set_rng(0, rng.get());
+            auto run = [&]() {
+                checker.execs({{3, 4, 5, 6}, {}});
+                checker.execs({{3}, {}});
+            };
+
+            if (src_type.enumv() == DTypeEnum::QuantizedS32) {
+                for (DType dst_type : std::vector{
+                             dtype::QuantizedS8(32718.6f),
+                             dtype::Quantized8Asymm(
+                                     32729.6f, static_cast(128))}) {
+                    checker.set_dtype(1, dst_type);
+                    run();
+                }
+            } else {
+                run();
+            }
+        }
+    }
+}
+
 TEST_F(X86, ELEMWISE_QUANTIZED_MODE_BINARY) {
     using Mode = ElemwiseMultiType::Param::Mode;
     Checker checker(handle());
@@ -238,6 +285,98 @@ TEST_F(X86, ELEMWISE_QUANTIZED_MODE_BINARY) {
     checker.execs({{3, 4, 5, 6}, {3, 4, 5, 6}, {}});
 }
 
+TEST_F(X86, ELEMWISE_QUANTIZED_MODE_BINARY_RECORD) {
+    using Mode = ElemwiseMultiType::Param::Mode;
+    TaskRecordChecker checker(0);
+
+    // qint32 to qint8/quint8
+    for (auto mode : {Mode::QADD, Mode::QFUSE_ADD_RELU, Mode::QFUSE_ADD_H_SWISH}) {
+        checker.set_param({mode});
+        UniformIntRNG rng{INT16_MIN >> 1, INT16_MAX >> 1};
+        checker.set_rng(0, &rng)
+                .set_rng(1, &rng)
+                .set_dtype(0, dtype::QuantizedS32(1.3f))
+                .set_dtype(1, dtype::QuantizedS32(1.2f));
+
+        for (DType dst_type : std::vector{
+                     dtype::QuantizedS8(32718.6f),
+                     dtype::Quantized8Asymm(32729.6f, static_cast(128))}) {
+            checker.set_dtype(2, dst_type);
+
+            //! VEC + SCALAR
+            checker.execs({{3, 4, 5, 6}, {1, 1, 1, 1}, {}});
+        }
+    }
+
+    for (auto mode :
+         {Mode::QMUL, Mode::QADD, Mode::QMIN, Mode::QMAX, Mode::QSUB,
+          Mode::QFUSE_ADD_RELU, Mode::QFUSE_ADD_SIGMOID, Mode::QFUSE_ADD_H_SWISH}) {
+        checker.set_param({mode});
+
+        // qint8 to qint8
+        UniformIntRNG rng_int8{-127, 127};
+        checker.set_rng(0, &rng_int8)
+                .set_rng(1, &rng_int8)
+                .set_dtype(0, dtype::QuantizedS8(1.35f))
+                .set_dtype(1, dtype::QuantizedS8(1.15f))
+                .set_dtype(2, dtype::QuantizedS8(1.75f));
+
+        checker.execs({{3, 4, 5, 6}, {1, 1, 1, 1}, {}});
+
+        // quint8 to quint8
+        UniformIntRNG rng_uint8{0, 255};
+        checker.set_rng(0, &rng_uint8)
+                .set_rng(1, &rng_uint8)
+                .set_dtype(0, dtype::Quantized8Asymm(1.35f, static_cast(128)))
+                .set_dtype(1, dtype::Quantized8Asymm(1.15f, static_cast(128)))
+                .set_dtype(2, dtype::Quantized8Asymm(1.75f, static_cast(128)));
+
+        checker.execs({{3, 4, 5, 6}, {1, 1, 1, 1}, {}});
+    }
+
+    //! TRUE_DIV : 0.0 / 0.0 will fail
+    checker.set_param({Mode::QTRUE_DIV});
+    UniformIntRNG rng_int8_1{-127, 127};
+    UniformIntRNG rng_int8_2{-127, -1};
+    checker.set_rng(0, &rng_int8_1)
+            .set_rng(1, &rng_int8_2)
+            .set_dtype(0, dtype::QuantizedS8(1.4f))
+            .set_dtype(1, dtype::QuantizedS8(1.1f))
+            .set_dtype(2, dtype::QuantizedS8(1.7f));
+
+    checker.execs({{3, 4, 5, 6}, {1, 1, 1, 1}, {}});
+    // quint8 to quint8
+    UniformIntRNG rng_uint8_1{0, 255};
+    UniformIntRNG rng_uint8_2{0, 127};
+    checker.set_rng(0, &rng_uint8_1)
+            .set_rng(1, &rng_uint8_2)
+            .set_dtype(0, dtype::Quantized8Asymm(1.35f, static_cast(128)))
+            .set_dtype(1, dtype::Quantized8Asymm(1.15f, static_cast(128)))
+            .set_dtype(2, dtype::Quantized8Asymm(1.75f, static_cast(128)));
+
+    checker.execs({{3, 4, 5, 6}, {1, 1, 1, 1}, {}});
+
+    //! TANH
+    checker.set_param({Mode::QFUSE_ADD_TANH});
+    UniformIntRNG rng_int8{-5, 5};
+    checker.set_rng(0, &rng_int8)
+            .set_rng(1, &rng_int8)
+            .set_dtype(0, dtype::QuantizedS8(1.1f))
+            .set_dtype(1, dtype::QuantizedS8(1.4f))
+            .set_dtype(2, dtype::QuantizedS8(1.7f));
+
+    checker.execs({{3, 4, 5, 6}, {1, 1, 1, 1}, {}});
+
+    UniformIntRNG rng_uint8{123, 133};
+    checker.set_rng(0, &rng_uint8)
+            .set_rng(1, &rng_uint8)
+            .set_dtype(0, dtype::Quantized8Asymm(1.1f, static_cast(128)))
+            .set_dtype(1, dtype::Quantized8Asymm(1.4f, static_cast(128)))
+            .set_dtype(2, dtype::Quantized8Asymm(1.7f, static_cast(128)));
+
+    checker.execs({{3, 4, 5, 6}, {1, 1, 1, 1}, {}});
+}
+
 TEST_F(X86, ELEMWISE_QUANTIZED_MODE_TERNARY) {
     using Mode = ElemwiseMultiType::Param::Mode;
     Checker checker(handle());
@@ -283,4 +422,37 @@ TEST_F(X86, ELEMWISE_QUANTIZED_MODE_TERNARY) {
     }
 }
 
+TEST_F(X86, ELEMWISE_QUANTIZED_MODE_TERNARY_RECORD) {
+    using Mode = ElemwiseMultiType::Param::Mode;
+    TaskRecordChecker checker(0);
+
+    for (auto mode : {Mode::QFUSE_MUL_ADD3}) {
+        checker.set_param({mode});
+
+        // qint8 to qint8
+        UniformIntRNG rng_int8{-127, 127};
+        checker.set_rng(0, &rng_int8)
+                .set_rng(1, &rng_int8)
+                .set_rng(2, &rng_int8)
+                .set_dtype(0, dtype::QuantizedS8(1.45f))
+                .set_dtype(1, dtype::QuantizedS8(1.15f))
+                .set_dtype(2, dtype::QuantizedS8(1.75f))
+                .set_dtype(3, dtype::QuantizedS8(1.35f));
+
+        checker.execs({{3, 4, 5, 6}, {3, 4, 5, 6}, {1, 1, 1, 1}, {}});
+
+        // quint8 to quint8
+        UniformIntRNG rng_uint8{0, 225};
+        checker.set_rng(0, &rng_uint8)
+                .set_rng(1, &rng_uint8)
+                .set_rng(2, &rng_uint8)
+                .set_dtype(0, dtype::Quantized8Asymm(1.35f, static_cast(128)))
+                .set_dtype(1, dtype::Quantized8Asymm(1.15f, static_cast(128)))
+                .set_dtype(2, dtype::Quantized8Asymm(1.75f, static_cast(128)))
+                .set_dtype(3, dtype::Quantized8Asymm(1.45f, static_cast(128)));
+
+        checker.execs({{3, 4, 5, 6}, {3, 4, 5, 6}, {1, 1, 1, 1}, {}});
+    }
+}
+
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/dnn/test/x86/gaussian_blur.cpp b/dnn/test/x86/gaussian_blur.cpp
index 71ac304f..f1451ea8 100644
--- a/dnn/test/x86/gaussian_blur.cpp
+++ b/dnn/test/x86/gaussian_blur.cpp
@@ -10,8 +10,8 @@
  */
 #include "test/common/gaussian_blur.h"
 #include "test/common/checker.h"
+#include "test/common/task_record_check.h"
 #include "test/x86/fixture.h"
-
 namespace megdnn {
 namespace test {
 
@@ -36,6 +36,22 @@ TEST_F(X86, GAUSSIAN_BLUR) {
     }
 }
 
+TEST_F(X86, GAUSSIAN_BLUR_RECORD) {
+    using namespace gaussian_blur;
+    std::vector args = get_args();
+    TaskRecordChecker checker(0);
+    auto arg = args[0];
+    checker.set_param(arg.param)
+            .set_dtype(0, dtype::Float32())
+            .set_dtype(1, dtype::Float32())
+            .execs({arg.src, {}});
+
+    checker.set_param(arg.param)
+            .set_epsilon(1 + 1e-3)
+            .set_dtype(0, dtype::Uint8())
+            .set_dtype(1, dtype::Uint8())
+            .execs({arg.src, {}});
+}
 } // namespace test
 } // namespace megdnn
 // vim: syntax=cpp.doxygen
diff --git a/dnn/test/x86/lrn.cpp b/dnn/test/x86/lrn.cpp
index 7e2642f8..39aa4292 100644
--- a/dnn/test/x86/lrn.cpp
+++ b/dnn/test/x86/lrn.cpp
@@ -12,7 +12,7 @@
 #include "test/common/checker.h"
 #include "test/common/local.h"
-
+#include "test/common/task_record_check.h"
 namespace megdnn {
 namespace test {
 
@@ -23,6 +23,13 @@ TEST_F(X86, LRN) {
         checker.execs({{2, w, 12, 13}, {}});
     }
 }
+TEST_F(X86, LRN_RECORD) {
+    TaskRecordChecker checker(0);
+    checker.execs({{2, 11, 12, 13}, {}});
+    for (size_t w = 10; w <= 50; w += 10) {
+        checker.execs({{2, w, 12, 13}, {}});
+    }
+}
 } // namespace test
 } // namespace megdnn
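One detail worth keeping in mind when reading the ELEMWISE_QUANTIZED_*_RECORD tests above: the RNG range is always matched to the storage type of the quantized dtype, and Quantized8Asymm takes a scale plus an explicit zero point, while QuantizedS8 and QuantizedS32 take only a scale. A small sketch of that pairing; the scales, ranges, and shapes are the ones used above, while the TaskRecordChecker template argument and the uint8_t type of the zero-point cast are assumptions:

```cpp
// Sketch: pair each quantized dtype with an RNG whose range fits its storage,
// as the quantized elemwise record tests above do.
TEST_F(X86, QUANTIZED_DTYPE_PAIRING_SKETCH) {
    using Mode = ElemwiseMultiType::Param::Mode;
    TaskRecordChecker<ElemwiseMultiType> checker(0);  // template argument assumed
    checker.set_param({Mode::QADD});

    UniformIntRNG rng_int8{-127, 127};  // fits the int8 storage of QuantizedS8
    checker.set_rng(0, &rng_int8)
            .set_rng(1, &rng_int8)
            .set_dtype(0, dtype::QuantizedS8(1.35f))  // scale only
            .set_dtype(1, dtype::QuantizedS8(1.15f))
            .set_dtype(2, dtype::QuantizedS8(1.75f));
    checker.execs({{3, 4, 5, 6}, {1, 1, 1, 1}, {}});

    // The asymmetric variant switches both pieces together: an RNG over
    // [0, 255] and dtype::Quantized8Asymm(scale, static_cast<uint8_t>(128)).
}
```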
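The CV-style record tests in this patch (GAUSSIAN_BLUR_RECORD above, and the resize, separable filter, and warp affine record tests further below) share a second convention: run a Float32 pass with the default tolerance, then a Uint8 pass with epsilon 1 + 1e-3 so that integer outputs may differ by one after rounding. A self-contained sketch of that two-pass shape; the shapes, the default-constructed param, and the template argument are assumptions for illustration:

```cpp
// Sketch of the float-then-uint8 two-pass pattern used by the CV record tests.
TEST_F(X86, RESIZE_RECORD_SKETCH) {
    TaskRecordChecker<Resize> checker(0);   // template argument assumed
    param::Resize param;                    // defaults assumed (NCHW, linear)
    checker.set_param(param);

    // Float32 pass: compared with the default (tight) tolerance.
    checker.set_dtype(0, dtype::Float32())
            .set_dtype(1, dtype::Float32())
            .execs({{1, 3, 8, 8}, {1, 3, 16, 16}});

    // Uint8 pass: epsilon 1 + 1e-3 absorbs off-by-one rounding differences.
    checker.set_dtype(0, dtype::Uint8())
            .set_dtype(1, dtype::Uint8())
            .set_epsilon(1 + 1e-3)
            .execs({{1, 3, 8, 8}, {1, 3, 16, 16}});
}
```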
diff --git a/dnn/test/x86/matrix_mul.cpp b/dnn/test/x86/matrix_mul.cpp
index a3cd9960..16d40156 100644
--- a/dnn/test/x86/matrix_mul.cpp
+++ b/dnn/test/x86/matrix_mul.cpp
@@ -16,10 +16,38 @@
 #include "test/common/checker.h"
 #include "test/common/matrix_mul.h"
 #include "test/common/rng.h"
+#include "test/common/task_record_check.h"
 
 using namespace megdnn;
 using namespace test;
 using namespace megdnn::x86;
+TEST_F(X86, MATRIX_MUL_RECORD) {
+    TaskRecordChecker checker(0);
+    using Param = MatrixMul::Param;
+    auto args = matrix_mul::get_matmul_args();
+    auto arg = args[0];
+    auto m = arg.m, n = arg.n, k = arg.k;
+    auto mask = arg.mask;
+    Param param;
+    param.transposeA = mask & 1;
+    param.transposeB = mask & 2;
+    TensorShape AS, BS, CS;
+    if (param.transposeA)
+        AS = TensorShape{k, m};
+    else
+        AS = TensorShape{m, k};
+    if (param.transposeB)
+        BS = TensorShape{n, k};
+    else
+        BS = TensorShape{k, n};
+    CS = TensorShape{m, n};
+    TensorLayout AL, BL, CL;
+    AL = TensorLayout(AS, dtype::Float32());
+    BL = TensorLayout(BS, dtype::Float32());
+    CL = TensorLayout(CS, dtype::Float32());
+    checker.set_param(param);
+    checker.execl({AL, BL, CL});
+}
 #if MEGDNN_X86_WITH_VNNI
 TEST_F(X86, MATRIX_MUL_VNNI_8X8X32) {
     matrix_mul::check_matrix_mul(
@@ -40,6 +68,7 @@ TEST_F(X86, MATRIX_MUL_MKLDNN_8X8X32) {
                 dtype::Int8{}, dtype::Int8{}, dtype::Int32{}, handle());
     }
 }
+
 #endif
 //! FIXME: need to add tests of GEMV and QUINT8
 TEST_F(X86, MATRIX_MUL_AVX2_8X8X32) {
diff --git a/dnn/test/x86/pooling.cpp b/dnn/test/x86/pooling.cpp
index 2b9fad46..9940995b 100644
--- a/dnn/test/x86/pooling.cpp
+++ b/dnn/test/x86/pooling.cpp
@@ -11,8 +11,8 @@
 #include "test/common/pooling.h"
 #include "test/common/benchmarker.h"
 #include "test/common/checker.h"
+#include "test/common/task_record_check.h"
 #include "test/x86/fixture.h"
-
 namespace megdnn {
 namespace test {
 
@@ -24,6 +24,14 @@ TEST_F(X86, POOLING) {
     }
 }
 
+TEST_F(X86, POOLING_RECORD) {
+    auto args = pooling::get_args();
+    for (auto&& arg : args) {
+        TaskRecordChecker checker(0);
+        checker.set_param(arg.param).exec(TensorShapeArray{arg.ishape, {}});
+    }
+}
+
 TEST_F(X86, S1POOLING88) {
     Checker checker(handle());
     auto run = [&](size_t WH, size_t WW, size_t PH, size_t PW, size_t SH, size_t SW,
@@ -100,6 +108,17 @@ TEST_F(X86, POOLING88) {
         checker.set_param(arg.param).exec(TensorShapeArray{arg.ishape, {}});
     }
 }
+TEST_F(X86, POOLING88_RECORD) {
+    TaskRecordChecker checker(0);
+    auto args = pooling::get_args();
+    for (auto&& arg : args) {
+        arg.ishape.ndim = 5;
+        arg.ishape[1] = (arg.ishape[1] + 7) / 8;
+        arg.ishape[4] = 8;
+        arg.param.format = param::Pooling::Format::NCHW88;
+        checker.set_param(arg.param).exec(TensorShapeArray{arg.ishape, {}});
+    }
+}
 TEST_F(X86_MULTI_THREADS, POOLING88) {
     Checker checker(handle());
     auto args = pooling::get_args();
@@ -215,6 +234,16 @@ TEST_F(X86, POOLING_INT8) {
         checker.set_param(arg.param).exec(TensorShapeArray{arg.ishape, {}});
     }
 }
+
+TEST_F(X86, POOLING_INT8_RECORD) {
+    auto args = pooling::get_args();
+    for (auto&& arg : args) {
+        Checker checker(handle());
+        auto rng = std::make_unique(-127, 127);
+        checker.set_dtype(0, dtype::Int8()).set_rng(0, rng.get());
+        checker.set_param(arg.param).exec(TensorShapeArray{arg.ishape, {}});
+    }
+}
 TEST_F(X86_MULTI_THREADS, POOLING_INT8) {
     auto args = pooling::get_args();
     for (auto&& arg : args) {
diff --git a/dnn/test/x86/resize.cpp b/dnn/test/x86/resize.cpp
index a282c81d..ec16b208 100644
--- a/dnn/test/x86/resize.cpp
+++ b/dnn/test/x86/resize.cpp
@@ -10,8 +10,8 @@
  */
 #include "test/common/resize.h"
 #include "test/common/checker.h"
+#include "test/common/task_record_check.h"
 #include "test/x86/fixture.h"
-
 namespace megdnn {
 namespace test {
 
@@ -35,6 +35,22 @@ TEST_F(X86, RESIZE_CV) {
                 .execs({arg.src, arg.dst});
     }
 }
+TEST_F(X86, RESIZE_CV_RECORD) {
+    using namespace resize;
+    std::vector args = get_cv_args();
+    TaskRecordChecker checker(0);
+    auto arg = args[0];
+    checker.set_param(arg.param)
+            .set_dtype(0, dtype::Uint8())
+            .set_dtype(1, dtype::Uint8())
+            .set_epsilon(1 + 1e-3)
+            .execs({arg.src, arg.dst});
+
+    checker.set_param(arg.param)
+            .set_dtype(0, dtype::Float32())
+            .set_dtype(1, dtype::Float32())
+            .execs({arg.src, arg.dst});
+}
 } // namespace test
 } // namespace megdnn
diff --git a/dnn/test/x86/separable_conv.cpp b/dnn/test/x86/separable_conv.cpp
index 5c583428..0d83caf8 100644
--- a/dnn/test/x86/separable_conv.cpp
+++ b/dnn/test/x86/separable_conv.cpp
@@ -10,8 +10,8 @@
  */
 #include "test/common/separable_conv.h"
 #include "test/common/checker.h"
+#include "test/common/task_record_check.h"
 #include "test/x86/fixture.h"
-
 namespace megdnn {
 namespace test {
 
@@ -24,6 +24,15 @@ TEST_F(X86, SEPARABLE_CONV) {
         checker.set_param(arg.param).execs({arg.src, arg.filter_x, arg.filter_y, {}});
     }
 }
+TEST_F(X86, SEPARABLE_CONV_RECORD) {
+    using namespace separable_conv;
+    std::vector args = get_args();
+    TaskRecordChecker checker(0);
+
+    for (auto&& arg : args) {
+        checker.set_param(arg.param).execs({arg.src, arg.filter_x, arg.filter_y, {}});
+    }
+}
 } // namespace test
 } // namespace megdnn
diff --git a/dnn/test/x86/separable_filter.cpp b/dnn/test/x86/separable_filter.cpp
index faca0026..c378b0c1 100644
--- a/dnn/test/x86/separable_filter.cpp
+++ b/dnn/test/x86/separable_filter.cpp
@@ -10,8 +10,8 @@
  */
 #include "test/common/separable_filter.h"
 #include "test/common/checker.h"
+#include "test/common/task_record_check.h"
 #include "test/x86/fixture.h"
-
 namespace megdnn {
 namespace test {
 
@@ -36,6 +36,23 @@ TEST_F(X86, SEPARABLE_FILTER) {
     }
 }
 
+TEST_F(X86, SEPARABLE_FILTER_RECORD) {
+    using namespace separable_filter;
+    std::vector args = get_args();
+    TaskRecordChecker checker(0);
+    auto arg = args[0];
+    ConstValue rng(2);
+    checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &rng);
+    checker.set_param(arg.param).execs({arg.src, arg.filter_x, arg.filter_y, {}});
+
+    checker.set_dtype(0, dtype::Uint8())
+            .set_dtype(1, dtype::Float32())
+            .set_dtype(2, dtype::Float32())
+            .set_dtype(3, dtype::Uint8())
+            .set_epsilon(1 + 1e-3);
+    checker.set_param(arg.param).execs({arg.src, arg.filter_x, arg.filter_y, {}});
+}
+
 } // namespace test
 } // namespace megdnn
 // vim: syntax=cpp.doxygen
diff --git a/dnn/test/x86/type_cvt.cpp b/dnn/test/x86/type_cvt.cpp
index bb3c5680..7350ac22 100644
--- a/dnn/test/x86/type_cvt.cpp
+++ b/dnn/test/x86/type_cvt.cpp
@@ -11,8 +11,8 @@
 #include "test/common/benchmarker.h"
 #include "test/common/checker.h"
+#include "test/common/task_record_check.h"
 #include "test/x86/fixture.h"
-
 namespace megdnn {
 namespace test {
 
@@ -55,6 +55,45 @@ TEST_F(X86, TYPE_CVT) {
     checker.exec(TensorLayoutArray{non_contig_src, non_contig_dst});
 }
 
+TEST_F(X86, TYPE_CVT_RECORD) {
+    TaskRecordChecker checker(0);
+    NormalRNG rng(0, 127);
+    checker.set_rng(0, &rng);
+
+    std::vector dtypes = {
+            dtype::Float32(),
+            dtype::Float16(),
+            dtype::Int32(),
+            dtype::Int16(),
+            dtype::Int8(),
+            dtype::Uint8(),
+            dtype::QuantizedS8(0.5f),
+            dtype::QuantizedS32(0.5f),
+            dtype::Quantized8Asymm(2.0f, static_cast(3))};
+
+    for (size_t size : {1, 7, 15, 33}) {
+        for (auto sdtype : dtypes)
+            for (auto ddtype : dtypes) {
+                checker.set_dtype(0, sdtype).set_dtype(1, ddtype).execs(
+                        {{size}, {size}});
+                TensorLayout non_contig_src(
+                        {1, 10, 10, 12}, {10 * 10 * 18, 10 * 18, 18, 1}, sdtype);
+                TensorLayout non_contig_dst({1, 10, 10, 12}, ddtype);
+                checker.exec(TensorLayoutArray{non_contig_src, non_contig_dst});
+            }
+    }
+
+    for (size_t size : {1, 7, 15, 33}) {
+        checker.set_dtype(0, dtype::Uint16())
+                .set_dtype(1, dtype::Float32())
+                .execs({{size}, {size}});
+    }
+    TensorLayout non_contig_src(
+            {1, 10, 10, 12}, {10 * 10 * 18, 10 * 18, 18, 1}, dtype::Uint16());
+    TensorLayout non_contig_dst({1, 10, 10, 12}, dtype::Float32());
+    checker.exec(TensorLayoutArray{non_contig_src, non_contig_dst});
+}
+
 TEST_F(X86, TYPE_CVT_NO_CONTIGUOUS) {
     UniformFloatRNG init(0, 100);
     Checker checker(handle());
diff --git a/dnn/test/x86/warp_perspective.cpp b/dnn/test/x86/warp_perspective.cpp
index 04ed7a11..482deebc 100644
--- a/dnn/test/x86/warp_perspective.cpp
+++ b/dnn/test/x86/warp_perspective.cpp
@@ -14,9 +14,9 @@
 #include "test/common/checker.h"
 #include "test/common/random_state.h"
 #include "test/common/rng.h"
+#include "test/common/task_record_check.h"
 #include "test/common/warp_affine.h"
 #include "test/common/warp_perspective.h"
-
 namespace megdnn {
 namespace test {
 
@@ -24,6 +24,24 @@ TEST_F(X86, WARP_PERSPECTIVE_MAT_IDX) {
     warp_perspective::run_mat_idx_test(handle());
 }
 
+TEST_F(X86, WARP_PERSPECTIVE_MAT_IDX_RECORD) {
+    constexpr int N_SRC = 5;
+    TaskRecordChecker checker(0);
+    WarpPerspectiveMatRNG mat_rng;
+    checker.set_rng(1, &mat_rng);
+
+    UniformIntRNG mat_idx_rng{0, N_SRC - 1};
+    checker.set_dtype(2, dtype::Int32());
+    checker.set_rng(2, &mat_idx_rng);
+
+    WarpPerspective::Param param;
+    param.bmode = WarpPerspective::Param::BorderMode::REFLECT;
+    param.imode = param::WarpPerspective::InterpolationMode::LINEAR;
+    checker.set_param(param);
+    checker.execs({{N_SRC, 3, 10, 11}, {2, 3, 3}, {2}, {2, 3, 11, 12}});
+    checker.execs({{N_SRC, 14, 17, 13}, {123, 3, 3}, {123}, {123, 14, 16, 15}});
+}
+
 TEST_F(X86_MULTI_THREADS, WARP_PERSPECTIVE_MAT_IDX) {
     warp_perspective::run_mat_idx_test(handle());
 }
@@ -51,6 +69,29 @@ TEST_F(X86_MULTI_THREADS, WARP_AFFINE_CV) {
     }
 }
 
+TEST_F(X86_MULTI_THREADS, WARP_AFFINE_CV_RECORD) {
+    using namespace warp_affine;
+    std::vector args = get_cv_args();
+    TaskRecordChecker checker(0);
+
+    for (auto&& arg : args) {
+        checker.set_param(arg.param)
+                .set_epsilon(1 + 1e-3)
+                .set_dtype(0, dtype::Uint8())
+                .set_dtype(1, dtype::Float32())
+                .set_dtype(2, dtype::Uint8())
+                .execs({arg.src, arg.trans, arg.dst});
+    }
+
+    for (auto&& arg : args) {
+        checker.set_param(arg.param)
+                .set_dtype(0, dtype::Float32())
+                .set_dtype(1, dtype::Float32())
+                .set_dtype(2, dtype::Float32())
+                .execs({arg.src, arg.trans, arg.dst});
+    }
+}
+
 #if MEGDNN_WITH_BENCHMARK
 namespace {
 template