GitOrigin-RevId: d384b87f50
tags/v1.6.0-rc1
@@ -198,6 +198,9 @@ public: | |||||
protected: | protected: | ||||
//! get origin coord | //! get origin coord | ||||
std::pair<float, int> get_origin_coord(float scale, int size, int idx); | std::pair<float, int> get_origin_coord(float scale, int size, int idx); | ||||
//! get nearest index in src | |||||
int get_nearest_src(float scale, int size, int idx); | |||||
void check_layout_fwd(const TensorLayout& src, const TensorLayout& dst); | void check_layout_fwd(const TensorLayout& src, const TensorLayout& dst); | ||||
}; | }; | ||||
@@ -6,9 +6,11 @@ | |||||
* | * | ||||
* Unless required by applicable law or agreed to in writing, | * Unless required by applicable law or agreed to in writing, | ||||
* software distributed under the License is distributed on an | * software distributed under the License is distributed on an | ||||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
* implied. | |||||
*/ | */ | ||||
#include "megdnn/handle.h" | |||||
#include "megdnn/oprs.h" | #include "megdnn/oprs.h" | ||||
#include "src/common/utils.h" | #include "src/common/utils.h" | ||||
@@ -26,8 +28,9 @@ void ResizeBase::check_layout_fwd(const TensorLayout& src, | |||||
errmsg().c_str()); | errmsg().c_str()); | ||||
if (param().format == Param::Format::NCHW) { | if (param().format == Param::Format::NCHW) { | ||||
megdnn_assert(dst.shape[1] == src.shape[1], "%s", errmsg().c_str()); | megdnn_assert(dst.shape[1] == src.shape[1], "%s", errmsg().c_str()); | ||||
megdnn_assert(param().imode == | |||||
param::Resize::InterpolationMode::INTER_LINEAR); | |||||
auto imode = param().imode; | |||||
megdnn_assert(imode == param::Resize::InterpolationMode::INTER_LINEAR || | |||||
imode == param::Resize::InterpolationMode::NEAREST); | |||||
} else if (param().format == Param::Format::NHWC) { | } else if (param().format == Param::Format::NHWC) { | ||||
megdnn_assert(dst.shape[3] == src.shape[3], "%s", errmsg().c_str()); | megdnn_assert(dst.shape[3] == src.shape[3], "%s", errmsg().c_str()); | ||||
} else if (param().format == Param::Format::NCHW4) { | } else if (param().format == Param::Format::NCHW4) { | ||||
@@ -79,6 +82,9 @@ std::pair<float, int> ResizeBase::get_origin_coord(float scale, int size, | |||||
return {alpha, origin_idx}; | return {alpha, origin_idx}; | ||||
} | } | ||||
int ResizeBase::get_nearest_src(float scale, int size, int idx) { | |||||
return std::min(static_cast<int>(idx / scale), size - 1); | |||||
} | |||||
} // namespace megdnn | } // namespace megdnn | ||||
// vim: syntax=cpp.doxygen | // vim: syntax=cpp.doxygen |
@@ -30,8 +30,9 @@ void ResizeBackwardImpl::exec(_megdnn_tensor_in diff, _megdnn_tensor_out grad, | |||||
size_t max_batch_size = max_batch_x_channel / C; | size_t max_batch_size = max_batch_x_channel / C; | ||||
while (N > 0) { | while (N > 0) { | ||||
size_t curr_batch_size = N > max_batch_size ? max_batch_size : N; | size_t curr_batch_size = N > max_batch_size ? max_batch_size : N; | ||||
resize::backward_data_proxy(diff_ptr, grad_ptr, curr_batch_size, C, IH, | |||||
IW, OH, OW, stream); | |||||
resize::backward_data_proxy(resize::get_imode(param().imode), diff_ptr, | |||||
grad_ptr, curr_batch_size, C, IH, IW, OH, | |||||
OW, stream); | |||||
if (N <= max_batch_size) { | if (N <= max_batch_size) { | ||||
break; | break; | ||||
@@ -17,9 +17,9 @@ namespace megdnn { | |||||
namespace cuda { | namespace cuda { | ||||
namespace resize { | namespace resize { | ||||
__global__ void resize_bwd_kernel(const float* hidden, float* dst, int N, int C, | |||||
int IH, int IW, int OH, int OW, float scale_h, | |||||
float scale_w) { | |||||
__global__ void resize_bwd_linear_kernel(const float* hidden, float* dst, int N, | |||||
int C, int IH, int IW, int OH, int OW, | |||||
float scale_h, float scale_w) { | |||||
int n = blockIdx.z; | int n = blockIdx.z; | ||||
int ow = blockIdx.x * blockDim.x + threadIdx.x; | int ow = blockIdx.x * blockDim.x + threadIdx.x; | ||||
int oh = blockIdx.y * blockDim.y + threadIdx.y; | int oh = blockIdx.y * blockDim.y + threadIdx.y; | ||||
@@ -51,8 +51,30 @@ __global__ void resize_bwd_kernel(const float* hidden, float* dst, int N, int C, | |||||
} | } | ||||
} | } | ||||
void backward_data_proxy(const float* diff, float* grad, int N, int C, int IH, | |||||
int IW, int OH, int OW, cudaStream_t stream) { | |||||
__global__ void resize_bwd_nearest_kernel(const float* hidden, float* dst, | |||||
int N, int C, int IH, int IW, int OH, | |||||
int OW, float scale_h, | |||||
float scale_w) { | |||||
int n = blockIdx.z; | |||||
int ow = blockIdx.x * blockDim.x + threadIdx.x; | |||||
int oh = blockIdx.y * blockDim.y + threadIdx.y; | |||||
hidden += n * C * OH * OW; | |||||
dst += n * C * IH * IW; | |||||
if (ow < OW && oh < OH) { | |||||
int ih = get_nearest_src(scale_h, IH, oh); | |||||
int iw = get_nearest_src(scale_w, IW, ow); | |||||
for (int c = 0; c < C; ++c) { | |||||
atomicAdd(dst + ih * IW + iw, | |||||
hidden[oh * OW + ow]); | |||||
hidden += OH * OW; | |||||
dst += IH * IW; | |||||
} | |||||
} | |||||
} | |||||
void backward_data_proxy(InterpolationMode imode, const float* diff, | |||||
float* grad, int N, int C, int IH, int IW, int OH, | |||||
int OW, cudaStream_t stream) { | |||||
const int BY = 16, BX = 32; | const int BY = 16, BX = 32; | ||||
{ | { | ||||
dim3 threads(BX, BY); | dim3 threads(BX, BY); | ||||
@@ -61,8 +83,14 @@ void backward_data_proxy(const float* diff, float* grad, int N, int C, int IH, | |||||
stream)); | stream)); | ||||
float scale_h = static_cast<float>(OH) / IH; | float scale_h = static_cast<float>(OH) / IH; | ||||
float scale_w = static_cast<float>(OW) / IW; | float scale_w = static_cast<float>(OW) / IW; | ||||
resize_bwd_kernel<<<blocks, threads, 0, stream>>>( | |||||
diff, grad, N, C, IH, IW, OH, OW, scale_h, scale_w); | |||||
if(imode == InterpolationMode::INTER_LINEAR) { | |||||
resize_bwd_linear_kernel<<<blocks, threads, 0, stream>>>( | |||||
diff, grad, N, C, IH, IW, OH, OW, scale_h, scale_w); | |||||
} | |||||
else if (imode == InterpolationMode::INTER_NEAREST) { | |||||
resize_bwd_nearest_kernel<<<blocks, threads, 0, stream>>>( | |||||
diff, grad, N, C, IH, IW, OH, OW, scale_h, scale_w); | |||||
} | |||||
} | } | ||||
after_kernel_launch(); | after_kernel_launch(); | ||||
} | } | ||||
@@ -28,6 +28,10 @@ __device__ inline void get_origin_coord(float scale, int size, int idx, | |||||
} | } | ||||
} | } | ||||
__device__ inline int get_nearest_src(float scale, int size, int idx) { | |||||
return min(static_cast<int>(idx / scale), size - 1); | |||||
} | |||||
} // namespace resize | } // namespace resize | ||||
} // namespace cuda | } // namespace cuda | ||||
} // namespace megdnn | } // namespace megdnn | ||||
@@ -20,16 +20,17 @@ namespace resize { | |||||
// all these kernels use bilinear interpolation | // all these kernels use bilinear interpolation | ||||
template <typename ctype> | template <typename ctype> | ||||
void forward_proxy(bool is_nhwc, const ctype* src, ctype* dst, int N, int C, | |||||
int IH, int IW, int OH, int OW, int S_IN, int S_IC, int S_IH, | |||||
int S_IW, cudaStream_t stream); | |||||
void forward_proxy(bool is_nhwc, InterpolationMode imode, const ctype* src, | |||||
ctype* dst, int N, int C, int IH, int IW, int OH, int OW, | |||||
int S_IN, int S_IC, int S_IH, int S_IW, cudaStream_t stream); | |||||
template <typename ctype> | template <typename ctype> | ||||
void forward_proxy_nchw4(const ctype* src, ctype* dst, int N, int C, int IH, | void forward_proxy_nchw4(const ctype* src, ctype* dst, int N, int C, int IH, | ||||
int IW, int OH, int OW, cudaStream_t stream); | int IW, int OH, int OW, cudaStream_t stream); | ||||
void backward_data_proxy(const float* diff, float* grad, int N, int C, int IH, | |||||
int IW, int OH, int OW, cudaStream_t stream); | |||||
void backward_data_proxy(InterpolationMode imode, const float* diff, | |||||
float* grad, int N, int C, int IH, int IW, int OH, | |||||
int OW, cudaStream_t stream); | |||||
} // namespace resize | } // namespace resize | ||||
} // namespace cuda | } // namespace cuda | ||||
@@ -9,6 +9,7 @@ | |||||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
*/ | */ | ||||
#include "src/common/cv/common.h" | #include "src/common/cv/common.h" | ||||
#include "src/common/cv/enums.h" | |||||
#include "src/cuda/handle.h" | #include "src/cuda/handle.h" | ||||
#include "src/cuda/resize/common.h" | #include "src/cuda/resize/common.h" | ||||
#include "src/cuda/resize/helper.h" | #include "src/cuda/resize/helper.h" | ||||
@@ -146,19 +147,23 @@ void ResizeImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in dst, | |||||
C, IH, IW, OH, OW, stream); | C, IH, IW, OH, OW, stream); | ||||
return; | return; | ||||
} | } | ||||
megdnn_assert(param().imode == Param::InterpolationMode::LINEAR, | |||||
megdnn_assert(param().imode == Param::InterpolationMode::LINEAR || | |||||
param().imode == Param::InterpolationMode::NEAREST, | |||||
"unsupported interpolation mode for NCHW format"); | "unsupported interpolation mode for NCHW format"); | ||||
if (src.layout.dtype == dtype::Float32{}) { | if (src.layout.dtype == dtype::Float32{}) { | ||||
resize::forward_proxy(is_nhwc, src.ptr<dt_float32>(), | |||||
dst.ptr<dt_float32>(), src.layout[0], C, IH, IW, | |||||
OH, OW, S_IN, S_IC, S_IH, S_IW, stream); | |||||
resize::forward_proxy(is_nhwc, resize::get_imode((param().imode)), | |||||
src.ptr<dt_float32>(), dst.ptr<dt_float32>(), | |||||
src.layout[0], C, IH, IW, OH, OW, S_IN, S_IC, | |||||
S_IH, S_IW, stream); | |||||
} else if (src.layout.dtype == dtype::Uint8()) { | } else if (src.layout.dtype == dtype::Uint8()) { | ||||
resize::forward_proxy(is_nhwc, src.ptr<dt_uint8>(), dst.ptr<dt_uint8>(), | |||||
resize::forward_proxy(is_nhwc, resize::get_imode((param().imode)), | |||||
src.ptr<dt_uint8>(), dst.ptr<dt_uint8>(), | |||||
src.layout[0], C, IH, IW, OH, OW, S_IN, S_IC, | src.layout[0], C, IH, IW, OH, OW, S_IN, S_IC, | ||||
S_IH, S_IW, stream); | S_IH, S_IW, stream); | ||||
} else if (src.layout.dtype == dtype::Int8()) { | } else if (src.layout.dtype == dtype::Int8()) { | ||||
resize::forward_proxy(is_nhwc, src.ptr<dt_int8>(), dst.ptr<dt_int8>(), | |||||
resize::forward_proxy(is_nhwc, resize::get_imode((param().imode)), | |||||
src.ptr<dt_int8>(), dst.ptr<dt_int8>(), | |||||
src.layout[0], C, IH, IW, OH, OW, S_IN, S_IC, | src.layout[0], C, IH, IW, OH, OW, S_IN, S_IC, | ||||
S_IH, S_IW, stream); | S_IH, S_IW, stream); | ||||
} else { | } else { | ||||
@@ -32,9 +32,10 @@ struct DirectSrcVisitor { | |||||
}; | }; | ||||
template <typename ctype, typename SrcVisitor, typename OutputConverter> | template <typename ctype, typename SrcVisitor, typename OutputConverter> | ||||
__global__ void kern_general(SrcVisitor src, ctype* __restrict dst, int C, | |||||
int IH, int IW, int OH, int OW, int S_IN, int S_IC, | |||||
int S_IH, int S_IW, float scale_h, float scale_w) { | |||||
__global__ void kern_general_linear(SrcVisitor src, ctype* __restrict dst, | |||||
int C, int IH, int IW, int OH, int OW, | |||||
int S_IN, int S_IC, int S_IH, int S_IW, | |||||
float scale_h, float scale_w) { | |||||
OutputConverter output_converter; | OutputConverter output_converter; | ||||
int ow = blockIdx.x * blockDim.x + threadIdx.x; | int ow = blockIdx.x * blockDim.x + threadIdx.x; | ||||
int oh = blockIdx.y * blockDim.y + threadIdx.y; | int oh = blockIdx.y * blockDim.y + threadIdx.y; | ||||
@@ -65,6 +66,31 @@ __global__ void kern_general(SrcVisitor src, ctype* __restrict dst, int C, | |||||
} | } | ||||
template <typename ctype, typename SrcVisitor, typename OutputConverter> | template <typename ctype, typename SrcVisitor, typename OutputConverter> | ||||
__global__ void kern_general_nearest(SrcVisitor src, ctype* __restrict dst, | |||||
int C, int IH, int IW, int OH, int OW, | |||||
int S_IN, int S_IC, int S_IH, int S_IW, | |||||
float scale_h, float scale_w) { | |||||
OutputConverter output_converter; | |||||
int ow = blockIdx.x * blockDim.x + threadIdx.x; | |||||
int oh = blockIdx.y * blockDim.y + threadIdx.y; | |||||
const ctype* __restrict sptr = src.get(blockIdx.z, S_IN); | |||||
dst += blockIdx.z * C * OH * OW; | |||||
if (ow < OW && oh < OH) { | |||||
int ih = get_nearest_src(scale_h, IH, oh); | |||||
int iw = get_nearest_src(scale_w, IW, ow); | |||||
for (int c = 0; c < C; ++c) { | |||||
dst[oh * OW + ow] = output_converter( | |||||
sptr[ih * S_IH + iw * S_IW]); | |||||
sptr += S_IC; | |||||
dst += OH * OW; | |||||
} | |||||
} | |||||
} | |||||
template <typename ctype, typename SrcVisitor, typename OutputConverter> | |||||
__global__ void kern_general_nhwc(SrcVisitor src, ctype* __restrict dst, int C, | __global__ void kern_general_nhwc(SrcVisitor src, ctype* __restrict dst, int C, | ||||
int IH, int IW, int OH, int OW, float scale_h, | int IH, int IW, int OH, int OW, float scale_h, | ||||
float scale_w) { | float scale_w) { | ||||
@@ -94,9 +120,10 @@ __global__ void kern_general_nhwc(SrcVisitor src, ctype* __restrict dst, int C, | |||||
} | } | ||||
template <typename ctype, typename SrcVisitor> | template <typename ctype, typename SrcVisitor> | ||||
void dispatch_with_visitor(bool is_nhwc, SrcVisitor src, ctype* dst, int N, | |||||
int C, int IH, int IW, int OH, int OW, int S_IN, | |||||
int S_IC, int S_IH, int S_IW, cudaStream_t stream) { | |||||
void dispatch_with_visitor(bool is_nhwc, InterpolationMode imode, | |||||
SrcVisitor src, ctype* dst, int N, int C, int IH, | |||||
int IW, int OH, int OW, int S_IN, int S_IC, int S_IH, | |||||
int S_IW, cudaStream_t stream) { | |||||
const int BY = 16, BX = 32; | const int BY = 16, BX = 32; | ||||
const int max_batch_size = 65535; | const int max_batch_size = 65535; | ||||
@@ -113,10 +140,19 @@ void dispatch_with_visitor(bool is_nhwc, SrcVisitor src, ctype* dst, int N, | |||||
<<<blocks, threads, 0, stream>>>(src, dst, C, IH, IW, OH, | <<<blocks, threads, 0, stream>>>(src, dst, C, IH, IW, OH, | ||||
OW, scale_h, scale_w); | OW, scale_h, scale_w); | ||||
} else { | } else { | ||||
kern_general<ctype, SrcVisitor, rounding::RoundingConverter<ctype>> | |||||
<<<blocks, threads, 0, stream>>>(src, dst, C, IH, IW, OH, | |||||
OW, S_IN, S_IC, S_IH, S_IW, | |||||
scale_h, scale_w); | |||||
if (imode == InterpolationMode::INTER_LINEAR) { | |||||
kern_general_linear<ctype, SrcVisitor, | |||||
rounding::RoundingConverter<ctype>> | |||||
<<<blocks, threads, 0, stream>>>( | |||||
src, dst, C, IH, IW, OH, OW, S_IN, S_IC, S_IH, | |||||
S_IW, scale_h, scale_w); | |||||
} else if (imode == InterpolationMode::INTER_NEAREST) { | |||||
kern_general_nearest<ctype, SrcVisitor, | |||||
rounding::RoundingConverter<ctype>> | |||||
<<<blocks, threads, 0, stream>>>( | |||||
src, dst, C, IH, IW, OH, OW, S_IN, S_IC, S_IH, | |||||
S_IW, scale_h, scale_w); | |||||
} | |||||
} | } | ||||
N -= curr_batch_size; | N -= curr_batch_size; | ||||
src.move_batch(curr_batch_size, C * IH * IW); | src.move_batch(curr_batch_size, C * IH * IW); | ||||
@@ -194,13 +230,14 @@ namespace cuda { | |||||
namespace resize { | namespace resize { | ||||
template <typename ctype> | template <typename ctype> | ||||
void forward_proxy(bool is_nhwc, const ctype* src, ctype* dst, int N, int C, | |||||
int IH, int IW, int OH, int OW, int S_IN, int S_IC, int S_IH, | |||||
int S_IW, cudaStream_t stream) { | |||||
void forward_proxy(bool is_nhwc, InterpolationMode imode, const ctype* src, | |||||
ctype* dst, int N, int C, int IH, int IW, int OH, int OW, | |||||
int S_IN, int S_IC, int S_IH, int S_IW, | |||||
cudaStream_t stream) { | |||||
DirectSrcVisitor<ctype> visitor; | DirectSrcVisitor<ctype> visitor; | ||||
visitor.ptr = src; | visitor.ptr = src; | ||||
dispatch_with_visitor(is_nhwc, visitor, dst, N, C, IH, IW, OH, OW, S_IN, | |||||
S_IC, S_IH, S_IW, stream); | |||||
dispatch_with_visitor(is_nhwc, imode, visitor, dst, N, C, IH, IW, OH, OW, | |||||
S_IN, S_IC, S_IH, S_IW, stream); | |||||
after_kernel_launch(); | after_kernel_launch(); | ||||
} | } | ||||
@@ -214,7 +251,7 @@ void forward_proxy_nchw4(const ctype* src, ctype* dst, int N, int C, int IH, | |||||
} | } | ||||
#define INST(ctype) \ | #define INST(ctype) \ | ||||
template void forward_proxy(bool, const ctype*, ctype*, int, int, int, \ | |||||
template void forward_proxy(bool, InterpolationMode, const ctype*, ctype*, int, int, int, \ | |||||
int, int, int, int, int, int, int, \ | int, int, int, int, int, int, int, \ | ||||
cudaStream_t); | cudaStream_t); | ||||
INST(float) | INST(float) | ||||
@@ -116,7 +116,9 @@ void ResizeImpl::kern_fallback_nhwc(const KernParam<ctype>& kern_param) { | |||||
void ResizeImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in dst, | void ResizeImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in dst, | ||||
_megdnn_workspace workspace) { | _megdnn_workspace workspace) { | ||||
check_exec(src.layout, dst.layout, workspace.size); | check_exec(src.layout, dst.layout, workspace.size); | ||||
if (param().format == param::Resize::Format::NCHW4) { | |||||
if (param().format == param::Resize::Format::NCHW4 || | |||||
(param().format == param::Resize::Format::NCHW && | |||||
param().imode == param::Resize::InterpolationMode::NEAREST)) { | |||||
naive::ResizeImpl::exec(src, dst, workspace); | naive::ResizeImpl::exec(src, dst, workspace); | ||||
return; | return; | ||||
} | } | ||||
@@ -10,12 +10,14 @@ | |||||
*/ | */ | ||||
#include "src/common/rounding_converter.cuh" | #include "src/common/rounding_converter.cuh" | ||||
#include "src/common/utils.cuh" | |||||
#include "src/naive/handle.h" | #include "src/naive/handle.h" | ||||
#include "src/naive/resize/opr_impl.h" | #include "src/naive/resize/opr_impl.h" | ||||
#include "src/naive/resize/resize_cv.h" | #include "src/naive/resize/resize_cv.h" | ||||
#include "midout.h" | #include "midout.h" | ||||
MIDOUT_DECL(megdnn_naive_resize_layout) | MIDOUT_DECL(megdnn_naive_resize_layout) | ||||
MIDOUT_DECL(megdnn_naive_resize_layout_nearest) | |||||
using namespace megdnn; | using namespace megdnn; | ||||
using namespace naive; | using namespace naive; | ||||
@@ -86,6 +88,28 @@ INST(dt_qint8); | |||||
INST(dt_quint8); | INST(dt_quint8); | ||||
#undef INST | #undef INST | ||||
template <typename ctype> | |||||
void ResizeImpl::kern_nchw_nearest (const KernParam<ctype>& kern_param) { | |||||
megdnn_assert(kern_param.format == Format::NCHW); | |||||
UNPACK_RESIZE_FWD_KERN_PARAM_WITH_STRIDE(kern_param); | |||||
float scale_h = static_cast<float>(OH) / IH; | |||||
float scale_w = static_cast<float>(OW) / IW; | |||||
rep(n, N) { | |||||
rep(oh, OH) rep(ow, OW) { | |||||
auto ih = get_nearest_src(scale_h, IH, oh); | |||||
auto iw = get_nearest_src(scale_w, IW, ow); | |||||
rep(c, static_cast<int>(C)) { | |||||
dptr[c * OH * OW + oh * OW + ow] = sptr[c * S_IC + ih * S_IH + iw * S_IW]; | |||||
} | |||||
} | |||||
sptr += S_IN; | |||||
dptr += C * OH * OW; | |||||
} | |||||
} | |||||
template <typename ctype> | template <typename ctype> | ||||
void ResizeImpl::kern_naive(const KernParam<ctype>& kern_param) { | void ResizeImpl::kern_naive(const KernParam<ctype>& kern_param) { | ||||
if (kern_param.format == Format::NHWC) { | if (kern_param.format == Format::NHWC) { | ||||
@@ -266,6 +290,39 @@ void ResizeImpl::kern_naive_nchw4(const KernParam<ctype>& kern_param) { | |||||
void ResizeImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in dst, | void ResizeImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in dst, | ||||
_megdnn_workspace workspace) { | _megdnn_workspace workspace) { | ||||
check_exec(src.layout, dst.layout, workspace.size); | check_exec(src.layout, dst.layout, workspace.size); | ||||
if (param().format == param::Resize::Format::NCHW && | |||||
param().imode == param::Resize::InterpolationMode::NEAREST) { | |||||
#define cb(dt, ct, _midout_iv) \ | |||||
case DTypeTrait<dt>::enumv: { \ | |||||
MIDOUT_BEGIN(megdnn_naive_resize_layout_nearest, \ | |||||
midout_iv(_midout_iv)) { \ | |||||
auto kparam = KernParam<ct>::from_tensors(param().format, src, \ | |||||
dst, workspace); \ | |||||
MEGDNN_DISPATCH_CPU_KERN_OPR(kern_nchw_nearest(kparam)); \ | |||||
} \ | |||||
MIDOUT_END(); \ | |||||
return; \ | |||||
} | |||||
switch (src.layout.dtype.enumv()) { | |||||
cb(dtype::Float32, float, 0); | |||||
DNN_INC_FLOAT16(cb(dtype::Float16, dt_float16, 1)); | |||||
cb(dtype::Int8, int8_t, 2); | |||||
cb(dtype::QuantizedS8, int8_t, 3); | |||||
cb(dtype::Uint8, uint8_t, 4); | |||||
cb(dtype::Quantized8Asymm, uint8_t, 5); | |||||
default: | |||||
megdnn_throw(ssprintf("Unsupported input DType in Resize " | |||||
"NEAREST mode: %s", | |||||
src.layout.dtype.name()) | |||||
.c_str()); | |||||
return; | |||||
} | |||||
#undef cb | |||||
#undef cb | |||||
} | |||||
if ((param().format == param::Resize::Format::NCHW || | if ((param().format == param::Resize::Format::NCHW || | ||||
(src.layout[3] != 1 && src.layout[3] != 3) || | (src.layout[3] != 1 && src.layout[3] != 3) || | ||||
!is_nhwc_contig_wc(src.layout)) || | !is_nhwc_contig_wc(src.layout)) || | ||||
@@ -306,8 +363,8 @@ void ResizeImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in dst, | |||||
void ResizeBackwardImpl::exec(_megdnn_tensor_in diff, _megdnn_tensor_out grad, | void ResizeBackwardImpl::exec(_megdnn_tensor_in diff, _megdnn_tensor_out grad, | ||||
_megdnn_workspace workspace) { | _megdnn_workspace workspace) { | ||||
check_exec(diff.layout, grad.layout, workspace.size); | check_exec(diff.layout, grad.layout, workspace.size); | ||||
megdnn_assert(param().format == param::WarpPerspective::Format::NCHW, | |||||
"invalid warp_perspective format"); | |||||
megdnn_assert(param().format == param::Resize::Format::NCHW, | |||||
"invalid resize format"); | |||||
const int N = grad.layout.shape[0], C = grad.layout.shape[1], | const int N = grad.layout.shape[0], C = grad.layout.shape[1], | ||||
IH = grad.layout.shape[2], IW = grad.layout.shape[3]; | IH = grad.layout.shape[2], IW = grad.layout.shape[3]; | ||||
const int OH = diff.layout.shape[2], OW = diff.layout.shape[3]; | const int OH = diff.layout.shape[2], OW = diff.layout.shape[3]; | ||||
@@ -321,28 +378,37 @@ void ResizeBackwardImpl::exec(_megdnn_tensor_in diff, _megdnn_tensor_out grad, | |||||
std::memset(sptr, 0, sizeof(float) * N * C * IH * IW); | std::memset(sptr, 0, sizeof(float) * N * C * IH * IW); | ||||
rep(n, N) { | rep(n, N) { | ||||
rep(oh, OH) rep(ow, OW) { | rep(oh, OH) rep(ow, OW) { | ||||
auto coord_h = get_origin_coord(scale_h, IH, oh); | |||||
auto coord_w = get_origin_coord(scale_w, IW, ow); | |||||
float alphah = coord_h.first; | |||||
float alphaw = coord_w.first; | |||||
int ih0 = coord_h.second; | |||||
int ih1 = ih0 + 1; | |||||
int iw0 = coord_w.second; | |||||
int iw1 = iw0 + 1; | |||||
rep(c, C) { | |||||
float hidden = hptr[c * OH * OW + oh * OW + ow]; | |||||
sptr[c * IH * IW + ih0 * IW + iw0] += | |||||
(1.0f - alphaw) * (1.0f - alphah) * hidden; | |||||
sptr[c * IH * IW + ih1 * IW + iw0] += | |||||
(1.0f - alphaw) * alphah * hidden; | |||||
sptr[c * IH * IW + ih0 * IW + iw1] += | |||||
alphaw * (1.0f - alphah) * hidden; | |||||
sptr[c * IH * IW + ih1 * IW + iw1] += | |||||
alphaw * alphah * hidden; | |||||
if(param().imode == InterpolationMode::INTER_LINEAR) { | |||||
auto coord_h = get_origin_coord(scale_h, IH, oh); | |||||
auto coord_w = get_origin_coord(scale_w, IW, ow); | |||||
float alphah = coord_h.first; | |||||
float alphaw = coord_w.first; | |||||
int ih0 = coord_h.second; | |||||
int ih1 = ih0 + 1; | |||||
int iw0 = coord_w.second; | |||||
int iw1 = iw0 + 1; | |||||
rep(c, C) { | |||||
float hidden = hptr[c * OH * OW + oh * OW + ow]; | |||||
sptr[c * IH * IW + ih0 * IW + iw0] += | |||||
(1.0f - alphaw) * (1.0f - alphah) * hidden; | |||||
sptr[c * IH * IW + ih1 * IW + iw0] += | |||||
(1.0f - alphaw) * alphah * hidden; | |||||
sptr[c * IH * IW + ih0 * IW + iw1] += | |||||
alphaw * (1.0f - alphah) * hidden; | |||||
sptr[c * IH * IW + ih1 * IW + iw1] += | |||||
alphaw * alphah * hidden; | |||||
} | |||||
} else if (param().imode == InterpolationMode::NEAREST) { | |||||
auto ih = get_nearest_src(scale_h, IH, oh); | |||||
auto iw = get_nearest_src(scale_w, IW, ow); | |||||
rep(c, static_cast<int>(C)) { | |||||
sptr[c * IH * IW + ih * IW + iw] += hptr[c * OH * OW + oh * OW + ow]; | |||||
} | |||||
} | } | ||||
else megdnn_throw("unsupported mode in ResizeBackwardImpl"); | |||||
} | } | ||||
sptr += C * IH * IW; | sptr += C * IH * IW; | ||||
hptr += C * OH * OW; | hptr += C * OH * OW; | ||||
@@ -47,6 +47,9 @@ private: | |||||
void kern_naive(const KernParam<ctype>& kern_param); | void kern_naive(const KernParam<ctype>& kern_param); | ||||
template <typename ctype> | template <typename ctype> | ||||
void kern_nchw_nearest(const KernParam<ctype>& kern_param); | |||||
template <typename ctype> | |||||
void kern_naive_nhwc(const KernParam<ctype>& kern_param); | void kern_naive_nhwc(const KernParam<ctype>& kern_param); | ||||
template <typename ctype> | template <typename ctype> | ||||
@@ -18,6 +18,8 @@ namespace megdnn { | |||||
namespace test { | namespace test { | ||||
namespace resize { | namespace resize { | ||||
using IMode = param::Resize::InterpolationMode; | |||||
struct TestArg { | struct TestArg { | ||||
param::Resize param; | param::Resize param; | ||||
TensorShape src; | TensorShape src; | ||||
@@ -62,17 +64,18 @@ static void set_nchw_args(std::vector<TestArg>& args) { | |||||
args.emplace_back(param, TensorShape{1, 2, 6, 8}, TensorShape{1, 2, 3, 4}); | args.emplace_back(param, TensorShape{1, 2, 6, 8}, TensorShape{1, 2, 3, 4}); | ||||
} | } | ||||
static inline std::vector<TestArg> get_args() { | |||||
static inline std::vector<TestArg> get_args(IMode imode = IMode::INTER_LINEAR) { | |||||
std::vector<TestArg> args; | std::vector<TestArg> args; | ||||
set_nchw_args(args); | set_nchw_args(args); | ||||
if(imode == IMode::INTER_LINEAR) { | |||||
//! test NHWC with ch != 1 or ch != 3 | //! test NHWC with ch != 1 or ch != 3 | ||||
param::Resize param; | |||||
param.format = param::Resize::Format::NHWC; | |||||
param.imode = param::Resize::InterpolationMode::LINEAR; | |||||
args.emplace_back(param, TensorShape{2, 2, 3, 4}, TensorShape{2, 4, 6, 4}); | |||||
args.emplace_back(param, TensorShape{2, 4, 6, 4}, TensorShape{2, 2, 3, 4}); | |||||
param::Resize param; | |||||
param.format = param::Resize::Format::NHWC; | |||||
param.imode = imode; | |||||
args.emplace_back(param, TensorShape{2, 2, 3, 4}, TensorShape{2, 4, 6, 4}); | |||||
args.emplace_back(param, TensorShape{2, 4, 6, 4}, TensorShape{2, 2, 3, 4}); | |||||
} | |||||
return args; | return args; | ||||
} | } | ||||
@@ -9,6 +9,7 @@ | |||||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
*/ | */ | ||||
#include "test/common/resize.h" | #include "test/common/resize.h" | ||||
#include "src/common/cv/enums.h" | |||||
#include "test/common/benchmarker.h" | #include "test/common/benchmarker.h" | ||||
#include "test/common/checker.h" | #include "test/common/checker.h" | ||||
#include "test/cuda/fixture.h" | #include "test/cuda/fixture.h" | ||||
@@ -42,30 +43,33 @@ TEST_F(CUDA, RESIZE_CV) { | |||||
TEST_F(CUDA, RESIZE_FORWARD) { | TEST_F(CUDA, RESIZE_FORWARD) { | ||||
using namespace resize; | using namespace resize; | ||||
std::vector<TestArg> args = get_args(); | |||||
Checker<Resize> checker(handle_cuda()); | |||||
for (auto&& arg : args) { | |||||
checker.set_param(arg.param) | |||||
.set_dtype(0, dtype::Uint8()) | |||||
.set_dtype(1, dtype::Uint8()) | |||||
.execs({arg.src, arg.dst}); | |||||
} | |||||
for (auto&& arg : args) { | |||||
checker.set_param(arg.param) | |||||
.set_dtype(0, dtype::Float32()) | |||||
.set_dtype(1, dtype::Float32()) | |||||
.set_epsilon(1e-3) | |||||
.execs({arg.src, arg.dst}); | |||||
} | |||||
for (auto&& arg : args) { | |||||
checker.set_param(arg.param) | |||||
.set_dtype(0, dtype::Int8()) | |||||
.set_dtype(1, dtype::Int8()) | |||||
.set_epsilon(1e-3) | |||||
.execs({arg.src, arg.dst}); | |||||
IMode modes[2] = {IMode::INTER_LINEAR, IMode::NEAREST}; | |||||
for (auto imode : modes) { | |||||
std::vector<TestArg> args = get_args(imode); | |||||
Checker<Resize> checker(handle_cuda()); | |||||
for (auto&& arg : args) { | |||||
checker.set_param(arg.param) | |||||
.set_dtype(0, dtype::Uint8()) | |||||
.set_dtype(1, dtype::Uint8()) | |||||
.execs({arg.src, arg.dst}); | |||||
} | |||||
for (auto&& arg : args) { | |||||
checker.set_param(arg.param) | |||||
.set_dtype(0, dtype::Float32()) | |||||
.set_dtype(1, dtype::Float32()) | |||||
.set_epsilon(1e-3) | |||||
.execs({arg.src, arg.dst}); | |||||
} | |||||
for (auto&& arg : args) { | |||||
checker.set_param(arg.param) | |||||
.set_dtype(0, dtype::Int8()) | |||||
.set_dtype(1, dtype::Int8()) | |||||
.set_epsilon(1e-3) | |||||
.execs({arg.src, arg.dst}); | |||||
} | |||||
} | } | ||||
} | } | ||||
@@ -84,42 +88,48 @@ TEST_F(CUDA, RESIZE_NCHW4) { | |||||
} | } | ||||
TEST_F(CUDA, RESIZE_NCHW_WITH_STRIDE) { | TEST_F(CUDA, RESIZE_NCHW_WITH_STRIDE) { | ||||
param::Resize param; | |||||
param.format = param::Resize::Format::NCHW; | |||||
param.imode = param::Resize::InterpolationMode::LINEAR; | |||||
Checker<Resize> checker(handle_cuda()); | |||||
checker.set_epsilon(1 + 1e-3) | |||||
.set_param(param); | |||||
auto run = [&](TensorShape src_shape, std::vector<ptrdiff_t> src_layout, | |||||
TensorShape dst_shape, DType dtype) { | |||||
checker.set_dtype(0, dtype) | |||||
.set_dtype(1, dtype) | |||||
.execl({{src_shape, src_layout, dtype}, {dst_shape, dtype}}); | |||||
}; | |||||
for (DType& dtype : std::vector<DType>{dtype::Float32(), dtype::Uint8(), | |||||
dtype::Int8()}) { | |||||
run({2, 3, 4, 4}, {256, 32, 8, 1}, {2, 3, 3, 3}, dtype); | |||||
run({1, 3, 4, 3}, {105, 35, 7, 2}, {1, 3, 5, 5}, dtype); | |||||
run({1, 3, 40, 40}, {25600, 3200, 80, 1}, {1, 3, 30, 30}, dtype); | |||||
run({2, 3, 4, 4}, {-256, 32, -8, 1}, {2, 3, 3, 3}, dtype); | |||||
run({2, 3, 4, 4}, {256, -32, 8, -1}, {2, 3, 3, 3}, dtype); | |||||
run({2, 3, 4, 4}, {-256, -32, -8, -1}, {2, 3, 3, 3}, dtype); | |||||
IMode modes[2] = {IMode::INTER_LINEAR, IMode::NEAREST}; | |||||
for (auto imode : modes) { | |||||
param::Resize param; | |||||
param.format = param::Resize::Format::NCHW; | |||||
param.imode = imode; | |||||
Checker<Resize> checker(handle_cuda()); | |||||
checker.set_epsilon(1 + 1e-3) | |||||
.set_param(param); | |||||
auto run = [&](TensorShape src_shape, std::vector<ptrdiff_t> src_layout, | |||||
TensorShape dst_shape, DType dtype) { | |||||
checker.set_dtype(0, dtype) | |||||
.set_dtype(1, dtype) | |||||
.execl({{src_shape, src_layout, dtype}, {dst_shape, dtype}}); | |||||
}; | |||||
for (DType& dtype : std::vector<DType>{dtype::Float32(), dtype::Uint8(), | |||||
dtype::Int8()}) { | |||||
run({2, 3, 4, 4}, {256, 32, 8, 1}, {2, 3, 3, 3}, dtype); | |||||
run({1, 3, 4, 3}, {105, 35, 7, 2}, {1, 3, 5, 5}, dtype); | |||||
run({1, 3, 40, 40}, {25600, 3200, 80, 1}, {1, 3, 30, 30}, dtype); | |||||
run({2, 3, 4, 4}, {-256, 32, -8, 1}, {2, 3, 3, 3}, dtype); | |||||
run({2, 3, 4, 4}, {256, -32, 8, -1}, {2, 3, 3, 3}, dtype); | |||||
run({2, 3, 4, 4}, {-256, -32, -8, -1}, {2, 3, 3, 3}, dtype); | |||||
} | |||||
} | } | ||||
} | } | ||||
TEST_F(CUDA, RESIZE_BACKWARD) { | TEST_F(CUDA, RESIZE_BACKWARD) { | ||||
Checker<ResizeBackward> checker(handle_cuda()); | |||||
param::Resize param; | |||||
param.format = param::Resize::Format::NCHW; | |||||
param.imode = param::Resize::InterpolationMode::LINEAR; | |||||
checker.set_param(param); | |||||
checker.execs({{2, 3, 4, 5}, {2, 3, 8, 9}}); | |||||
checker.execs({{2, 5, 8, 9}, {2, 5, 4, 5}}); | |||||
checker.execs({{2, 5, 8, 5}, {2, 5, 4, 9}}); | |||||
checker.execs({{2, 5, 4, 9}, {2, 5, 8, 5}}); | |||||
IMode modes[2] = {IMode::INTER_LINEAR, IMode::NEAREST}; | |||||
for (auto imode : modes) { | |||||
Checker<ResizeBackward> checker(handle_cuda()); | |||||
param::Resize param; | |||||
param.format = param::Resize::Format::NCHW; | |||||
param.imode = imode; | |||||
checker.set_param(param); | |||||
checker.execs({{2, 3, 4, 5}, {2, 3, 8, 9}}); | |||||
checker.execs({{2, 5, 8, 9}, {2, 5, 4, 5}}); | |||||
checker.execs({{2, 5, 8, 5}, {2, 5, 4, 9}}); | |||||
checker.execs({{2, 5, 4, 9}, {2, 5, 8, 5}}); | |||||
} | |||||
} | } | ||||
#if MEGDNN_WITH_BENCHMARK | #if MEGDNN_WITH_BENCHMARK | ||||
@@ -522,29 +522,13 @@ def interpolate( | |||||
if align_corners is None: | if align_corners is None: | ||||
align_corners = False | align_corners = False | ||||
if ( | |||||
size is not None | |||||
and scale_factor is None | |||||
and not align_corners | |||||
and mode == "bilinear" | |||||
and inp.ndim in [4, 5] | |||||
): | |||||
# fastpath for interpolate | |||||
op = builtin.Resize(imode="linear", format="NCHW") | |||||
shape = astensor1d(size, inp, dtype="int32", device=inp.device) | |||||
(result,) = apply(op, inp, shape) | |||||
return result | |||||
if mode == "linear": | if mode == "linear": | ||||
inp = expand_dims(inp, 3) | inp = expand_dims(inp, 3) | ||||
if inp.ndim != 4: | if inp.ndim != 4: | ||||
raise ValueError("shape of input tensor must correspond to the operartion mode") | raise ValueError("shape of input tensor must correspond to the operartion mode") | ||||
if size is None: | |||||
if scale_factor is None: | |||||
raise ValueError("scale_factor must not be None when size is None") | |||||
def get_dsize(scale_factor): | |||||
if isinstance(scale_factor, (float, int)): | if isinstance(scale_factor, (float, int)): | ||||
scale_factor = float(scale_factor) | scale_factor = float(scale_factor) | ||||
if mode == "linear": | if mode == "linear": | ||||
@@ -572,6 +556,13 @@ def interpolate( | |||||
for i in range(2) | for i in range(2) | ||||
) | ) | ||||
dsize = concat([dsize[0], dsize[1]], axis=0) | dsize = concat([dsize[0], dsize[1]], axis=0) | ||||
return dsize | |||||
if size is None: | |||||
if scale_factor is None: | |||||
raise ValueError("scale_factor must not be None when size is None") | |||||
dsize = get_dsize(scale_factor) | |||||
else: | else: | ||||
if scale_factor is not None: | if scale_factor is not None: | ||||
raise ValueError("scale_factor must be None when size is provided") | raise ValueError("scale_factor must be None when size is provided") | ||||
@@ -583,6 +574,15 @@ def interpolate( | |||||
raise ValueError("under linear mode, size can only be single value") | raise ValueError("under linear mode, size can only be single value") | ||||
dsize = size | dsize = size | ||||
if not align_corners and mode in ("bilinear", "nearest") and inp.ndim in [4, 5]: | |||||
# fastpath for interpolate | |||||
op = builtin.Resize( | |||||
imode="linear" if mode == "bilinear" else "nearest", format="NCHW" | |||||
) | |||||
shape = astensor1d(dsize, inp, dtype="int32", device=inp.device) | |||||
(result,) = apply(op, inp, shape) | |||||
return result | |||||
oh, ow = dsize[0], dsize[1] | oh, ow = dsize[0], dsize[1] | ||||
ih, iw = inp.shape[2], inp.shape[3] | ih, iw = inp.shape[2], inp.shape[3] | ||||
@@ -630,15 +630,10 @@ def interpolate( | |||||
if mode == "linear": | if mode == "linear": | ||||
ret = reshape(ret, ret.shape[0:3]) | ret = reshape(ret, ret.shape[0:3]) | ||||
else: | else: | ||||
# only NHWC format support "cubic" and "nearest" mode | |||||
# only NHWC format support "cubic" mode | |||||
assert mode == "bicubic" | |||||
inp = transpose(inp, (0, 2, 3, 1)) | inp = transpose(inp, (0, 2, 3, 1)) | ||||
ret = warp_perspective( | |||||
inp, | |||||
weight, | |||||
dsize, | |||||
format="NHWC", | |||||
interp_mode="cubic" if mode == "bicubic" else mode, | |||||
) | |||||
ret = warp_perspective(inp, weight, dsize, format="NHWC", interp_mode="cubic",) | |||||
ret = transpose(ret, (0, 3, 1, 2)) | ret = transpose(ret, (0, 3, 1, 2)) | ||||
return ret | return ret | ||||