diff --git a/dnn/src/cuda/relayout/kern_contiguous.cuh b/dnn/src/cuda/relayout/kern_contiguous.cuh index 8c0740f4..9d6872e0 100644 --- a/dnn/src/cuda/relayout/kern_contiguous.cuh +++ b/dnn/src/cuda/relayout/kern_contiguous.cuh @@ -232,7 +232,7 @@ DEFINE_CONTIG_RECEIVER(7, cb_header, cb_dispatch, m_param[1].layout) typedef OpCallerBinaryContiguous Caller; size_t size = m_param.size; int grid_size, block_size; - if (m_contiguous_size > 32) { + if (m_contiguous_size >= 32) { void (*fptr)(Caller, uint32_t, uint32_t, uint32_t, uint32_t); fptr = cuda_last_contiguous_large_kern; safe_size_in_kern(size); diff --git a/dnn/src/cuda/relayout/opr_impl.cpp b/dnn/src/cuda/relayout/opr_impl.cpp index a55da934..dda9bb74 100644 --- a/dnn/src/cuda/relayout/opr_impl.cpp +++ b/dnn/src/cuda/relayout/opr_impl.cpp @@ -176,7 +176,8 @@ bool RelayoutForwardImpl::Param::try_copy_last_contig() { !has_negative_stride(lsrc) && !has_negative_stride(ldst)) { size_t contiguous_size = gcd(lsrc.shape[lsrc.ndim - 1], ldst.shape[ldst.ndim - 1]); - if (contiguous_size > 1) { + // FIXME: disable copy_last_contiguous when contiguous_size < 32 due to performance issue + if (contiguous_size >= 32) { copy_last_contiguous(m_dst, m_src, contiguous_size, m_opr->stream()); return true;