GitOrigin-RevId: f9e69d4825
release-1.4
@@ -204,7 +204,7 @@ namespace megdnn { | |||||
DEF_KERN_FLOAT(ATAN2, atan2f(x, y)); | DEF_KERN_FLOAT(ATAN2, atan2f(x, y)); | ||||
DEF_KERN_FLOAT(H_SWISH_GRAD, | DEF_KERN_FLOAT(H_SWISH_GRAD, | ||||
x < -3.f ? 0.f : (x > 3.f ? y : (2.f * x + 3.f) / 6.f * y)); | |||||
x < -3.f ? (ctype)0.f : (ctype)(x > 3.f ? (ctype)y : (ctype)((2.f * x + 3.f) / 6.f * y))); | |||||
DEF_KERN_FLOAT(FUSE_ADD_H_SWISH, fuse_add_hswish(x, y)); | DEF_KERN_FLOAT(FUSE_ADD_H_SWISH, fuse_add_hswish(x, y)); | ||||
#undef KERN_SIG | #undef KERN_SIG | ||||
@@ -147,7 +147,7 @@ void chanwise::run_bwd_data(T* src_grad, const T* dst_grad, const T* flt, | |||||
dim3 nr_block(param.src_chl, | dim3 nr_block(param.src_chl, | ||||
std::min(512, max(nr_out_dimx / (nr_thread * 4), 1))); | std::min(512, max(nr_out_dimx / (nr_thread * 4), 1))); | ||||
uint32_t shared = param.chl_mul * param.flt_h * param.flt_w * sizeof(T); | uint32_t shared = param.chl_mul * param.flt_h * param.flt_w * sizeof(T); | ||||
kern<<<nr_block, nr_thread, shared, stream>>>(src_grad, dst_grad, flt, | |||||
hipLaunchKernelGGL(kern, nr_block, nr_thread, shared, stream, src_grad, dst_grad, flt, | |||||
param); | param); | ||||
after_kernel_launch(); | after_kernel_launch(); | ||||
} | } | ||||
@@ -105,7 +105,7 @@ void chanwise::run_fwd(T* dst, const T* src, const T* flt, const Param& param, | |||||
dim3 nr_block(param.src_chl, | dim3 nr_block(param.src_chl, | ||||
std::min(512, max(nr_out_dimx / (nr_thread * 4), 1))); | std::min(512, max(nr_out_dimx / (nr_thread * 4), 1))); | ||||
uint32_t shared = param.chl_mul * param.flt_h * param.flt_w * sizeof(T); | uint32_t shared = param.chl_mul * param.flt_h * param.flt_w * sizeof(T); | ||||
kern<<<nr_block, nr_thread, shared, stream>>>(dst, src, flt, param); | |||||
hipLaunchKernelGGL(kern, nr_block, nr_thread, shared, stream, dst, src, flt, param); | |||||
after_kernel_launch(); | after_kernel_launch(); | ||||
} | } | ||||
@@ -314,7 +314,7 @@ void convolution::exec_inplace_matmul_fwd( | |||||
} else { \ | } else { \ | ||||
kptr = conv_kernel<BY, BX, false, BufferFetcherTexture>; \ | kptr = conv_kernel<BY, BX, false, BufferFetcherTexture>; \ | ||||
} \ | } \ | ||||
kptr<<<blocks, threads, 0, stream>>>( \ | |||||
hipLaunchKernelGGL(kptr, blocks, threads, 0, stream, \ | |||||
src_tex.val, filter_tex.val, dst, INP_BS, OUT_BS, IC, IH, \ | src_tex.val, filter_tex.val, dst, INP_BS, OUT_BS, IC, IH, \ | ||||
IW, OC, OH, OW, FH, FW, SH, SW, PH, PW); \ | IW, OC, OH, OW, FH, FW, SH, SW, PH, PW); \ | ||||
} else { \ | } else { \ | ||||
@@ -324,7 +324,7 @@ void convolution::exec_inplace_matmul_fwd( | |||||
} else { \ | } else { \ | ||||
kptr = conv_kernel<BY, BX, false, BufferFetcherRaw>; \ | kptr = conv_kernel<BY, BX, false, BufferFetcherRaw>; \ | ||||
} \ | } \ | ||||
kptr<<<blocks, threads, 0, stream>>>( \ | |||||
hipLaunchKernelGGL(kptr, blocks, threads, 0, stream, \ | |||||
src_buf, filter_buf, dst, INP_BS, OUT_BS, IC, IH, IW, OC, \ | src_buf, filter_buf, dst, INP_BS, OUT_BS, IC, IH, IW, OC, \ | ||||
OH, OW, FH, FW, SH, SW, PH, PW); \ | OH, OW, FH, FW, SH, SW, PH, PW); \ | ||||
} \ | } \ | ||||