GitOrigin-RevId: 55fb2a9b25
release-1.2
@@ -87,6 +87,23 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in ssrc, _megdnn_tensor_out sdst,
         return pooling2d::do_pooling2d_int8_ncdiv4hw4(
                 src.compatible_ptr<int8_t>(), dst.compatible_ptr<int8_t>(),
                 kern_param, stream, static_cast<uint32_t>(param().mode));
+    } else if (param().format == Format::NCHW32) {
+        pooling2d::Param kern_param;
+        size_t n = src.layout[0], hi = src.layout[2], wi = src.layout[3],
+               c = src.layout[1], ho = dst.layout[2], wo = dst.layout[3];
+        c = c * 32;
+        size_t ph = param().pad_h, pw = param().pad_w;
+        size_t window_h = param().window_h, window_w = param().window_w;
+        size_t sh = param().stride_h, sw = param().stride_w;
+        kern_param.n = n, kern_param.c = c, kern_param.hi = hi,
+        kern_param.wi = wi, kern_param.ho = ho, kern_param.wo = wo,
+        kern_param.ph = ph, kern_param.pw = pw,
+        kern_param.window_h = window_h, kern_param.window_w = window_w,
+        kern_param.sh = sh, kern_param.sw = sw;
+        auto&& stream = cuda_stream(handle());
+        return pooling2d::do_pooling2d_int8_ncdiv32hw32(
+                src.compatible_ptr<int8_t>(), dst.compatible_ptr<int8_t>(),
+                kern_param, stream, static_cast<uint32_t>(param().mode));
+    }
     auto handle = cudnn_handle(this->handle());
     setup_descs(src.layout, dst.layout);
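A note on the NCHW32 branch added above: tensors in this format are stored as (N, C/32, H, W, 32), so `src.layout[1]` is the number of 32-channel packs and the kernel parameter is widened back to the full channel count via `c = c * 32`. The host-side sketch below shows the address arithmetic this layout implies for an int8 element; the helper is purely illustrative and not part of MegDNN.

```cpp
#include <cstddef>

// Hypothetical helper: byte offset of logical element (n, c, h, w) in an
// int8 NCHW32 tensor of logical shape (N, C, H, W), where C % 32 == 0.
size_t nchw32_offset(size_t n, size_t c, size_t h, size_t w,
                     size_t C, size_t H, size_t W) {
    const size_t pack = 32;
    size_t outer_c = c / pack;  // which 32-channel pack
    size_t inner_c = c % pack;  // position inside the pack
    // Storage order: N, C/32, H, W, 32 (the pack is the innermost dimension).
    return (((n * (C / pack) + outer_c) * H + h) * W + w) * pack + inner_c;
}
```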
@@ -413,6 +413,62 @@ __global__ void pooling2d_device_template_int8_ncdiv4hw4(
     *(reinterpret_cast<ldg_type*>(g_dst_ptr)) = res;
 }
+template <typename Pooler>
+__global__ void pooling2d_device_template_int8_ncdiv32hw32(
+        const int8_t* __restrict__ src, int8_t* __restrict__ dst, Param param) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    using ldg_type = typename Pooler::feed_type;
+    static int constexpr pack_size = 32;
+    static int constexpr ldg_width = sizeof(ldg_type) / sizeof(int32_t);
+    static int constexpr ldg_width_bytes = sizeof(ldg_type);
+    static int constexpr section = pack_size / sizeof(ldg_type);
+    MEGDNN_STATIC_ASSERT(
+            ldg_width == 4,
+            "pooling2d (NCHW32) kernel must use 128bit width ldg instruction");
+    const int c_packed = param.c / pack_size;
+    const int batch = tid / (param.ho * param.wo * c_packed * section);
+    const int batch_residual =
+            tid - batch * param.ho * param.wo * c_packed * section;
+    const int oc = batch_residual / (param.ho * param.wo * section);
+    const int oc_residual = batch_residual - oc * param.ho * param.wo * section;
+    const int oh = oc_residual / (param.wo * section);
+    const int oh_residual = (oc_residual - oh * param.wo * section);
+    const int ow = oh_residual / section;
+    const int sec = oh_residual - ow * section;
+    if (batch >= param.n || oc >= c_packed || oh >= param.ho || ow >= param.wo)
+        return;
+    const int in_batch_stride = param.hi * param.wi * param.c;
+    const int out_batch_stride = param.ho * param.wo * param.c;
+    const int in_channel_stride = param.hi * param.wi * pack_size;
+    const int out_channel_stride = param.ho * param.wo * pack_size;
+    const int8_t* __restrict__ g_src_ptr = src + batch * in_batch_stride +
+                                           oc * in_channel_stride +
+                                           sec * ldg_width_bytes;
+    int8_t* __restrict__ g_dst_ptr =
+            dst + batch * out_batch_stride + oc * out_channel_stride +
+            (oh * param.wo + ow) * pack_size + sec * ldg_width_bytes;
+    Pooler pooler(param.window_h * param.window_w);
+    pooler.init();
+    for (int fh = 0; fh < param.window_h; fh++) {
+        uint32_t ih = oh * param.sh + fh - param.ph;
+        for (int fw = 0; fw < param.window_w; fw++) {
+            uint32_t iw = ow * param.sw + fw - param.pw;
+            if (ih < param.hi && iw < param.wi) {
+                const int8_t* __restrict__ cur_src_ptr =
+                        g_src_ptr + (ih * param.wi + iw) * pack_size;
+                ldg_type sval =
+                        __ldg(reinterpret_cast<const ldg_type*>(cur_src_ptr));
+                pooler.feed(sval);
+            }
+        }
+    }
+    ldg_type res = pooler.get_ans();
+    *(reinterpret_cast<ldg_type*>(g_dst_ptr)) = res;
+}
 }; // namespace
 void megdnn::cuda::pooling2d::do_pooling2d_int8_cdiv4hwn4(const int8_t* d_src,
@@ -494,4 +550,43 @@ void megdnn::cuda::pooling2d::do_pooling2d_int8_ncdiv4hw4(const int8_t* d_src,
     kern<<<nr_blocks, nr_threads, 0, stream>>>(d_src, d_dst, param);
     after_kernel_launch();
 }
+void megdnn::cuda::pooling2d::do_pooling2d_int8_ncdiv32hw32(const int8_t* d_src,
+                                                            int8_t* d_dst,
+                                                            const Param& param,
+                                                            cudaStream_t stream,
+                                                            uint32_t mode) {
+    using Mode = megdnn::param_enumv::Pooling::Mode;
+    void (*kern)(const int8_t* __restrict__, int8_t* __restrict__, Param param);
+    uint32_t vthreads = param.n * param.c * param.ho * param.wo / 16;
+    switch (mode) {
+        case Mode::MAX:
+            kern = pooling2d_device_template_int8_ncdiv32hw32<
+                    MaxPooler<int8_t, int4>>;
+            break;
+        case Mode::AVERAGE:
+            kern = pooling2d_device_template_int8_ncdiv32hw32<
+                    MeanIncludeRoundedPooler<int8_t, int4, int32_t>>;
+            break;
+        case Mode::AVERAGE_COUNT_EXCLUDE_PADDING:
+            kern = pooling2d_device_template_int8_ncdiv32hw32<
+                    MeanExcludeRoundedPooler<int8_t, int4, int32_t>>;
+            break;
+        default:
+            megdnn_assert(false, "invalid pooling mode");
+    }
+    uint32_t nr_threads = query_blocksize_for_kernel(kern);
+    nr_threads = std::min(nr_threads, vthreads);
+    uint32_t nr_blocks = DIVUP(vthreads, nr_threads);
+    kern<<<nr_blocks, nr_threads, 0, stream>>>(d_src, d_dst, param);
+    after_kernel_launch();
+}
+#undef FEED1
+#undef FEED2
+#undef FEED3
+#undef ANS1
+#undef ANS2
+#undef ANS4
 // vim: syntax=cuda.doxygen
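To make the launch configuration above easier to follow: the poolers are instantiated with `int4`, i.e. 16-byte vector loads, so every thread reduces one 16-byte slice of a 32-channel pack, giving `section = 32 / 16 = 2` slices per pack and `n * c * ho * wo / 16` threads in total. The host-side sketch below re-derives the kernel's `tid -> (batch, oc, oh, ow, sec)` decomposition and checks that every thread maps to a valid output slice; the names and the standalone `main` are illustrative only.

```cpp
#include <cassert>
#include <cstdint>

struct WorkItem {
    int batch, oc, oh, ow, sec;
};

// Mirror of the kernel's tid -> (batch, oc, oh, ow, sec) decomposition,
// assuming 16-byte vector loads over a 32-channel pack (section == 2).
WorkItem decompose(uint32_t tid, int n, int c, int ho, int wo) {
    const int section = 2;        // 32 channels / 16-byte load
    const int c_packed = c / 32;  // number of 32-channel packs
    WorkItem it;
    it.batch = tid / (ho * wo * c_packed * section);
    int rem = tid - it.batch * ho * wo * c_packed * section;
    it.oc = rem / (ho * wo * section);
    rem -= it.oc * ho * wo * section;
    it.oh = rem / (wo * section);
    rem -= it.oh * wo * section;
    it.ow = rem / section;
    it.sec = rem - it.ow * section;
    return it;
}

int main() {
    // Total threads launched: n * c * ho * wo / 16, as in the launcher above.
    int n = 2, c = 64, ho = 3, wo = 5;
    uint32_t vthreads = n * c * ho * wo / 16;
    for (uint32_t tid = 0; tid < vthreads; ++tid) {
        WorkItem it = decompose(tid, n, c, ho, wo);
        assert(it.batch < n && it.oc < c / 32 && it.oh < ho && it.ow < wo);
    }
    return 0;
}
```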
@@ -29,6 +29,9 @@ void do_pooling2d_int8_ncdiv4hw4(const int8_t* d_src, int8_t* d_dst,
                                  const Param& param, cudaStream_t stream,
                                  uint32_t mode);
+void do_pooling2d_int8_ncdiv32hw32(const int8_t* d_src, int8_t* d_dst,
+                                   const Param& param, cudaStream_t stream,
+                                   uint32_t mode);
 }  // namespace pooling2d
 }  // namespace cuda
 }  // namespace megdnn
@@ -310,6 +310,26 @@ TEST_F(CUDA, POOLING_FORWARD_INT8_NCHW4) {
     }
 }
+TEST_F(CUDA, POOLING_FORWARD_INT8_NCHW32) {
+    require_compute_capability(6, 1);
+    using Param = param::Pooling;
+    Checker<Pooling> checker(handle_cuda());
+    Param param;
+    auto i8_min = std::numeric_limits<int8_t>().min();
+    auto i8_max = std::numeric_limits<int8_t>().max();
+    UniformIntRNG int_rng{i8_min, i8_max};
+    checker.set_dtype(0, dtype::QuantizedS8(0.1f));
+    param.format = Param::Format::NCHW32;
+    for (auto mode : {Param::Mode::MAX, Param::Mode::AVERAGE,
+                      Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING}) {
+        param.mode = mode;
+        checker.set_epsilon(1e-3).set_rng(0, &int_rng);
+        checker.set_param(param).exec({{64, 8, 28, 28, 32}, {}});
+        checker.set_param(param).exec({{15, 8, 28, 28, 32}, {}});
+        checker.set_param(param).exec({{30, 8, 28, 28, 32}, {}});
+    }
+}
 #if MEGDNN_WITH_BENCHMARK
 TEST_F(CUDA, BENCHMARK_POOLING_CHWN4) {
     CUBenchmarker<Pooling> bencher(handle_cuda());
@@ -331,13 +351,17 @@ TEST_F(CUDA, BENCHMARK_POOLING_CHWN4) {
         param.format = Param::Format::CHWN4;
         bencher.set_param(param);
         auto time_chwn4 = bencher.execs({{C / 4, H, W, N, 4}, {}}) / nr_times;
+        auto time_nchw32 =
+                bencher.execs({{N, C / 32, H, W, 32}, {}}) / nr_times;
         size_t oh = infer_conv_shape(H, window, stride, padding),
                ow = infer_conv_shape(W, window, stride, padding);
         float io = (N * C * H * W + N * C * oh * ow) * sizeof(int8_t);
-        printf("time(cudnn)=%.2f ms, time(chwn4)=%.2f ms, "
-               "bandwidth(cudnn)=%.2f Gb/s, bandwidth(chwn4)=%.2f Gb/s\n",
-               time_cudnn, time_chwn4, io / (1e6 * time_cudnn),
-               io / (1e6 * time_chwn4));
+        printf("time(cudnn)=%.2f ms, time(chwn4)=%.2f ms, time(nchw32)=%.2f "
+               "ms, "
+               "bandwidth(cudnn)=%.2f Gb/s, bandwidth(chwn4)=%.2f Gb/s, "
+               "bandwidth(nchw32)=%.2f Gb/s\n",
+               time_cudnn, time_chwn4, time_nchw32, io / (1e6 * time_cudnn),
+               io / (1e6 * time_chwn4), io / (1e6 * time_nchw32));
     };
     run_bench(64, 64, 112, 112, 2, 1, 2);
     run_bench(256, 64, 112, 112, 2, 1, 2);
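The bandwidth numbers printed by this benchmark treat pooling as memory-bound: the traffic counted is one pass over the int8 input plus one pass over the int8 output, and `io / (1e6 * time_ms)` converts bytes per millisecond into GB/s. Below is a standalone sketch of that arithmetic for the first `run_bench` configuration; the output-size helper and the 0.5 ms timing are assumptions for illustration, not measured values.

```cpp
#include <cstdio>

// Output spatial size for a pooling window, using the usual
// floor((in + 2*pad - window) / stride) + 1 convention.
static size_t out_size(size_t in, size_t window, size_t pad, size_t stride) {
    return (in + 2 * pad - window) / stride + 1;
}

int main() {
    size_t N = 64, C = 64, H = 112, W = 112, window = 2, pad = 1, stride = 2;
    size_t oh = out_size(H, window, pad, stride);  // 57
    size_t ow = out_size(W, window, pad, stride);  // 57
    // int8 traffic: read the whole input once, write the whole output once.
    double io = double(N * C * H * W + N * C * oh * ow);  // bytes
    double time_ms = 0.5;  // hypothetical kernel time
    printf("io = %.1f MB, bandwidth = %.2f GB/s\n",
           io / 1e6, io / (1e6 * time_ms));
    return 0;
}
```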
@@ -1090,14 +1090,24 @@ EnableTensorCorePass::make_tensorcore_converter() {
         size_t nr_inps = opr->input().size();
         MGB_MARK_USED_VAR(nr_inps);
         mgb_assert(nr_inps == 1);
-        if (!opr->input(0)->shape().eq_shape(new_inp[0]->shape())) {
-            mgb_assert(opr->input(0)->shape().ndim == 5 &&
-                       opr->input(0)->shape()[4] == 4);
-            mgb_assert(new_inp[0]->shape().ndim == 5 &&
-                       new_inp[0]->shape()[4] == 32);
+        size_t nr_channels = opr->input(0)->shape()[1] * 4;
+        if (nr_channels % 32 == 0) { // use nchw32 format
+            VarNode* new_inp_var = new_inp[0];
+            if (opr->input(0)->shape().eq_shape(new_inp[0]->shape())) {
+                new_inp_var =
+                        RelayoutPlaceholder::make(
+                                new_inp[0], RelayoutPlaceholder::LayoutType::
+                                                    NCHW4_TO_NCHW32)
+                                .node();
+            } else {
+                mgb_assert(opr->input(0)->shape().ndim == 5 &&
+                           opr->input(0)->shape()[4] == 4);
+                mgb_assert(new_inp[0]->shape().ndim == 5 &&
+                           new_inp[0]->shape()[4] == 32);
+            }
             auto new_param = pooling.param();
             new_param.format = Format::NCHW32;
-            auto new_pooling = opr::PoolingForward::make(new_inp[0], new_param,
+            auto new_pooling = opr::PoolingForward::make(new_inp_var, new_param,
                                                          opr->config());
             return new_pooling.node()->owner_opr();
         }
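The rewritten converter above only switches a pooling operator to NCHW32 when the logical channel count, recovered from the NCHW4 input shape as `shape[1] * 4`, is divisible by 32, and it inserts an NCHW4_TO_NCHW32 relayout when the incoming var still carries the NCHW4 shape. A rough sketch of that shape bookkeeping follows; the helper types and names are hypothetical and not part of the pass.

```cpp
#include <array>
#include <cstddef>

// Illustration only: given an NCHW4 shape (N, C/4, H, W, 4), decide whether
// NCHW32 can be used and compute the corresponding NCHW32 shape.
struct Nchw4Shape { size_t n, c_div_4, h, w; };

bool can_use_nchw32(const Nchw4Shape& s) {
    size_t nr_channels = s.c_div_4 * 4;  // logical channel count
    return nr_channels % 32 == 0;
}

std::array<size_t, 5> to_nchw32(const Nchw4Shape& s) {
    // Eight consecutive 4-channel packs fold into one 32-channel pack.
    return {s.n, s.c_div_4 / 8, s.h, s.w, 32};
}
```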
@@ -1989,6 +1989,74 @@ TEST(TestEnableTensorCore, ConvBiasWithZ) {
     MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
 }
+TEST(TestEnableTensorCore, Pooling) {
+    REQUIRE_GPU(1);
+    auto cn = CompNode::load("gpu0");
+    cn.activate();
+    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
+    auto sm_ver = prop.major * 10 + prop.minor;
+    if (sm_ver < 75) {
printf("This testcast ignored due to insufficient cuda cap(got: %d, " | |||||
"expected: %d)\n", | |||||
sm_ver, 75); | |||||
return; | |||||
} | |||||
HostTensorGenerator<dtype::Int8> gen; | |||||
auto graph = ComputingGraph::make(); | |||||
graph->options().graph_opt_level = 0; | |||||
auto mkvar = [&](const char* name, const TensorShape& shp, | |||||
const DType& dtype) { | |||||
return opr::TypeCvt::make( | |||||
opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), | |||||
dtype); | |||||
}; | |||||
auto mkcvar = [&](const char* name, const TensorShape& shp, | |||||
const DType& dtype) { | |||||
return opr::TypeCvt::make( | |||||
opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)) | |||||
.rename(name), | |||||
dtype); | |||||
}; | |||||
auto x = mkvar("x", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)), | |||||
w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)), | |||||
b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)), | |||||
z = mkvar("b1", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)); | |||||
opr::ConvBias::Param param; | |||||
param.format = opr::ConvBias::Param::Format::NCHW4; | |||||
param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU; | |||||
param.stride_h = param.stride_w = 1; | |||||
param.pad_h = param.pad_w = 1; | |||||
auto y = opr::ConvBias::make(x, w, b, z, param, {}, | |||||
OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); | |||||
opr::Pooling::Param pool_param; | |||||
pool_param.format = opr::Pooling::Param::Format::NCHW4; | |||||
y = opr::Pooling::make(y, pool_param); | |||||
y = opr::TypeCvt::make(y, dtype::Float32()); | |||||
SymbolVar y_opt; | |||||
SymbolVar y_no_tc; | |||||
{ | |||||
auto options = gopt::OptimizeForInferenceOptions{}; | |||||
options.enable_fuse_conv_bias_nonlinearity().enable_nchw32(); | |||||
unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); | |||||
} | |||||
ASSERT_EQ(opr::Pooling::Param::Format::NCHW32, | |||||
find_opr<opr::Pooling>(y_opt).param().format); | |||||
{ | |||||
auto options = gopt::OptimizeForInferenceOptions{}; | |||||
options.enable_fuse_conv_bias_nonlinearity(); | |||||
unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc); | |||||
} | |||||
HostTensorND host_y, host_y_opt; | |||||
auto func = graph->compile({make_callback_copy(y_no_tc, host_y), | |||||
make_callback_copy(y_opt, host_y_opt)}); | |||||
func->execute(); | |||||
MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt); | |||||
} | |||||
TEST(TestGoptInference, EnableTensorCore) { | TEST(TestGoptInference, EnableTensorCore) { | ||||
REQUIRE_GPU(1); | REQUIRE_GPU(1); | ||||
auto cn = CompNode::load("gpu0"); | auto cn = CompNode::load("gpu0"); | ||||