GitOrigin-RevId: 7b77579acd
release-1.1
@@ -428,6 +428,11 @@ public: | |||||
void exec(const ExecArgs& args) const override; | void exec(const ExecArgs& args) const override; | ||||
const char* name() const override { return m_name.c_str(); } | const char* name() const override { return m_name.c_str(); } | ||||
bool is_reproducible() const override { return true; } | bool is_reproducible() const override { return true; } | ||||
// Reports the workspace size, in bytes, for the weight-preprocess step.
// The two out-of-line definitions visible in this change both return 0.
// NOTE(review): the enclosing class is outside this hunk -- confirm which
// algorithm this declaration belongs to.
size_t get_preprocess_workspace_in_bytes( | |||||
const SizeArgs& args) const override; | |||||
// Returns the layout(s) of the preprocessed filter tensor(s). The definitions
// visible in this change return a single entry,
// args.filter_layout->collapse_contiguous().
SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | |||||
const SizeArgs& args) const override; | |||||
// Runs the filter preprocessing. The definitions visible in this change
// relayout args.filter_tensor into args.preprocessed_filter->tensors[0].
void exec_preprocess(const ExecArgs& args) const override; | |||||
private: | private: | ||||
WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, | WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, | ||||
@@ -560,6 +565,11 @@ public: | |||||
const char* name() const override { return m_name.c_str(); } | const char* name() const override { return m_name.c_str(); } | ||||
bool is_reproducible() const override { return true; } | bool is_reproducible() const override { return true; } | ||||
static std::string to_string(AlgoParam algo_param); | static std::string to_string(AlgoParam algo_param); | ||||
// Reports the workspace size, in bytes, for the weight-preprocess step.
// The two out-of-line definitions visible in this change both return 0.
// NOTE(review): the enclosing class is outside this hunk -- confirm which
// algorithm this declaration belongs to.
size_t get_preprocess_workspace_in_bytes( | |||||
const SizeArgs& args) const override; | |||||
// Returns the layout(s) of the preprocessed filter tensor(s). The definitions
// visible in this change return a single entry,
// args.filter_layout->collapse_contiguous().
SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | |||||
const SizeArgs& args) const override; | |||||
// Runs the filter preprocessing. The definitions visible in this change
// relayout args.filter_tensor into args.preprocessed_filter->tensors[0].
void exec_preprocess(const ExecArgs& args) const override; | |||||
private: | private: | ||||
WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, | WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, | ||||
@@ -65,8 +65,12 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::is_available( | |||||
// Describes the workspace that exec() needs for this algorithm.
//   raw_ptr: base pointer handed to the returned WorkspaceBundle.
//   args:    supplies preprocessed_filter and filter_layout.
// NOTE(review): this text is a rendered diff. The first two body lines below
// look like the removed (pre-change) body and the if/else after them like the
// replacement; read literally, the first return would make the if/else
// unreachable -- confirm against the real file.
WorkspaceBundle | WorkspaceBundle | ||||
ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::get_workspace_bundle( | ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::get_workspace_bundle( | ||||
dt_byte* raw_ptr, const SizeArgs& args) const { | dt_byte* raw_ptr, const SizeArgs& args) const { | ||||
size_t ws_filter = args.filter_layout->span().dist_byte(); | |||||
return WorkspaceBundle{raw_ptr, {ws_filter}}; | |||||
// With a preprocessed filter no chunk is requested: exec() then reads the
// filter from preprocessed_filter->tensors[0] instead of the workspace.
if (args.preprocessed_filter) { | |||||
return WorkspaceBundle{raw_ptr, {}}; | |||||
} else { | |||||
// Otherwise reserve one chunk spanning the filter; exec() uses the workspace
// as the destination of its filter relayout.
size_t ws_filter = args.filter_layout->span().dist_byte(); | |||||
return WorkspaceBundle{raw_ptr, {ws_filter}}; | |||||
} | |||||
} | } | ||||
size_t | size_t | ||||
@@ -82,12 +86,12 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec( | |||||
auto&& fm = args.filter_meta; | auto&& fm = args.filter_meta; | ||||
UNPACK_CONV_BIAS_NCHW32_PARAM(*(args.src_layout), fm, *(args.dst_layout), | UNPACK_CONV_BIAS_NCHW32_PARAM(*(args.src_layout), fm, *(args.dst_layout), | ||||
param); | param); | ||||
auto ws = get_workspace_bundle(args.workspace.raw_ptr, args); | |||||
auto ws_filter = ws.get(0); | |||||
auto&& stream = cuda_stream(args.opr->handle()); | auto&& stream = cuda_stream(args.opr->handle()); | ||||
// reformat filter from nchw32 to chwn32 | |||||
{ | |||||
int8_t* filter_ptr = nullptr; | |||||
if (args.preprocessed_filter == nullptr) { | |||||
filter_ptr = reinterpret_cast<int8_t*>(args.workspace.raw_ptr); | |||||
// reformat filter from nchw32 to chwn32 | |||||
TensorLayout src{{co, ci / 32, fh, fw, 32}, dtype::Int8()}; | TensorLayout src{{co, ci / 32, fh, fw, 32}, dtype::Int8()}; | ||||
src.init_contiguous_stride(); | src.init_contiguous_stride(); | ||||
TensorLayout dst = src; | TensorLayout dst = src; | ||||
@@ -99,11 +103,14 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec( | |||||
TensorND ts_src, ts_dst; | TensorND ts_src, ts_dst; | ||||
ts_src.raw_ptr = args.filter_tensor->raw_ptr; | ts_src.raw_ptr = args.filter_tensor->raw_ptr; | ||||
ts_src.layout = src; | ts_src.layout = src; | ||||
ts_dst.raw_ptr = ws_filter; | |||||
ts_dst.raw_ptr = args.workspace.raw_ptr; | |||||
ts_dst.layout = dst; | ts_dst.layout = dst; | ||||
auto&& transpose = | auto&& transpose = | ||||
args.opr->handle()->create_operator<RelayoutForward>(); | args.opr->handle()->create_operator<RelayoutForward>(); | ||||
transpose->exec(ts_src, ts_dst); | transpose->exec(ts_src, ts_dst); | ||||
} else { | |||||
filter_ptr = reinterpret_cast<int8_t*>( | |||||
args.preprocessed_filter->tensors[0].raw_ptr); | |||||
} | } | ||||
ConvParam kern_param; | ConvParam kern_param; | ||||
@@ -131,8 +138,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec( | |||||
uint32_t nonlinear_mode = static_cast<uint32_t>(param.nonlineMode); | uint32_t nonlinear_mode = static_cast<uint32_t>(param.nonlineMode); | ||||
if (fh == 1 && fw == 1) { | if (fh == 1 && fw == 1) { | ||||
cutlass_wrapper::do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32< | cutlass_wrapper::do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32< | ||||
false>(args.src_tensor->compatible_ptr<int8_t>(), | |||||
reinterpret_cast<int8_t*>(ws_filter), | |||||
false>(args.src_tensor->compatible_ptr<int8_t>(), filter_ptr, | |||||
args.bias_tensor->compatible_ptr<int32_t>(), z_dev_ptr, | args.bias_tensor->compatible_ptr<int32_t>(), z_dev_ptr, | ||||
args.dst_tensor->compatible_ptr<int8_t>(), nullptr, | args.dst_tensor->compatible_ptr<int8_t>(), nullptr, | ||||
kern_param, nonlinear_mode, alpha, beta, gamma, | kern_param, nonlinear_mode, alpha, beta, gamma, | ||||
@@ -146,8 +152,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec( | |||||
stream); | stream); | ||||
} else { | } else { | ||||
cutlass_wrapper::do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32<true>( | cutlass_wrapper::do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32<true>( | ||||
args.src_tensor->compatible_ptr<int8_t>(), | |||||
reinterpret_cast<int8_t*>(ws_filter), | |||||
args.src_tensor->compatible_ptr<int8_t>(), filter_ptr, | |||||
args.bias_tensor->compatible_ptr<int32_t>(), z_dev_ptr, | args.bias_tensor->compatible_ptr<int32_t>(), z_dev_ptr, | ||||
args.dst_tensor->compatible_ptr<int8_t>(), nullptr, kern_param, | args.dst_tensor->compatible_ptr<int8_t>(), nullptr, kern_param, | ||||
nonlinear_mode, alpha, beta, gamma, dst_scale, | nonlinear_mode, alpha, beta, gamma, dst_scale, | ||||
@@ -167,6 +172,41 @@ std::string ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::to_string( | |||||
algo_param.threadblock_n, algo_param.threadblock_k, | algo_param.threadblock_n, algo_param.threadblock_k, | ||||
algo_param.warp_m, algo_param.warp_n, algo_param.warp_k); | algo_param.warp_m, algo_param.warp_n, algo_param.warp_k); | ||||
} | } | ||||
// Workspace size, in bytes, for the preprocess step: always 0.
// `args` is not consulted; exec_preprocess() for this algorithm does not
// touch args.workspace.
size_t ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm:: | |||||
get_preprocess_workspace_in_bytes(const SizeArgs& args) const { | |||||
return 0_z; | |||||
} | |||||
// Layouts of the preprocessed filter tensors for this algorithm.
// Returns exactly one layout, obtained by calling collapse_contiguous() on the
// original filter layout; exec() and exec_preprocess() both address it as
// preprocessed_filter->tensors[0].
SmallVector<TensorLayout> ConvBiasForwardImpl:: | |||||
AlgoInt8NCHW32IMMAImplicitGemm::deduce_preprocessed_filter_layout( | |||||
const SizeArgs& args) const { | |||||
return {args.filter_layout->collapse_contiguous()}; | |||||
} | |||||
// Relayouts the filter once, ahead of time, into
// args.preprocessed_filter->tensors[0]. This is the same relayout exec()
// performs into its workspace when no preprocessed filter is given (described
// there as "reformat filter from nchw32 to chwn32").
// NOTE(review): args.preprocessed_filter is dereferenced without a null check;
// callers presumably guarantee it is set -- confirm.
void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec_preprocess( | |||||
const ExecArgs& args) const { | |||||
using Format = Param::Format; | |||||
auto&& param = args.opr->param(); | |||||
auto&& fm = args.filter_meta; | |||||
// co / ci / fh / fw used below are not declared in this function; presumably
// the macro introduces them (and consumes Format, param and fm) -- confirm.
UNPACK_CONV_BIAS_NCHW32_PARAM(*(args.src_layout), fm, *(args.dst_layout), | |||||
param); | |||||
// Source view: contiguous int8 tensor of shape {co, ci/32, fh, fw, 32}.
TensorLayout src{{co, ci / 32, fh, fw, 32}, dtype::Int8()}; | |||||
src.init_contiguous_stride(); | |||||
// Destination keeps the same shape but uses custom strides: dimension 0 (co)
// steps by 32 elements, directly outside the innermost 32-wide dimension.
TensorLayout dst = src; | |||||
dst.stride[0] = 32; | |||||
dst.stride[1] = co * fh * fw * 32; | |||||
dst.stride[2] = co * fw * 32; | |||||
dst.stride[3] = co * 32; | |||||
dst.stride[4] = 1; | |||||
TensorND ts_src, ts_dst; | |||||
ts_src.raw_ptr = args.filter_tensor->raw_ptr; | |||||
ts_src.layout = src; | |||||
ts_dst.raw_ptr = args.preprocessed_filter->tensors[0].raw_ptr; | |||||
ts_dst.layout = dst; | |||||
// The copy itself is delegated to a RelayoutForward operator created on the
// same handle as the convolution operator.
auto&& transpose = args.opr->handle()->create_operator<RelayoutForward>(); | |||||
transpose->exec(ts_src, ts_dst); | |||||
} | |||||
#endif | #endif | ||||
// vim: syntax=cpp.doxygen | // vim: syntax=cpp.doxygen |
@@ -62,8 +62,12 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::is_available( | |||||
// Describes the workspace that exec() needs for this algorithm.
//   raw_ptr: base pointer handed to the returned WorkspaceBundle.
//   args:    supplies preprocessed_filter and filter_layout.
// NOTE(review): this text is a rendered diff. The first two body lines below
// look like the removed (pre-change) body and the if/else after them like the
// replacement; read literally, the first return would make the if/else
// unreachable -- confirm against the real file.
WorkspaceBundle | WorkspaceBundle | ||||
ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::get_workspace_bundle( | ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::get_workspace_bundle( | ||||
dt_byte* raw_ptr, const SizeArgs& args) const { | dt_byte* raw_ptr, const SizeArgs& args) const { | ||||
size_t ws_filter = args.filter_layout->span().dist_byte(); | |||||
return WorkspaceBundle{raw_ptr, {ws_filter}}; | |||||
// With a preprocessed filter no chunk is requested: exec() then reads the
// filter from preprocessed_filter->tensors[0] instead of the workspace.
if (args.preprocessed_filter) { | |||||
return WorkspaceBundle{raw_ptr, {}}; | |||||
} else { | |||||
// Otherwise reserve one chunk spanning the filter; exec() uses the workspace
// as the destination of its filter relayout.
size_t ws_filter = args.filter_layout->span().dist_byte(); | |||||
return WorkspaceBundle{raw_ptr, {ws_filter}}; | |||||
} | |||||
} | } | ||||
size_t | size_t | ||||
@@ -79,12 +83,12 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( | |||||
auto&& fm = args.filter_meta; | auto&& fm = args.filter_meta; | ||||
UNPACK_CONV_BIAS_NCHW4_PARAM(*(args.src_layout), fm, *(args.dst_layout), | UNPACK_CONV_BIAS_NCHW4_PARAM(*(args.src_layout), fm, *(args.dst_layout), | ||||
param); | param); | ||||
auto ws = get_workspace_bundle(args.workspace.raw_ptr, args); | |||||
auto ws_filter = ws.get(0); | |||||
auto&& stream = cuda_stream(args.opr->handle()); | auto&& stream = cuda_stream(args.opr->handle()); | ||||
// reformat filter from nchw4 to chwn4 | |||||
{ | |||||
int8_t* filter_ptr = nullptr; | |||||
if (args.preprocessed_filter == nullptr) { | |||||
filter_ptr = reinterpret_cast<int8_t*>(args.workspace.raw_ptr); | |||||
// reformat filter from nchw4 to chwn4 | |||||
TensorLayout src{{co, ci / 4 * fh * fw}, dtype::Int32()}; | TensorLayout src{{co, ci / 4 * fh * fw}, dtype::Int32()}; | ||||
src.init_contiguous_stride(); | src.init_contiguous_stride(); | ||||
TensorLayout dst = src; | TensorLayout dst = src; | ||||
@@ -92,11 +96,14 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( | |||||
TensorND ts_src, ts_dst; | TensorND ts_src, ts_dst; | ||||
ts_src.raw_ptr = args.filter_tensor->raw_ptr; | ts_src.raw_ptr = args.filter_tensor->raw_ptr; | ||||
ts_src.layout = src; | ts_src.layout = src; | ||||
ts_dst.raw_ptr = ws_filter; | |||||
ts_dst.raw_ptr = args.workspace.raw_ptr; | |||||
ts_dst.layout = dst; | ts_dst.layout = dst; | ||||
auto&& transpose = | auto&& transpose = | ||||
args.opr->handle()->create_operator<RelayoutForward>(); | args.opr->handle()->create_operator<RelayoutForward>(); | ||||
transpose->exec(ts_src, ts_dst); | transpose->exec(ts_src, ts_dst); | ||||
} else { | |||||
filter_ptr = reinterpret_cast<int8_t*>( | |||||
args.preprocessed_filter->tensors[0].raw_ptr); | |||||
} | } | ||||
convolution::ConvParam kern_param; | convolution::ConvParam kern_param; | ||||
@@ -124,8 +131,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( | |||||
uint32_t nonlinear_mode = static_cast<uint32_t>(param.nonlineMode); | uint32_t nonlinear_mode = static_cast<uint32_t>(param.nonlineMode); | ||||
if (fh == 1 && fw == 1) { | if (fh == 1 && fw == 1) { | ||||
cutlass_wrapper::do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4<false>( | cutlass_wrapper::do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4<false>( | ||||
args.src_tensor->compatible_ptr<int8_t>(), | |||||
reinterpret_cast<int8_t*>(ws_filter), | |||||
args.src_tensor->compatible_ptr<int8_t>(), filter_ptr, | |||||
args.bias_tensor->compatible_ptr<int32_t>(), z_dev_ptr, | args.bias_tensor->compatible_ptr<int32_t>(), z_dev_ptr, | ||||
args.dst_tensor->compatible_ptr<int8_t>(), nullptr, kern_param, | args.dst_tensor->compatible_ptr<int8_t>(), nullptr, kern_param, | ||||
nonlinear_mode, alpha, beta, gamma, dst_scale, | nonlinear_mode, alpha, beta, gamma, dst_scale, | ||||
@@ -138,8 +144,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( | |||||
stream); | stream); | ||||
} else { | } else { | ||||
cutlass_wrapper::do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4<true>( | cutlass_wrapper::do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4<true>( | ||||
args.src_tensor->compatible_ptr<int8_t>(), | |||||
reinterpret_cast<int8_t*>(ws_filter), | |||||
args.src_tensor->compatible_ptr<int8_t>(), filter_ptr, | |||||
args.bias_tensor->compatible_ptr<int32_t>(), z_dev_ptr, | args.bias_tensor->compatible_ptr<int32_t>(), z_dev_ptr, | ||||
args.dst_tensor->compatible_ptr<int8_t>(), nullptr, kern_param, | args.dst_tensor->compatible_ptr<int8_t>(), nullptr, kern_param, | ||||
nonlinear_mode, alpha, beta, gamma, dst_scale, | nonlinear_mode, alpha, beta, gamma, dst_scale, | ||||
@@ -153,4 +158,35 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( | |||||
} | } | ||||
} | } | ||||
// Workspace size, in bytes, for the preprocess step: always 0.
// `args` is not consulted; exec_preprocess() for this algorithm does not
// touch args.workspace.
size_t ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm:: | |||||
get_preprocess_workspace_in_bytes(const SizeArgs& args) const { | |||||
return 0_z; | |||||
} | |||||
// Layouts of the preprocessed filter tensors for this algorithm.
// Returns exactly one layout, obtained by calling collapse_contiguous() on the
// original filter layout; exec() and exec_preprocess() both address it as
// preprocessed_filter->tensors[0].
SmallVector<TensorLayout> ConvBiasForwardImpl:: | |||||
AlgoInt8NCHW4DotProdImplicitGemm::deduce_preprocessed_filter_layout( | |||||
const SizeArgs& args) const { | |||||
return {args.filter_layout->collapse_contiguous()}; | |||||
} | |||||
// Relayouts the filter once, ahead of time, into
// args.preprocessed_filter->tensors[0]. This is the same relayout exec()
// performs into its workspace when no preprocessed filter is given (described
// there as "reformat filter from nchw4 to chwn4").
// NOTE(review): args.preprocessed_filter is dereferenced without a null check;
// callers presumably guarantee it is set -- confirm.
void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec_preprocess( | |||||
const ExecArgs& args) const { | |||||
using Format = Param::Format; | |||||
auto&& param = args.opr->param(); | |||||
auto&& fm = args.filter_meta; | |||||
// co / ci / fh / fw used below are not declared in this function; presumably
// the macro introduces them (and consumes Format, param and fm) -- confirm.
UNPACK_CONV_BIAS_NCHW4_PARAM(*(args.src_layout), fm, *(args.dst_layout), | |||||
param); | |||||
// Source view: contiguous 2-D Int32 tensor of shape {co, ci/4*fh*fw}.
// Presumably each Int32 element stands for one packed group of 4 int8
// values -- confirm.
TensorLayout src{{co, ci / 4 * fh * fw}, dtype::Int32()}; | |||||
src.init_contiguous_stride(); | |||||
// Destination: same shape with dimension 0 made the unit-stride dimension,
// i.e. a 2-D transpose. dst[0] is presumably the extent of dimension 0 (co).
TensorLayout dst = src; | |||||
dst.stride[0] = 1, dst.stride[1] = dst[0]; | |||||
TensorND ts_src, ts_dst; | |||||
ts_src.raw_ptr = args.filter_tensor->raw_ptr; | |||||
ts_src.layout = src; | |||||
ts_dst.raw_ptr = args.preprocessed_filter->tensors[0].raw_ptr; | |||||
ts_dst.layout = dst; | |||||
// The copy itself is delegated to a RelayoutForward operator created on the
// same handle as the convolution operator.
auto&& transpose = args.opr->handle()->create_operator<RelayoutForward>(); | |||||
transpose->exec(ts_src, ts_dst); | |||||
} | |||||
// vim: syntax=cpp.doxygen | // vim: syntax=cpp.doxygen |
@@ -1084,6 +1084,42 @@ TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_2) { | |||||
} | } | ||||
// Runs ConvBiasForward through OprWeightPreprocessProxy for two
// INT8 NCHW4 dot-product implicit-GEMM algorithm configurations.
TEST_F(CUDA, CUTLASS_WEIGHT_PREPROCESS) { | |||||
require_compute_capability(6, 1); | |||||
Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker( | |||||
handle_cuda()); | |||||
// Runs one fixed NCHW4 case with the algorithm whose name is `algo`.
auto check = [&checker](const std::string& algo) { | |||||
// Installs an algorithm checker built from the requested name.
checker.set_before_exec_callback( | |||||
conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str())); | |||||
UniformIntRNG rng{-16, 16}; | |||||
UniformIntRNG bias_rng{-50, 50}; | |||||
// NOTE(review): const_rng is declared but never passed to the checker below.
UniformIntRNG const_rng{1, 1}; | |||||
// Slots 0, 1 and 3 share `rng`; slot 2 uses the wider bias range.
// Slot 2 is QuantizedS32 with scale 1.2f * 1.3f, the product of the scales
// of slots 0 and 1.
checker.set_rng(0, &rng) | |||||
.set_rng(1, &rng) | |||||
.set_rng(2, &bias_rng) | |||||
.set_rng(3, &rng) | |||||
.set_dtype(0, dtype::QuantizedS8{1.2f}) | |||||
.set_dtype(1, dtype::QuantizedS8{1.3f}) | |||||
.set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f}) | |||||
.set_dtype(3, dtype::QuantizedS8{1.3f}) | |||||
.set_dtype(4, dtype::QuantizedS8{1.0f}) | |||||
.set_epsilon(1 + 1e-3) | |||||
.set_max_avg_error(1e-1) | |||||
.set_max_avg_biased_error(1e-3); | |||||
// 3x3 filter, padding 1, stride 2, NCHW4 format.
param::ConvBias param; | |||||
param.pad_h = param.pad_w = 1; | |||||
param.stride_h = param.stride_w = 2; | |||||
param.format = param::ConvBias::Format::NCHW4; | |||||
// Five shape slots; the last two are left empty (presumably the optional z
// input and the deduced output -- confirm).
checker.set_param(param).execs({{16, 4, 14, 14, 4}, | |||||
{16, 4, 3, 3, 4}, | |||||
{1, 4, 1, 1, 4}, | |||||
{}, | |||||
{}}); | |||||
}; | |||||
check("INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_128X32X32_64X32X32"); | |||||
check("INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_16X64X8_16X64X8"); | |||||
} | |||||
#if CUDA_VERSION >= 10020 | #if CUDA_VERSION >= 10020 | ||||
/// \note: we only check several cases and block sizes in megdnn_test, the | /// \note: we only check several cases and block sizes in megdnn_test, the | ||||
/// full testcases are written in cutlass repository | /// full testcases are written in cutlass repository | ||||