GitOrigin-RevId: d1b95a6f01
release-1.5
@@ -1060,6 +1060,46 @@ TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL) {
                          param::ConvBias::Format::CHWN4);
}

TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4_NCHW) {
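    // times the fused int8 conv-bias kernel that reads NCHW4 int8 tensors
    // and writes a plain NCHW float32 output, i.e. the conversion back to
    // NCHW is folded into the conv kernel itself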
    CUBenchmarker<ConvBiasForward> benchmarker(handle_cuda());
    size_t RUNS = 1000;
    benchmarker.set_display(false).set_times(RUNS);
    using namespace conv_bias;
    UniformIntRNG int_rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
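    // note: bias_rng is declared but never wired up below; dtype(2) is
    // Float32 and keeps the benchmarker's default RNG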
    ConvBias::Param param;
    param.format = ConvBias::Param::Format::NCHW4_NCHW;
    param.nonlineMode = ConvBias::Param::NonlineMode::IDENTITY;
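    // fail fast unless the dot-product implicit-GEMM algorithm is the one
    // actually selected, so the numbers below measure that kernel only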
    benchmarker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
    benchmarker.set_dtype(0, dtype::QuantizedS8(1.9980618f))
            .set_dtype(1, dtype::QuantizedS8(1.9980927f))
            .set_dtype(2, dtype::Float32())
            .set_dtype(3, dtype::Float32())
            .set_dtype(4, dtype::Float32())
            .set_rng(0, &int_rng)
            .set_rng(1, &int_rng)
            .set_param(param);
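    // execs() returns the total time in milliseconds for RUNS launches, so
    // dividing by RUNS yields the average per-launch latency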
    auto run = [&](const TensorShapeArray& shapes) {
        auto time_in_ms =
                benchmarker.execs({shapes[0], shapes[1], shapes[2], {}, {}}) /
                RUNS;
        printf("src=%s, filter=%s, dst=%s, time=%.2f\n",
               shapes[0].to_string().c_str(), shapes[1].to_string().c_str(),
               shapes[2].to_string().c_str(), time_in_ms);
    };
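    // src is {N, C/4, H, W, 4}: batch 16, 64 input channels, 3x3 filters to
    // 32 output channels, at three decreasing spatial resolutions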
    run({{16, 16, 224, 224, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}});
    run({{16, 16, 92, 160, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}});
    run({{16, 16, 46, 80, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}});
}

#if CUDA_VERSION >= 10020
TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW32) {
@@ -772,7 +772,9 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options(
        add_pass<RemoveRedundantTypeCvtPass>();
        add_pass(FuseNCHW4Int8Preprocess::make());
        add_pass<FuseWarpPerspectiveDimshufflePass>();
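        // FoldingConvBiasDimshufflePass is only compiled when building
        // against CUDA >= 10.2; see the guarded class declaration below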
#if CUDA_VERSION >= 10020
        add_pass<FoldingConvBiasDimshufflePass>();
#endif
    });
    cb(chwn4, {
        add_pass<FuseConvBiasNonlinPass>();
@@ -791,7 +793,9 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options(
        add_pass<RemoveRedundantTypeCvtPass>();
        add_pass(FuseNCHW4Int8Preprocess::make());
        add_pass<FuseWarpPerspectiveDimshufflePass>();
#if CUDA_VERSION >= 10020
        add_pass<FoldingConvBiasDimshufflePass>();
#endif
    });
    cb(fuse_conv_bias_nonlinearity, { add_pass<FuseConvBiasNonlinPass>(); });
@@ -3638,6 +3638,7 @@ void ShuffleShuffleRemovePass::apply(OptState& opt) const {
    MIDOUT_E
}

#if CUDA_VERSION >= 10020
/* ==================== FoldingConvBiasDimshufflePass ================= */
const char* FoldingConvBiasDimshufflePass::name() const {
    return mgb_cstr_log("folding conv bias dimshuffle pass");
@@ -4068,20 +4069,17 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const {
        return true;
    };
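    // mark the rewrite helpers as used so configurations where a guard
    // compiles one of them out do not trip unused-variable warnings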
    MGB_MARK_USED_VAR(try_conv_reformat_nchw322nchw4);
    MGB_MARK_USED_VAR(try_conv_reformat_nchw42nchw32);
    auto on_opr = [&try_conv_dimshuffle_reshape_typecvt,
                   &try_conv_reformat_nchw42nchw32,
                   &try_conv_reformat_nchw42nhwc,
#if CUDA_VERSION >= 10020
                   &try_conv_reformat_nchw322nchw4,
#endif
                   &rewriter](OperatorNodeBase* opr) {
        if (!try_conv_dimshuffle_reshape_typecvt(opr) &&
            !try_conv_reformat_nchw42nchw32(opr) &&
            !try_conv_reformat_nchw42nhwc(opr)
#if CUDA_VERSION >= 10020
            && !try_conv_reformat_nchw322nchw4(opr)
#endif
            ) {
            rewriter.auto_replace_outputs(opr);
        }
@@ -4091,6 +4089,7 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const {
    MIDOUT_E
}
#endif

/* ==================== PaddingChannelPass ================= */
const char* PaddingChannelPass::name() const {
@@ -16,6 +16,10 @@
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/opr/search_policy/algo_chooser_helper.h"

#if MGB_CUDA
#include <cuda.h>
#endif

namespace mgb {
namespace gopt {
@@ -427,11 +431,13 @@ namespace gopt {
        void apply(OptState& opt) const override;
    };

#if CUDA_VERSION >= 10020
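    /*!
     * \brief fold dimshuffle (layout conversion) operators following a
     * ConvBias into the ConvBias opr itself, so the reformat runs inside
     * the fused conv kernel; only available with CUDA >= 10.2
     */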
    class FoldingConvBiasDimshufflePass final : public Pass {
    public:
        const char* name() const override;
        void apply(OptState& opt) const override;
    };
#endif

    /*!
     * \brief padding channel to enable fast int8/int4 support
@@ -4155,6 +4155,7 @@ TEST(TestGoptInference, WarpAndPreProcessCase1) {
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
}

#if CUDA_VERSION >= 10020
TEST(TestGoptInference, FoldingConvDimshuffle) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
@@ -4307,7 +4308,6 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NCHW32) {
    MGB_ASSERT_TENSOR_EQ(host_y_fuse, host_y_non_fuse);
}
#if CUDA_VERSION >= 10020
TEST(TestGoptInference, FoldingConvDimshuffleNCHW32NCHW4) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");