GitOrigin-RevId: d1b95a6f01
release-1.5
@@ -1060,6 +1060,46 @@ TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL) {
                            param::ConvBias::Format::CHWN4);
 }
+TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4_NCHW) {
+    CUBenchmarker<ConvBiasForward> benchmarker(handle_cuda());
+    size_t RUNS = 1000;
+    benchmarker.set_display(false).set_times(RUNS);
+    using namespace conv_bias;
+    UniformIntRNG int_rng{-3, 3};
+    UniformIntRNG bias_rng{-50, 50};
+    ConvBias::Param param;
+    param.format = ConvBias::Param::Format::NCHW4_NCHW;
+    param.nonlineMode = ConvBias::Param::NonlineMode::IDENTITY;
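+    // restrict the benchmark to the INT8_NCHW4_DOTPROD_IMPLICIT_GEMM algorithm;
+    // src/filter are quantized int8, bias/z/dst stay float32 (output layout is plain NCHW)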
+    benchmarker.set_before_exec_callback(
+            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
+                    "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
+    benchmarker.set_dtype(0, dtype::QuantizedS8(1.9980618f))
+            .set_dtype(1, dtype::QuantizedS8(1.9980927f))
+            .set_dtype(2, dtype::Float32())
+            .set_dtype(3, dtype::Float32())
+            .set_dtype(4, dtype::Float32())
+            .set_rng(0, &int_rng)
+            .set_rng(1, &int_rng)
+            .set_param(param);
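+    // benchmark one shape set and print the average time per run in milliseconds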
+    auto run = [&](const TensorShapeArray& shapes) {
+        auto time_in_ms =
+                benchmarker.execs({shapes[0], shapes[1], shapes[2], {}, {}}) /
+                RUNS;
+        printf("src=%s, filter=%s, dst=%s, time=%.2f\n",
+               shapes[0].to_string().c_str(), shapes[1].to_string().c_str(),
+               shapes[2].to_string().c_str(), time_in_ms);
+    };
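+    // src is NCHW4 {N, C/4, H, W, 4}, filter is {OC, IC/4, FH, FW, 4}, bias is NCHW {1, OC, 1, 1}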
+    run({{16, 16, 224, 224, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}});
+    run({{16, 16, 92, 160, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}});
+    run({{16, 16, 46, 80, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}});
+}
 #if CUDA_VERSION >= 10020
 TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW32) {
@@ -772,7 +772,9 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options(
         add_pass<RemoveRedundantTypeCvtPass>();
         add_pass(FuseNCHW4Int8Preprocess::make());
         add_pass<FuseWarpPerspectiveDimshufflePass>();
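+        // FoldingConvBiasDimshufflePass is only compiled when CUDA_VERSION >= 10020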
+#if CUDA_VERSION >= 10020
         add_pass<FoldingConvBiasDimshufflePass>();
+#endif
     });
     cb(chwn4, {
         add_pass<FuseConvBiasNonlinPass>();
@@ -791,7 +793,9 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options(
         add_pass<RemoveRedundantTypeCvtPass>();
         add_pass(FuseNCHW4Int8Preprocess::make());
         add_pass<FuseWarpPerspectiveDimshufflePass>();
+#if CUDA_VERSION >= 10020
         add_pass<FoldingConvBiasDimshufflePass>();
+#endif
     });
     cb(fuse_conv_bias_nonlinearity, { add_pass<FuseConvBiasNonlinPass>(); });
@@ -3638,6 +3638,7 @@ void ShuffleShuffleRemovePass::apply(OptState& opt) const {
     MIDOUT_E
 }
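+// the whole FoldingConvBiasDimshufflePass implementation is guarded here, which makes the
+// per-rewrite CUDA_VERSION checks inside apply() redundant (removed below)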
+#if CUDA_VERSION >= 10020
 /* ==================== FoldingConvBiasDimshufflePass ================= */
 const char* FoldingConvBiasDimshufflePass::name() const {
     return mgb_cstr_log("folding conv bias dimshuffle pass");
@@ -4068,20 +4069,17 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const {
         return true;
     };
     MGB_MARK_USED_VAR(try_conv_reformat_nchw322nchw4);
+    MGB_MARK_USED_VAR(try_conv_reformat_nchw42nchw32);
     auto on_opr = [&try_conv_dimshuffle_reshape_typecvt,
                    &try_conv_reformat_nchw42nchw32,
                    &try_conv_reformat_nchw42nhwc,
-#if CUDA_VERSION >= 10020
                    &try_conv_reformat_nchw322nchw4,
-#endif
                    &rewriter](OperatorNodeBase* opr) {
         if (!try_conv_dimshuffle_reshape_typecvt(opr) &&
             !try_conv_reformat_nchw42nchw32(opr) &&
             !try_conv_reformat_nchw42nhwc(opr)
-#if CUDA_VERSION >= 10020
             && !try_conv_reformat_nchw322nchw4(opr)
-#endif
         ) {
             rewriter.auto_replace_outputs(opr);
         }
@@ -4091,6 +4089,7 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const {
     MIDOUT_E
 }
+#endif
 /* ==================== PaddingChannelPass ================= */
 const char* PaddingChannelPass::name() const {
@@ -16,6 +16,10 @@
 #include "megbrain/opr/dnn/convolution.h"
 #include "megbrain/opr/search_policy/algo_chooser_helper.h"
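+// <cuda.h> defines CUDA_VERSION, used below to guard FoldingConvBiasDimshufflePass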
+#if MGB_CUDA
+#include <cuda.h>
+#endif
+
 namespace mgb {
 namespace gopt {
@@ -427,11 +431,13 @@ namespace gopt {
     void apply(OptState& opt) const override;
 };
+#if CUDA_VERSION >= 10020
 class FoldingConvBiasDimshufflePass final : public Pass {
 public:
     const char* name() const override;
     void apply(OptState& opt) const override;
 };
+#endif
 /*!
  * \brief padding channel to enable fast int8/int4 support
@@ -4155,6 +4155,7 @@ TEST(TestGoptInference, WarpAndPreProcessCase1) {
     MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
 }
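+// the FoldingConvDimshuffle tests exercise FoldingConvBiasDimshufflePass and are
+// compiled out when CUDA is older than 10.2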
+#if CUDA_VERSION >= 10020
 TEST(TestGoptInference, FoldingConvDimshuffle) {
     REQUIRE_GPU(1);
     auto cn = CompNode::load("gpu0");
@@ -4307,7 +4308,6 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NCHW32) {
     MGB_ASSERT_TENSOR_EQ(host_y_fuse, host_y_non_fuse);
 }
-#if CUDA_VERSION >= 10020
 TEST(TestGoptInference, FoldingConvDimshuffleNCHW32NCHW4) {
     REQUIRE_GPU(1);
     auto cn = CompNode::load("gpu0");