Browse Source

refactor(dnn/cuda): misc

GitOrigin-RevId: 1f8f91a0cc
release-1.1
Megvii Engine Team 4 years ago
parent
commit
4aa277a203
8 changed files with 118 additions and 49 deletions
  1. +12
    -11
      dnn/src/cuda/conv_bias/algo.cpp
  2. +17
    -4
      dnn/src/cuda/conv_bias/algo.h
  3. +42
    -8
      dnn/src/cuda/conv_bias/opr_impl.cpp
  4. +3
    -10
      dnn/src/cuda/conv_bias/opr_impl.h
  5. +16
    -0
      dnn/test/common/comparator.inl
  6. +9
    -1
      dnn/test/common/conv_bias.cpp
  7. +6
    -5
      dnn/test/common/tensor.inl
  8. +13
    -10
      dnn/test/cuda/conv_bias_int8.cpp

+ 12
- 11
dnn/src/cuda/conv_bias/algo.cpp View File

@@ -104,20 +104,19 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() {


ConvBiasForwardImpl::AlgoPack ConvBiasForwardImpl::sm_algo_pack; ConvBiasForwardImpl::AlgoPack ConvBiasForwardImpl::sm_algo_pack;


ConvBiasForwardImpl::AlgoBase::SizeArgs::SizeArgs(ConvBiasForwardImpl* o,
const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& bias,
const TensorLayout& z,
const TensorLayout& dst)
ConvBiasForwardImpl::AlgoBase::SizeArgs::SizeArgs(
ConvBiasForwardImpl* o, const TensorLayout& src,
const TensorLayout& filter, const TensorLayout& bias,
const TensorLayout& z, const TensorLayout& dst,
const PreprocessedFilter* preprocessed_filter)
: SizeArgs(o, src, filter, o->check_layout_fwd(src, filter, dst), bias, : SizeArgs(o, src, filter, o->check_layout_fwd(src, filter, dst), bias,
z, dst) {}
z, dst, preprocessed_filter) {}


ConvBiasForwardImpl::AlgoBase::SizeArgs::SizeArgs( ConvBiasForwardImpl::AlgoBase::SizeArgs::SizeArgs(
ConvBiasForwardImpl* o, const TensorLayout& src, ConvBiasForwardImpl* o, const TensorLayout& src,
const TensorLayout& filter, const CanonizedFilterMeta& filter_meta, const TensorLayout& filter, const CanonizedFilterMeta& filter_meta,
const TensorLayout& bias, const TensorLayout& z, const TensorLayout& bias, const TensorLayout& z,
const TensorLayout& dst)
const TensorLayout& dst, const PreprocessedFilter* preprocessed_filter)
: BiasForwardSizeArgs{concrete_handle(o->handle()), : BiasForwardSizeArgs{concrete_handle(o->handle()),
&src, &src,
&filter, &filter,
@@ -126,14 +125,16 @@ ConvBiasForwardImpl::AlgoBase::SizeArgs::SizeArgs(
filter_meta, filter_meta,
&dst, &dst,
o->param().nonlineMode}, o->param().nonlineMode},
opr{o} {}
opr{o},
preprocessed_filter{preprocessed_filter} {}


ConvBiasForwardImpl::AlgoBase::ExecArgs::ExecArgs( ConvBiasForwardImpl::AlgoBase::ExecArgs::ExecArgs(
ConvBiasForwardImpl* opr, _megdnn_tensor_in src, ConvBiasForwardImpl* opr, _megdnn_tensor_in src,
_megdnn_tensor_in filter, _megdnn_tensor_in bias, _megdnn_tensor_in z, _megdnn_tensor_in filter, _megdnn_tensor_in bias, _megdnn_tensor_in z,
_megdnn_tensor_out dst, _megdnn_workspace workspace)
_megdnn_tensor_out dst, _megdnn_workspace workspace,
const PreprocessedFilter* preprocessed_filter)
: SizeArgs(opr, src.layout, filter.layout, bias.layout, z.layout, : SizeArgs(opr, src.layout, filter.layout, bias.layout, z.layout,
dst.layout),
dst.layout, preprocessed_filter),
src_tensor{&src}, src_tensor{&src},
filter_tensor{&filter}, filter_tensor{&filter},
bias_tensor{&bias}, bias_tensor{&bias},


+ 17
- 4
dnn/src/cuda/conv_bias/algo.h View File

@@ -41,16 +41,19 @@ public:
AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::CUDA; } AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::CUDA; }
struct SizeArgs : public conv_bias::BiasForwardSizeArgs { struct SizeArgs : public conv_bias::BiasForwardSizeArgs {
ConvBiasForwardImpl* opr; ConvBiasForwardImpl* opr;

const PreprocessedFilter* preprocessed_filter;
std::string to_string() const; std::string to_string() const;
SizeArgs(ConvBiasForwardImpl* opr, const TensorLayout& src, SizeArgs(ConvBiasForwardImpl* opr, const TensorLayout& src,
const TensorLayout& filter, const TensorLayout& bias, const TensorLayout& filter, const TensorLayout& bias,
const TensorLayout& z, const TensorLayout& dst);
const TensorLayout& z, const TensorLayout& dst,
const PreprocessedFilter* preprocessed_filter = nullptr);
SizeArgs(ConvBiasForwardImpl* opr, const TensorLayout& src, SizeArgs(ConvBiasForwardImpl* opr, const TensorLayout& src,
const TensorLayout& filter, const TensorLayout& filter,
const CanonizedFilterMeta& filter_meta, const CanonizedFilterMeta& filter_meta,
const TensorLayout& bias, const TensorLayout& z, const TensorLayout& bias, const TensorLayout& z,
const TensorLayout& dst);
const TensorLayout& dst,
const PreprocessedFilter* preprocessed_filter = nullptr);


void init_conv_bias_desc(conv_bias::CUDNNForwardDescs& desc) const { void init_conv_bias_desc(conv_bias::CUDNNForwardDescs& desc) const {
desc.set_conv_bias(*src_layout, filter_meta, *dst_layout, desc.set_conv_bias(*src_layout, filter_meta, *dst_layout,
@@ -69,11 +72,21 @@ public:
ExecArgs(ConvBiasForwardImpl* opr, _megdnn_tensor_in src, ExecArgs(ConvBiasForwardImpl* opr, _megdnn_tensor_in src,
_megdnn_tensor_in filter, _megdnn_tensor_in bias, _megdnn_tensor_in filter, _megdnn_tensor_in bias,
_megdnn_tensor_in z, _megdnn_tensor_out dst, _megdnn_tensor_in z, _megdnn_tensor_out dst,
_megdnn_workspace workspace);
_megdnn_workspace workspace,
const PreprocessedFilter* preprocessed_filter = nullptr);
}; };
virtual bool is_available(const SizeArgs& args) const = 0; virtual bool is_available(const SizeArgs& args) const = 0;
virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0; virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0;
virtual void exec(const ExecArgs& args) const = 0; virtual void exec(const ExecArgs& args) const = 0;
/// Workspace size in bytes that this algorithm reports for filter
/// preprocessing with \p args; queried through
/// ConvBiasForwardImpl::get_preprocess_workspace_in_bytes().
/// This base implementation ignores \p args and reports 0 bytes.
virtual size_t get_preprocess_workspace_in_bytes(
        const SizeArgs& args) const {
    static_cast<void>(args);  // unused by the base implementation
    const size_t no_workspace = 0;
    return no_workspace;
}
/// Layouts of the preprocessed filter tensors for \p args; queried through
/// ConvBiasForwardImpl::deduce_preprocessed_filter_layout().
/// This base implementation ignores \p args and returns an empty list.
virtual SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
        const SizeArgs& args) const {
    static_cast<void>(args);  // unused by the base implementation
    SmallVector<TensorLayout> no_layouts;
    return no_layouts;
}
/// Preprocessing hook invoked by ConvBiasForwardImpl::exec_preprocess().
/// This base implementation ignores \p args and does nothing.
virtual void exec_preprocess(const ExecArgs& args) const {
    static_cast<void>(args);  // intentionally a no-op
}


bool is_available_wk(const SizeArgs& args, size_t limit) { bool is_available_wk(const SizeArgs& args, size_t limit) {
return is_available(args) && get_workspace_in_bytes(args) <= limit; return is_available(args) && get_workspace_in_bytes(args) <= limit;


+ 42
- 8
dnn/src/cuda/conv_bias/opr_impl.cpp View File

@@ -29,7 +29,8 @@ void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_workspace workspace) { _megdnn_workspace workspace) {
check_exec(src.layout, filter.layout, bias.layout, z.layout, dst.layout, check_exec(src.layout, filter.layout, bias.layout, z.layout, dst.layout,
workspace.size, preprocessed_filter); workspace.size, preprocessed_filter);
AlgoBase::ExecArgs args(this, src, filter, bias, z, dst, workspace);
AlgoBase::ExecArgs args(this, src, filter, bias, z, dst, workspace,
preprocessed_filter);
auto algo = get_algorithm(this, src.layout, filter.layout, bias.layout, auto algo = get_algorithm(this, src.layout, filter.layout, bias.layout,
z.layout, dst.layout); z.layout, dst.layout);
algo->check_workspace(args, workspace).exec(args); algo->check_workspace(args, workspace).exec(args);
@@ -205,17 +206,50 @@ const char* ConvBiasForwardImpl::get_algorithm_set_name() const {
return "CONV_BIAS_CUDA"; return "CONV_BIAS_CUDA";
} }


size_t ConvBiasForwardImpl::get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& bias,
const TensorLayout& z,
const TensorLayout& dst,
const PreprocessedFilter*) {
AlgoBase::SizeArgs args{this, src, filter, bias, z, dst};
size_t ConvBiasForwardImpl::get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& bias, const TensorLayout& z,
const TensorLayout& dst,
const PreprocessedFilter* preprocessed_filter) {
AlgoBase::SizeArgs args{
this, src, filter, bias, z, dst, preprocessed_filter};
return get_algorithm(this, src, filter, bias, z, dst) return get_algorithm(this, src, filter, bias, z, dst)
->get_workspace_in_bytes(args); ->get_workspace_in_bytes(args);
}; };


size_t ConvBiasForwardImpl::get_preprocess_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& bias, const TensorLayout& z,
const TensorLayout& dst) {
AlgoBase::SizeArgs args{this, src, filter, bias, z, dst};
return get_algorithm(this, src, filter, bias, z, dst)
->get_preprocess_workspace_in_bytes(args);
}

/// Returns the preprocessed-filter layouts reported by the algorithm that
/// get_algorithm() selects for the given layouts.
SmallVector<TensorLayout>
ConvBiasForwardImpl::deduce_preprocessed_filter_layout(
        const TensorLayout& src, const TensorLayout& filter,
        const TensorLayout& bias, const TensorLayout& z,
        const TensorLayout& dst) {
    // The size arguments are built before the algorithm lookup, as before.
    AlgoBase::SizeArgs size_args{this, src, filter, bias, z, dst};
    auto chosen_algo = get_algorithm(this, src, filter, bias, z, dst);
    return chosen_algo->deduce_preprocessed_filter_layout(size_args);
}

void ConvBiasForwardImpl::exec_preprocess(
const TensorLayout& src_layout, _megdnn_tensor_in filter,
const TensorLayout& bias_layout, const TensorLayout& z_layout,
const TensorLayout& dst_layout, PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) {
TensorND src{nullptr, src_layout}, dst{nullptr, dst_layout},
z{nullptr, z_layout}, bias{nullptr, bias_layout};
AlgoBase::ExecArgs args(this, src, filter, bias, z, dst, workspace,
preprocessed_filter);
auto algo = get_algorithm(this, src.layout, filter.layout, bias.layout,
z.layout, dst.layout);
return algo->exec_preprocess(args);
}

} // namespace cuda } // namespace cuda
} // namespace megdnn } // namespace megdnn




+ 3
- 10
dnn/src/cuda/conv_bias/opr_impl.h View File

@@ -44,21 +44,14 @@ public:
const TensorLayout&, const TensorLayout&,
const TensorLayout&, const TensorLayout&,
const TensorLayout&, const TensorLayout&,
const TensorLayout&) override {
return 0;
};
const TensorLayout&) override;
SmallVector<TensorLayout> deduce_preprocessed_filter_layout( SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
const TensorLayout&, const TensorLayout&, const TensorLayout&, const TensorLayout&, const TensorLayout&, const TensorLayout&,
const TensorLayout&, const TensorLayout&) override {
return {};
}
const TensorLayout&, const TensorLayout&) override;
void exec_preprocess(const TensorLayout&, _megdnn_tensor_in, void exec_preprocess(const TensorLayout&, _megdnn_tensor_in,
const TensorLayout&, const TensorLayout&, const TensorLayout&, const TensorLayout&,
const TensorLayout&, PreprocessedFilter*, const TensorLayout&, PreprocessedFilter*,
_megdnn_workspace) override {
megdnn_throw("cuda conv_bias exec_preprocess has not implemeted yet");
}

_megdnn_workspace) override;
const char* get_algorithm_set_name() const override; const char* get_algorithm_set_name() const override;


class AlgoBase; class AlgoBase;


+ 16
- 0
dnn/test/common/comparator.inl View File

@@ -32,6 +32,22 @@ class DefaultComparator<dt_float32> {
} }
}; };


/// Comparator specialization for dt_qint8: exact equality, no tolerance.
template <>
class DefaultComparator<dt_qint8> {
public:
    /// \return true iff as_int8() yields the same value for both arguments
    bool is_same(dt_qint8 expected, dt_qint8 actual) const {
        const auto want = expected.as_int8();
        const auto got = actual.as_int8();
        return want == got;
    }
};

/// Comparator specialization for dt_qint32: exact equality, no tolerance.
template <>
class DefaultComparator<dt_qint32> {
public:
    /// \return true iff as_int32() yields the same value for both arguments
    bool is_same(dt_qint32 expected, dt_qint32 actual) const {
        const auto want = expected.as_int32();
        const auto got = actual.as_int32();
        return want == got;
    }
};

} // namespace test } // namespace test
} // namespace megdnn } // namespace megdnn




+ 9
- 1
dnn/test/common/conv_bias.cpp View File

@@ -741,10 +741,12 @@ void check_conv_bias(DType src_dtype, DType filter_dtype, DType bias_dtype,
std::unique_ptr<RNG> rng; std::unique_ptr<RNG> rng;
std::unique_ptr<RNG> bias_rng; std::unique_ptr<RNG> bias_rng;
std::unique_ptr<RNG> const_rng; std::unique_ptr<RNG> const_rng;
std::unique_ptr<RNG> zero_rng;
// TODO: check range of rng // TODO: check range of rng
if (src_dtype.enumv() == DTypeEnum::QuantizedS8) { if (src_dtype.enumv() == DTypeEnum::QuantizedS8) {
rng = std::make_unique<UniformIntRNG>(-3, 3); rng = std::make_unique<UniformIntRNG>(-3, 3);
const_rng = std::make_unique<UniformIntRNG>(1, 1); const_rng = std::make_unique<UniformIntRNG>(1, 1);
zero_rng = std::make_unique<UniformIntRNG>(0, 0);
megdnn_assert(bias_dtype.enumv() == DTypeEnum::QuantizedS32); megdnn_assert(bias_dtype.enumv() == DTypeEnum::QuantizedS32);
bias_rng = std::make_unique<UniformIntRNG>(-50, 50); bias_rng = std::make_unique<UniformIntRNG>(-50, 50);
checker.set_epsilon(1 + 1e-3) checker.set_epsilon(1 + 1e-3)
@@ -775,6 +777,12 @@ void check_conv_bias(DType src_dtype, DType filter_dtype, DType bias_dtype,
fh = arg.filter[2]; fh = arg.filter[2];
fw = arg.filter[3]; fw = arg.filter[3];
z[1] = arg.filter[0] / 4; z[1] = arg.filter[0] / 4;
} else if (format == Format::NCHW32) {
hi = arg.src[2];
wi = arg.src[3];
fh = arg.filter[2];
fw = arg.filter[3];
z[1] = arg.filter[0] / 32;
} else { } else {
megdnn_assert(format == Format::CHWN4); megdnn_assert(format == Format::CHWN4);
hi = arg.src[1]; hi = arg.src[1];
@@ -798,7 +806,7 @@ void check_conv_bias(DType src_dtype, DType filter_dtype, DType bias_dtype,
megdnn_assert(rng != nullptr && bias_rng != nullptr); megdnn_assert(rng != nullptr && bias_rng != nullptr);
checker.set_rng(0, rng.get()) checker.set_rng(0, rng.get())
.set_rng(1, rng.get()) .set_rng(1, rng.get())
.set_rng(2, rng.get())
.set_rng(2, bias_rng.get())
.set_rng(3, rng.get()); .set_rng(3, rng.get());
if (args.empty()) { if (args.empty()) {
std::vector<TestArg> default_args; std::vector<TestArg> default_args;


+ 6
- 5
dnn/test/common/tensor.inl View File

@@ -24,7 +24,8 @@ Tensor<T, C>::Tensor(Handle *handle, TensorLayout layout):
m_handle(handle), m_handle(handle),
m_comparator(C()) m_comparator(C())
{ {
layout.dtype = get_dtype_from_static_type<T>();
if (!layout.dtype.valid())
layout.dtype = get_dtype_from_static_type<T>();
m_tensornd.raw_ptr = megdnn_malloc(m_handle, layout.span().dist_byte()); m_tensornd.raw_ptr = megdnn_malloc(m_handle, layout.span().dist_byte());
m_tensornd.layout = layout; m_tensornd.layout = layout;
} }
@@ -67,10 +68,10 @@ void Tensor<T, C>::check_with(const Tensor<T, C_> &rhs) const
auto index = Index(m_tensornd.layout, linear_idx); auto index = Index(m_tensornd.layout, linear_idx);
auto offset = index.positive_offset(); auto offset = index.positive_offset();
ASSERT_TRUE(m_comparator.is_same(p0[offset], p1[offset])) ASSERT_TRUE(m_comparator.is_same(p0[offset], p1[offset]))
<< "Index is " << index.to_string()
<< "; layout is " << m_tensornd.layout.to_string()
<< "; this->ptr()[offset] is " << this->ptr()[offset]
<< "; rhs.ptr()[offset] is " << rhs.ptr()[offset];
<< "Index is " << index.to_string() << "; layout is "
<< m_tensornd.layout.to_string() << "; this->ptr()[offset] is "
<< this->ptr()[offset] << "; rhs.ptr()[offset] is "
<< rhs.ptr()[offset];
} }
} }




+ 13
- 10
dnn/test/cuda/conv_bias_int8.cpp View File

@@ -18,6 +18,8 @@
#include "test/cuda/benchmark.h" #include "test/cuda/benchmark.h"
#include "test/cuda/fixture.h" #include "test/cuda/fixture.h"
#include "test/cuda/utils.h" #include "test/cuda/utils.h"
#include "test/common/tensor.h"
#include "test/common/workspace_wrapper.h"


#define V1(x) #x #define V1(x) #x
#define V(x) V1(x) #define V(x) V1(x)
@@ -34,7 +36,6 @@ struct BenchArgs {
std::vector<BenchArgs> get_resnet50_bench_args(size_t batch = 64) { std::vector<BenchArgs> get_resnet50_bench_args(size_t batch = 64) {
std::vector<BenchArgs> args; std::vector<BenchArgs> args;
args.emplace_back(BenchArgs{batch, 64, 56, 56, 256, 1, 1}); args.emplace_back(BenchArgs{batch, 64, 56, 56, 256, 1, 1});

args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 1}); args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 1});
args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 2}); args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 2});
args.emplace_back(BenchArgs{batch, 4, 256, 256, 32, 7, 2}); args.emplace_back(BenchArgs{batch, 4, 256, 256, 32, 7, 2});
@@ -44,7 +45,6 @@ std::vector<BenchArgs> get_resnet50_bench_args(size_t batch = 64) {
args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 1}); args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 1});
args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 2}); args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 2});
args.emplace_back(BenchArgs{batch, 256, 56, 56, 64, 3, 2}); args.emplace_back(BenchArgs{batch, 256, 56, 56, 64, 3, 2});
args.emplace_back(BenchArgs{batch, 64, 56, 56, 256, 1, 1});


args.emplace_back(BenchArgs{batch, 256, 56, 56, 512, 1, 2}); args.emplace_back(BenchArgs{batch, 256, 56, 56, 512, 1, 2});
args.emplace_back(BenchArgs{batch, 256, 56, 56, 128, 1, 2}); args.emplace_back(BenchArgs{batch, 256, 56, 56, 128, 1, 2});
@@ -57,6 +57,7 @@ std::vector<BenchArgs> get_resnet50_bench_args(size_t batch = 64) {
args.emplace_back(BenchArgs{batch, 1024, 14, 14, 256, 1, 1}); args.emplace_back(BenchArgs{batch, 1024, 14, 14, 256, 1, 1});
args.emplace_back(BenchArgs{batch, 256, 14, 14, 256, 3, 1}); args.emplace_back(BenchArgs{batch, 256, 14, 14, 256, 3, 1});
args.emplace_back(BenchArgs{batch, 256, 14, 14, 1024, 1, 1}); args.emplace_back(BenchArgs{batch, 256, 14, 14, 1024, 1, 1});
args.emplace_back(BenchArgs{batch, 256, 14, 14, 1024, 1, 2});


args.emplace_back(BenchArgs{batch, 1024, 14, 14, 2048, 1, 2}); args.emplace_back(BenchArgs{batch, 1024, 14, 14, 2048, 1, 2});
args.emplace_back(BenchArgs{batch, 1024, 14, 14, 512, 1, 2}); args.emplace_back(BenchArgs{batch, 1024, 14, 14, 512, 1, 2});
@@ -331,6 +332,12 @@ void benchmark_target_algo_with_cudnn_tsc(
if ((format == Format::CHWN4 || format == Format::NCHW4) && if ((format == Format::CHWN4 || format == Format::NCHW4) &&
(arg.ci % 16 != 0)) (arg.ci % 16 != 0))
continue; continue;
Format format_cudnn = arg.ci % 32 == 0 && arg.co % 32 == 0
? Format::NCHW32
: Format::NCHW4;
param.format = format_cudnn;
benchmarker_cudnn.set_param(param);

float time_in_ms = 0.f; float time_in_ms = 0.f;
if (algo) { if (algo) {
time_in_ms = time_in_ms =
@@ -351,18 +358,14 @@ void benchmark_target_algo_with_cudnn_tsc(
{}}) / {}}) /
RUNS; RUNS;
} }
Format format_cudnn = arg.ci % 32 == 0 && arg.co % 32 == 0
? Format::NCHW32
: Format::NCHW4;
param.format = format_cudnn;
benchmarker_cudnn.set_param(param);
auto time_in_ms_cudnn =
float time_in_ms_cudnn =
benchmarker_cudnn.execs({get_tensor_shape(src, format_cudnn), benchmarker_cudnn.execs({get_tensor_shape(src, format_cudnn),
get_tensor_shape(filter, format_cudnn), get_tensor_shape(filter, format_cudnn),
get_tensor_shape(bias, format_cudnn), get_tensor_shape(bias, format_cudnn),
{}, {},
{}}) / {}}) /
RUNS; RUNS;

float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f * arg.f / float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f * arg.f /
(1e12); (1e12);
printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, " printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
@@ -1075,8 +1078,8 @@ TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_2) {




#if CUDA_VERSION >= 10020 #if CUDA_VERSION >= 10020
/// \note: we only check several cases and block sizes in megdnn_test, the full
/// testcases are written in cutlass repository
/// \note: we only check several cases and block sizes in megdnn_test, the
/// full testcases are written in cutlass repository
TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_IMMA) { TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_IMMA) {
require_compute_capability_eq(7, 5); require_compute_capability_eq(7, 5);
Checker<ConvBiasForward> checker(handle_cuda()); Checker<ConvBiasForward> checker(handle_cuda());


Loading…
Cancel
Save