|
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035 |
- #include "megdnn/dtype.h"
- #include "megdnn/opr_param_defs.h"
- #include "megdnn/oprs.h"
- #include "megdnn/oprs/nn.h"
- #include "test/common/checker.h"
- #include "test/common/conv_bias.h"
- #include "test/common/rng.h"
- #include "test/common/tensor.h"
- #include "test/common/workspace_wrapper.h"
- #include "test/cuda/benchmark.h"
- #include "test/cuda/fixture.h"
- #include "test/cuda/utils.h"
-
- #include <cudnn.h>
- #include <gtest/gtest.h>
-
// Two-level stringification helpers: V1 stringizes its argument as written,
// while V forwards through V1 so that a macro argument is expanded before it
// is turned into a string literal.
- #define V1(x) #x
- #define V(x) V1(x)
// String literal of the form "v<major>.<minor>.<patch>", assembled from the
// CUDNN_MAJOR / CUDNN_MINOR / CUDNN_PATCHLEVEL macros (presumably provided by
// <cudnn.h> -- confirm).
- #define CUDNN_VERSION_STRING \
- "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)
-
- namespace megdnn {
- namespace test {
-
- TEST_F(CUDA, REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER) {
- Checker<RegionRestrictedConvolutionForward> checker(handle_cuda());
- auto opr = handle_cuda()->create_operator<ConvolutionForward>();
- for (auto dt : std::vector<DType>{dtype::Int32(), dtype::Uint8()}) {
- auto run = [&checker, &dt, &opr](
- size_t n, size_t g, size_t h, size_t fh, size_t padding,
- size_t stride) {
- RegionRestrictedConvolution::Param cur_param;
- cur_param.mode =
- RegionRestrictedConvolution::Param::Mode::CROSS_CORRELATION;
- cur_param.sparse = RegionRestrictedConvolution::Param::Sparse::GROUP;
- checker.set_dtype(2, dt).set_dtype(3, dt);
- float scale = 64.f / sqrt(fh * fh);
- UniformFloatRNG rng(scale, 2 * scale);
- UniformIntRNG r_rng{0, 2};
- checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng(
- 3, &r_rng);
-
- cur_param.pad_h = cur_param.pad_w = padding;
- cur_param.stride_h = cur_param.stride_w = stride;
-
- size_t ho = infer_conv_shape(h, fh, stride, padding);
-
- checker.set_param(cur_param).execs(
- {{n, g, h, h}, {g, 1, 1, fh, fh}, {n, h, h}, {n, ho, ho}, {}});
- };
- run(1, 1, 3, 2, 1, 1);
- run(1, 1, 5, 2, 1, 1);
- run(1, 1, 6, 2, 1, 1);
- run(1, 1, 7, 2, 1, 1);
- run(1, 1, 9, 2, 1, 1);
- run(1, 1, 10, 2, 1, 1);
- run(1, 1, 11, 2, 1, 1);
- run(1, 1, 13, 2, 1, 1);
- run(1, 1, 14, 2, 1, 1);
- run(1, 1, 15, 2, 1, 1);
- run(1, 1, 17, 2, 1, 1);
- run(1, 1, 18, 2, 1, 1);
- run(1, 1, 19, 2, 1, 1);
- run(1, 1, 21, 2, 1, 1);
- run(1, 1, 22, 2, 1, 1);
- run(1, 1, 23, 2, 1, 1);
- run(1, 1, 25, 2, 1, 1);
- run(1, 1, 26, 2, 1, 1);
- run(1, 1, 27, 2, 1, 1);
- run(1, 1, 29, 2, 1, 1);
- run(1, 1, 30, 2, 1, 1);
- run(1, 1, 31, 2, 1, 1);
- run(4, 8, 32, 3, 3 / 2, 1);
- run(4, 8, 32, 5, 5 / 2, 1);
- run(4, 8, 32, 7, 7 / 2, 1);
- run(4, 8, 32, 9, 9 / 2, 1);
- run(4, 8, 32, 11, 11 / 2, 1);
- run(4, 8, 32, 13, 13 / 2, 1);
- run(4, 8, 32, 15, 15 / 2, 1);
- run(4, 8, 32, 17, 17 / 2, 1);
- run(4, 8, 32, 19, 19 / 2, 1);
- run(4, 8, 32, 21, 21 / 2, 1);
- run(4, 8, 32, 23, 23 / 2, 1);
- run(4, 8, 32, 25, 25 / 2, 1);
- run(4, 8, 32, 27, 27 / 2, 1);
- run(4, 8, 32, 29, 29 / 2, 1);
- run(4, 8, 32, 31, 31 / 2, 1);
- run(4, 8, 31, 3, 3 / 2, 1);
- run(4, 8, 31, 5, 5 / 2, 1);
- run(4, 8, 31, 7, 7 / 2, 1);
- run(4, 8, 31, 9, 9 / 2, 1);
- run(4, 8, 31, 11, 11 / 2, 1);
- run(4, 8, 31, 13, 13 / 2, 1);
- run(4, 8, 31, 15, 15 / 2, 1);
- run(4, 8, 31, 17, 17 / 2, 1);
- run(4, 8, 31, 19, 19 / 2, 1);
- run(4, 8, 31, 21, 21 / 2, 1);
- run(4, 8, 31, 23, 23 / 2, 1);
- run(4, 8, 31, 25, 25 / 2, 1);
- run(4, 8, 31, 27, 27 / 2, 1);
- run(4, 8, 31, 29, 29 / 2, 1);
- run(4, 8, 31, 31, 31 / 2, 1);
- }
- }
-
- #if MEGDNN_WITH_BENCHMARK
-
- TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER_FP32_INT32) {
- require_compute_capability(7, 5);
- Benchmarker<ConvBiasForward> bencher(handle_cuda());
- bencher.set_display(false);
- bencher.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
- ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
- "DEPTHWISE_LARGE_FILTER", {})
- .c_str()));
-
- Benchmarker<RegionRestrictedConvolutionForward> rr_bencher(handle_cuda());
- rr_bencher.set_display(false);
-
- ConvBias::Param param;
- param.format = ConvBias::Param::Format::NCHW;
- using NonlineMode = ConvBias::Param::NonlineMode;
- param.nonlineMode = NonlineMode::IDENTITY;
- param.sparse = ConvBias::Param::Sparse::GROUP;
-
- RegionRestrictedConvolutionForward::Param rr_param;
- rr_param.format = RegionRestrictedConvolutionForward::Param::Format::NCHW;
- rr_param.sparse = RegionRestrictedConvolutionForward::Param::Sparse::GROUP;
-
- UniformIntRNG r_rng{0, 2};
-
- auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
- size_t fw, size_t sh, size_t sw, size_t nr_times) {
- param.pad_h = fh / 2;
- param.pad_w = fw / 2;
- param.stride_h = sh;
- param.stride_w = sw;
-
- rr_param.pad_h = fh / 2;
- rr_param.pad_w = fw / 2;
- rr_param.stride_h = sh;
- rr_param.stride_w = sw;
-
- bencher.set_param(param)
- .set_dtype(0, dtype::Float32())
- .set_dtype(1, dtype::Float32())
- .set_dtype(2, dtype::Float32())
- .set_dtype(4, dtype::Float32());
- bencher.set_times(nr_times);
-
- rr_bencher.set_param(rr_param)
- .set_dtype(0, dtype::Float32())
- .set_dtype(1, dtype::Float32())
- .set_dtype(2, dtype::Int32())
- .set_dtype(3, dtype::Int32());
- rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng);
- rr_bencher.set_times(nr_times);
-
- size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
- size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
- TensorShape inp{batch, g, hi, wi}, kern{g, 1, 1, fh, fw}, rin{batch, hi, wi},
- rout{batch, ho, wo}, out{batch, g, ho, wo};
-
- float bandwith = static_cast<float>(
- inp.total_nr_elems() + kern.total_nr_elems() +
- out.total_nr_elems()) /
- (1024 * 1024 * 1024) * 1e3;
-
- float rr_bandwith = static_cast<float>(
- inp.total_nr_elems() + kern.total_nr_elems() +
- rin.total_nr_elems() + rout.total_nr_elems() +
- out.total_nr_elems()) /
- (1024 * 1024 * 1024) * 1e3;
-
- auto time_in_ms = bencher.execs({inp, kern, {}, {}, out}) / nr_times;
- auto ops = 2.0 * batch * g * ho * wo * fh * fw / (time_in_ms * 1e-3) * 1e-12;
-
- auto rr_time_in_ms = rr_bencher.execs({inp, kern, rin, rout, out}) / nr_times;
- auto rr_ops =
- 2.0 * batch * g * ho * wo * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
- printf("RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: inp=%s, "
- "kern=%s, out=%s\n"
- "time: %.2f ms, time(rr): %.2f ms, perf: %.2fTops, perf(rr): %.2f Tops\n"
- "bandwidth: %.2fGB/s, bandwidth(rr): %.2fGB/s, speedup: %.2f.\n",
- inp.to_string().c_str(), kern.to_string().c_str(),
- out.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
- bandwith * 4 / time_in_ms, rr_bandwith * 4 / rr_time_in_ms,
- time_in_ms / rr_time_in_ms);
- };
-
- run_bench(64, 384, 32, 32, 3, 3, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 5, 5, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 7, 7, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 9, 9, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 11, 11, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 13, 13, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 15, 15, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 17, 17, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 19, 19, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 21, 21, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 23, 23, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 25, 25, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 27, 27, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 29, 29, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 31, 31, 1, 1, 1000);
- }
-
- TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_DATA_FP32_INT32) {
- require_compute_capability(7, 5);
- Benchmarker<ConvolutionBackwardData> bencher(handle_cuda());
- bencher.set_display(false);
- bencher.set_before_exec_callback(
- AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));
-
- Benchmarker<RegionRestrictedConvolutionBackwardData> rr_bencher(handle_cuda());
- rr_bencher.set_display(false);
-
- ConvolutionBackwardData::Param param;
- param.format = ConvolutionBackwardData::Param::Format::NCHW;
- param.sparse = ConvolutionBackwardData::Param::Sparse::GROUP;
-
- RegionRestrictedConvolutionBackwardData::Param rr_param;
- rr_param.format = RegionRestrictedConvolutionBackwardData::Param::Format::NCHW;
- rr_param.sparse = RegionRestrictedConvolutionBackwardData::Param::Sparse::GROUP;
-
- UniformIntRNG r_rng{1, 3};
-
- auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
- size_t fw, size_t sh, size_t sw, size_t nr_times) {
- param.pad_h = fh / 2;
- param.pad_w = fw / 2;
- param.stride_h = sh;
- param.stride_w = sw;
-
- rr_param.pad_h = fh / 2;
- rr_param.pad_w = fw / 2;
- rr_param.stride_h = sh;
- rr_param.stride_w = sw;
-
- bencher.set_param(param)
- .set_dtype(0, dtype::Float32())
- .set_dtype(1, dtype::Float32())
- .set_dtype(2, dtype::Float32())
- .set_dtype(4, dtype::Float32());
- bencher.set_times(nr_times);
-
- rr_bencher.set_param(rr_param)
- .set_dtype(0, dtype::Float32())
- .set_dtype(1, dtype::Float32())
- .set_dtype(2, dtype::Int32())
- .set_dtype(3, dtype::Int32());
- rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng);
- rr_bencher.set_times(nr_times);
-
- size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
- size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
- TensorShape inp{batch, g, hi, wi} /*src*/, kern{g, 1, 1, fh, fw} /*filter*/,
- rin{batch, hi, wi}, rout{batch, ho, wo},
- out{batch, g, ho, wo} /*output*/;
-
- float bandwith = static_cast<float>(
- inp.total_nr_elems() + kern.total_nr_elems() +
- out.total_nr_elems()) /
- (1024 * 1024 * 1024) * 1e3;
-
- float rr_bandwith = static_cast<float>(
- inp.total_nr_elems() + kern.total_nr_elems() +
- rin.total_nr_elems() + rout.total_nr_elems() +
- out.total_nr_elems()) /
- (1024 * 1024 * 1024) * 1e3;
-
- auto time_in_ms = bencher.execs({kern, out, inp}) / nr_times;
- auto ops = 2.0 * batch * g * ho * wo * fh * fw / (time_in_ms * 1e-3) * 1e-12;
-
- auto rr_time_in_ms = rr_bencher.execs({kern, out, rin, rout, inp}) / nr_times;
- auto rr_ops =
- 2.0 * batch * g * ho * wo * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
- printf("[DGRAD]RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: "
- "grad=%s, "
- "kern=%s, diff=%s\n"
- "time: %.2f ms, time(rr): %.2f ms, perf: %.2fTops, perf(rr): %.2f Tops\n"
- "bandwidth: %.2fGB/s, bandwidth(rr): %.2fGB/s, speedup: %.2f.\n",
- inp.to_string().c_str(), kern.to_string().c_str(),
- out.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
- bandwith * 4 / time_in_ms, rr_bandwith * 4 / rr_time_in_ms,
- time_in_ms / rr_time_in_ms);
- };
-
- run_bench(64, 384, 32, 32, 3, 3, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 5, 5, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 7, 7, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 9, 9, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 11, 11, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 13, 13, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 15, 15, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 17, 17, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 19, 19, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 21, 21, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 23, 23, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 25, 25, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 27, 27, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 29, 29, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 31, 31, 1, 1, 1000);
- }
-
- TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_DATA_FP32_UINT8) {
- require_compute_capability(7, 5);
- Benchmarker<ConvolutionBackwardData> bencher(handle_cuda());
- bencher.set_display(false);
- bencher.set_before_exec_callback(
- AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));
-
- Benchmarker<RegionRestrictedConvolutionBackwardData> rr_bencher(handle_cuda());
- rr_bencher.set_display(false);
-
- ConvolutionBackwardData::Param param;
- param.format = ConvolutionBackwardData::Param::Format::NCHW;
- param.sparse = ConvolutionBackwardData::Param::Sparse::GROUP;
-
- RegionRestrictedConvolutionBackwardData::Param rr_param;
- rr_param.format = RegionRestrictedConvolutionBackwardData::Param::Format::NCHW;
- rr_param.sparse = RegionRestrictedConvolutionBackwardData::Param::Sparse::GROUP;
-
- UniformIntRNG r_rng{1, 3};
-
- auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
- size_t fw, size_t sh, size_t sw, size_t nr_times) {
- param.pad_h = fh / 2;
- param.pad_w = fw / 2;
- param.stride_h = sh;
- param.stride_w = sw;
-
- rr_param.pad_h = fh / 2;
- rr_param.pad_w = fw / 2;
- rr_param.stride_h = sh;
- rr_param.stride_w = sw;
-
- bencher.set_param(param)
- .set_dtype(0, dtype::Float32())
- .set_dtype(1, dtype::Float32())
- .set_dtype(2, dtype::Float32())
- .set_dtype(4, dtype::Float32());
- bencher.set_times(nr_times);
-
- rr_bencher.set_param(rr_param)
- .set_dtype(0, dtype::Float32())
- .set_dtype(1, dtype::Float32())
- .set_dtype(2, dtype::Uint8())
- .set_dtype(3, dtype::Uint8());
- rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng);
- rr_bencher.set_times(nr_times);
-
- size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
- size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
- TensorShape inp{batch, g, hi, wi} /*src*/, kern{g, 1, 1, fh, fw} /*filter*/,
- rin{batch, hi, wi}, rout{batch, ho, wo},
- out{batch, g, ho, wo} /*output*/;
-
- float bandwith = static_cast<float>(
- inp.total_nr_elems() + kern.total_nr_elems() +
- out.total_nr_elems()) /
- (1024 * 1024 * 1024) * 1e3;
-
- float rr_bandwith = static_cast<float>(
- inp.total_nr_elems() + kern.total_nr_elems() +
- rin.total_nr_elems() + rout.total_nr_elems() +
- out.total_nr_elems()) /
- (1024 * 1024 * 1024) * 1e3;
-
- auto time_in_ms = bencher.execs({kern, out, inp}) / nr_times;
- auto ops = 2.0 * batch * g * ho * wo * fh * fw / (time_in_ms * 1e-3) * 1e-12;
-
- auto rr_time_in_ms = rr_bencher.execs({kern, out, rin, rout, inp}) / nr_times;
- auto rr_ops =
- 2.0 * batch * g * ho * wo * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
- printf("[DGRAD]RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: "
- "grad=%s, "
- "kern=%s, diff=%s\n"
- "time: %.2f ms, time(rr): %.2f ms, perf: %.2fTops, perf(rr): %.2f Tops\n"
- "bandwidth: %.2fGB/s, bandwidth(rr): %.2fGB/s, speedup: %.2f.\n",
- inp.to_string().c_str(), kern.to_string().c_str(),
- out.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
- bandwith * 4 / time_in_ms, rr_bandwith * 4 / rr_time_in_ms,
- time_in_ms / rr_time_in_ms);
- };
-
- run_bench(64, 384, 32, 32, 3, 3, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 5, 5, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 7, 7, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 9, 9, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 11, 11, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 13, 13, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 15, 15, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 17, 17, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 19, 19, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 21, 21, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 23, 23, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 25, 25, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 27, 27, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 29, 29, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 31, 31, 1, 1, 1000);
- run_bench(64, 384, 31, 31, 3, 3, 1, 1, 1000);
- run_bench(64, 384, 31, 31, 5, 5, 1, 1, 1000);
- run_bench(64, 384, 31, 31, 7, 7, 1, 1, 1000);
- run_bench(64, 384, 31, 31, 9, 9, 1, 1, 1000);
- run_bench(64, 384, 31, 31, 11, 11, 1, 1, 1000);
- run_bench(64, 384, 31, 31, 13, 13, 1, 1, 1000);
- run_bench(64, 384, 31, 31, 15, 15, 1, 1, 1000);
- run_bench(64, 384, 31, 31, 17, 17, 1, 1, 1000);
- run_bench(64, 384, 31, 31, 19, 19, 1, 1, 1000);
- run_bench(64, 384, 31, 31, 21, 21, 1, 1, 1000);
- run_bench(64, 384, 31, 31, 23, 23, 1, 1, 1000);
- run_bench(64, 384, 31, 31, 25, 25, 1, 1, 1000);
- run_bench(64, 384, 31, 31, 27, 27, 1, 1, 1000);
- run_bench(64, 384, 31, 31, 29, 29, 1, 1, 1000);
- run_bench(64, 384, 31, 31, 31, 31, 1, 1, 1000);
- }
-
- TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER_UINT8) {
- require_compute_capability(7, 5);
- Benchmarker<ConvBiasForward> bencher(handle_cuda());
- bencher.set_display(false);
- bencher.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
- ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
- "DEPTHWISE_LARGE_FILTER", {})
- .c_str()));
-
- Benchmarker<RegionRestrictedConvolutionForward> rr_bencher(handle_cuda());
- rr_bencher.set_display(false);
-
- ConvBias::Param param;
- param.format = ConvBias::Param::Format::NCHW;
- using NonlineMode = ConvBias::Param::NonlineMode;
- param.nonlineMode = NonlineMode::IDENTITY;
- param.sparse = ConvBias::Param::Sparse::GROUP;
-
- RegionRestrictedConvolutionForward::Param rr_param;
- rr_param.format = RegionRestrictedConvolutionForward::Param::Format::NCHW;
- rr_param.sparse = RegionRestrictedConvolutionForward::Param::Sparse::GROUP;
-
- UniformIntRNG r_rng{0, 2};
-
- auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
- size_t fw, size_t sh, size_t sw, size_t nr_times) {
- param.pad_h = fh / 2;
- param.pad_w = fw / 2;
- param.stride_h = sh;
- param.stride_w = sw;
-
- rr_param.pad_h = fh / 2;
- rr_param.pad_w = fw / 2;
- rr_param.stride_h = sh;
- rr_param.stride_w = sw;
-
- bencher.set_param(param)
- .set_dtype(0, dtype::Float32())
- .set_dtype(1, dtype::Float32())
- .set_dtype(2, dtype::Float32())
- .set_dtype(4, dtype::Float32());
- bencher.set_times(nr_times);
-
- rr_bencher.set_param(rr_param)
- .set_dtype(0, dtype::Float32())
- .set_dtype(1, dtype::Float32())
- .set_dtype(2, dtype::Uint8())
- .set_dtype(3, dtype::Uint8());
- rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng).set_rng(0, &r_rng);
- rr_bencher.set_times(nr_times);
-
- size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
- size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
- TensorShape inp{batch, g, hi, wi}, kern{g, 1, 1, fh, fw}, rin{batch, hi, wi},
- rout{batch, ho, wo}, out{batch, g, ho, wo};
-
- float bandwith = static_cast<float>(
- inp.total_nr_elems() + kern.total_nr_elems() +
- out.total_nr_elems()) /
- (1024 * 1024 * 1024) * 1e3;
-
- float rr_bandwith = static_cast<float>(
- inp.total_nr_elems() + kern.total_nr_elems() +
- rin.total_nr_elems() + rout.total_nr_elems() +
- out.total_nr_elems()) /
- (1024 * 1024 * 1024) * 1e3;
-
- auto time_in_ms = bencher.execs({inp, kern, {}, {}, out}) / nr_times;
- auto ops = 2.0 * batch * g * ho * wo * fh * fw / (time_in_ms * 1e-3) * 1e-12;
-
- auto rr_time_in_ms = rr_bencher.execs({inp, kern, rin, rout, out}) / nr_times;
- auto rr_ops =
- 2.0 * batch * g * ho * wo * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
- printf("RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: inp=%s, "
- "kern=%s, out=%s\n"
- "time: %.2f ms, time(rr): %.2f ms, perf: %.2fTops, perf(rr): %.2f Tops\n"
- "bandwidth: %.2fGB/s, bandwidth(rr): %.2fGB/s, speedup: %.2f.\n",
- inp.to_string().c_str(), kern.to_string().c_str(),
- out.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
- bandwith * 4 / time_in_ms, rr_bandwith * 4 / rr_time_in_ms,
- time_in_ms / rr_time_in_ms);
- };
-
- run_bench(64, 384, 32, 32, 3, 3, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 5, 5, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 7, 7, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 9, 9, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 11, 11, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 13, 13, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 15, 15, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 17, 17, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 19, 19, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 21, 21, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 23, 23, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 25, 25, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 27, 27, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 29, 29, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 31, 31, 1, 1, 1000);
- run_bench(64, 384, 31, 31, 3, 3, 1, 1, 1000);
- run_bench(64, 384, 31, 31, 5, 5, 1, 1, 1000);
- run_bench(64, 384, 31, 31, 7, 7, 1, 1, 1000);
- run_bench(64, 384, 31, 31, 9, 9, 1, 1, 1000);
- run_bench(64, 384, 31, 31, 11, 11, 1, 1, 1000);
- run_bench(64, 384, 31, 31, 13, 13, 1, 1, 1000);
- run_bench(64, 384, 31, 31, 15, 15, 1, 1, 1000);
- run_bench(64, 384, 31, 31, 17, 17, 1, 1, 1000);
- run_bench(64, 384, 31, 31, 19, 19, 1, 1, 1000);
- run_bench(64, 384, 31, 31, 21, 21, 1, 1, 1000);
- run_bench(64, 384, 31, 31, 23, 23, 1, 1, 1000);
- run_bench(64, 384, 31, 31, 25, 25, 1, 1, 1000);
- run_bench(64, 384, 31, 31, 27, 27, 1, 1, 1000);
- run_bench(64, 384, 31, 31, 29, 29, 1, 1, 1000);
- run_bench(64, 384, 31, 31, 31, 31, 1, 1, 1000);
- }
-
- TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_FILTER_FP32) {
- require_compute_capability(7, 5);
-
- Benchmarker<ConvolutionBackwardFilter> bencher(handle_cuda());
-
- bencher.set_display(false);
- bencher.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
- "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_128X128X8_32X64X8_2stage"));
-
- Benchmarker<RegionRestrictedConvolutionBackwardFilter> rr_bencher(handle_cuda());
- rr_bencher.set_display(false);
-
- ConvolutionBackwardFilter::Param param;
- param.format = ConvolutionBackwardFilter::Param::Format::NCHW;
- param.sparse = ConvolutionBackwardFilter::Param::Sparse::GROUP;
-
- RegionRestrictedConvolutionBackwardFilter::Param rr_param;
- rr_param.format = RegionRestrictedConvolutionBackwardFilter::Param::Format::NCHW;
- rr_param.sparse = RegionRestrictedConvolutionBackwardFilter::Param::Sparse::GROUP;
-
- UniformIntRNG r_rng{1, 3};
-
- auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
- size_t fw, size_t sh, size_t sw, size_t nr_times) {
- param.pad_h = fh / 2;
- param.pad_w = fw / 2;
- param.stride_h = sh;
- param.stride_w = sw;
-
- rr_param.pad_h = fh / 2;
- rr_param.pad_w = fw / 2;
- rr_param.stride_h = sh;
- rr_param.stride_w = sw;
-
- bencher.set_param(param)
- .set_dtype(0, dtype::Float32())
- .set_dtype(1, dtype::Float32())
- .set_dtype(2, dtype::Float32())
- .set_dtype(4, dtype::Float32());
- bencher.proxy()->target_execution_policy = {};
- bencher.set_times(nr_times);
-
- rr_bencher.set_param(rr_param)
- .set_dtype(0, dtype::Float32())
- .set_dtype(1, dtype::Float32())
- .set_dtype(2, dtype::Int32())
- .set_dtype(3, dtype::Int32());
- rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng);
- rr_bencher.set_times(nr_times);
-
- size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
- size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
- TensorShape src{batch, g, hi, wi}, diff{batch, g, ho, wo}, rin{batch, hi, wi},
- rout{batch, ho, wo}, grad{g, 1, 1, fh, fw};
-
- float bandwith = static_cast<float>(
- src.total_nr_elems() + diff.total_nr_elems() +
- grad.total_nr_elems()) /
- (1024 * 1024 * 1024) * 1e3;
-
- float rr_bandwith = static_cast<float>(
- src.total_nr_elems() + diff.total_nr_elems() +
- rin.total_nr_elems() + rout.total_nr_elems() +
- grad.total_nr_elems()) /
- (1024 * 1024 * 1024) * 1e3;
-
- auto time_in_ms = bencher.execs({src, diff, grad}) / nr_times;
- auto ops = 2.0 * batch * g * hi * wi * fh * fw / (time_in_ms * 1e-3) * 1e-12;
-
- auto rr_time_in_ms = rr_bencher.execs({src, diff, rin, rout, grad}) / nr_times;
- auto rr_ops =
- 2.0 * batch * g * hi * wi * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
- printf("[WGRAD]RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: "
- "src=%s, "
- "diff=%s, grad=%s\n"
- "time: %.2f ms, time(rr): %.2f ms, perf: %.2fTops, perf(rr): %.2f Tops\n"
- "bandwidth: %.2fGB/s, bandwidth(rr): %.2fGB/s, speedup: %.2f.\n",
- src.to_string().c_str(), diff.to_string().c_str(),
- grad.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
- bandwith * 4 / time_in_ms, rr_bandwith * 4 / rr_time_in_ms,
- time_in_ms / rr_time_in_ms);
- };
-
- run_bench(64, 384, 32, 32, 3, 3, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 5, 5, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 7, 7, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 9, 9, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 11, 11, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 13, 13, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 15, 15, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 17, 17, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 19, 19, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 21, 21, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 23, 23, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 25, 25, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 27, 27, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 29, 29, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 31, 31, 1, 1, 1000);
- }
-
- TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_FILTER_FP32_RINT8) {
- require_compute_capability(7, 5);
-
- Benchmarker<ConvolutionBackwardFilter> bencher(handle_cuda());
-
- bencher.set_display(false);
- bencher.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
- "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_128X128X8_32X64X8_2stage"));
-
- Benchmarker<RegionRestrictedConvolutionBackwardFilter> rr_bencher(handle_cuda());
- rr_bencher.set_display(false);
-
- ConvolutionBackwardFilter::Param param;
- param.format = ConvolutionBackwardFilter::Param::Format::NCHW;
- param.sparse = ConvolutionBackwardFilter::Param::Sparse::GROUP;
-
- RegionRestrictedConvolutionBackwardFilter::Param rr_param;
- rr_param.format = RegionRestrictedConvolutionBackwardFilter::Param::Format::NCHW;
- rr_param.sparse = RegionRestrictedConvolutionBackwardFilter::Param::Sparse::GROUP;
-
- UniformIntRNG r_rng{1, 3};
-
- auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
- size_t fw, size_t sh, size_t sw, size_t nr_times) {
- param.pad_h = fh / 2;
- param.pad_w = fw / 2;
- param.stride_h = sh;
- param.stride_w = sw;
-
- rr_param.pad_h = fh / 2;
- rr_param.pad_w = fw / 2;
- rr_param.stride_h = sh;
- rr_param.stride_w = sw;
-
- bencher.set_param(param)
- .set_dtype(0, dtype::Float32())
- .set_dtype(1, dtype::Float32())
- .set_dtype(2, dtype::Float32())
- .set_dtype(4, dtype::Float32());
- bencher.proxy()->target_execution_policy = {};
- bencher.set_times(nr_times);
-
- rr_bencher.set_param(rr_param)
- .set_dtype(0, dtype::Float32())
- .set_dtype(1, dtype::Float32())
- .set_dtype(2, dtype::Uint8())
- .set_dtype(3, dtype::Uint8());
- rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng);
- rr_bencher.set_times(nr_times);
-
- size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
- size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
- TensorShape src{batch, g, hi, wi}, diff{batch, g, ho, wo}, rin{batch, hi, wi},
- rout{batch, ho, wo}, grad{g, 1, 1, fh, fw};
-
- float bandwith = static_cast<float>(
- src.total_nr_elems() + diff.total_nr_elems() +
- grad.total_nr_elems()) /
- (1024 * 1024 * 1024) * 1e3;
-
- float rr_bandwith = static_cast<float>(
- src.total_nr_elems() + diff.total_nr_elems() +
- rin.total_nr_elems() + rout.total_nr_elems() +
- grad.total_nr_elems()) /
- (1024 * 1024 * 1024) * 1e3;
-
- auto time_in_ms = bencher.execs({src, diff, grad}) / nr_times;
- auto ops = 2.0 * batch * g * hi * wi * fh * fw / (time_in_ms * 1e-3) * 1e-12;
-
- auto rr_time_in_ms = rr_bencher.execs({src, diff, rin, rout, grad}) / nr_times;
- auto rr_ops =
- 2.0 * batch * g * hi * wi * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
- printf("[WGRAD]RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: "
- "src=%s, "
- "diff=%s, grad=%s\n"
- "time: %.2f ms, time(rr): %.2f ms, perf: %.2fTops, perf(rr): %.2f Tops\n"
- "bandwidth: %.2fGB/s, bandwidth(rr): %.2fGB/s, speedup: %.2f.\n",
- src.to_string().c_str(), diff.to_string().c_str(),
- grad.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
- bandwith * 4 / time_in_ms, rr_bandwith * 4 / rr_time_in_ms,
- time_in_ms / rr_time_in_ms);
- };
-
- run_bench(64, 384, 32, 32, 3, 3, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 5, 5, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 7, 7, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 9, 9, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 11, 11, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 13, 13, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 15, 15, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 17, 17, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 19, 19, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 21, 21, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 23, 23, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 25, 25, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 27, 27, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 29, 29, 1, 1, 1000);
- run_bench(64, 384, 32, 32, 31, 31, 1, 1, 1000);
- }
-
- #endif
-
- // Verifies RegionRestrictedConvolutionBackwardData on the CUDA handle with
- // Float32 filter/diff tensors, once per region-mask dtype (Int32 and Uint8).
- TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_DATA_FP32) {
-     using Param = RegionRestrictedConvolutionBackwardData::Param;
-     Checker<RegionRestrictedConvolutionBackwardData> checker(handle_cuda());
-
-     for (auto dt : std::vector<DType>{dtype::Int32(), dtype::Uint8()}) {
-         // Runs one grouped case with a single channel per group on square
-         // tensors: n = batch, g = groups, ih = grad (input) extent,
-         // fh = filter extent; padding and stride apply to both axes.
-         auto run = [&checker, &dt](
-                            size_t n, size_t g, size_t ih, size_t fh, size_t padding,
-                            size_t stride) {
-             Param param;
-             param.mode = Param::Mode::CROSS_CORRELATION;
-             param.compute_mode = Param::ComputeMode::DEFAULT;
-             param.sparse = Param::Sparse::GROUP;
-             param.pad_h = param.pad_w = padding;
-             param.stride_h = param.stride_w = stride;
-
-             // Tensors 0/1 (filter, diff) are Float32; tensors 2/3 (rin, rout)
-             // use the mask dtype under test.
-             checker.set_dtype(0, dtype::Float32())
-                     .set_dtype(1, dtype::Float32())
-                     .set_dtype(2, dt)
-                     .set_dtype(3, dt);
-
-             // The float value range shrinks as the filter grows.
-             float scale = 64.f / sqrt(fh * fh);
-             UniformFloatRNG rng(scale, 2 * scale);
-             UniformIntRNG r_rng{1, 2};
-             checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng(
-                     3, &r_rng);
-
-             // Standard convolution output extent. The previous form
-             // (ih + 2 * padding - fh + 1) / stride was only right for
-             // stride == 1, which is what every case below uses.
-             size_t oh = (ih + 2 * padding - fh) / stride + 1;
-             checker.set_param(param).execs({
-                     {g, 1, 1, fh, fh},   // filter
-                     {n, g * 1, oh, oh},  // diff
-                     {n, ih, ih},         // rin
-                     {n, oh, oh},         // rout
-                     {n, g * 1, ih, ih}   // grad
-             });
-         };
-
-         // 2x2 filter, padding 1, single batch/group, for every ih in [3, 31]
-         // that is not a multiple of 4.
-         for (size_t ih = 3; ih <= 31; ++ih) {
-             if (ih % 4 != 0) {
-                 run(1, 1, ih, 2, 1, 1);
-             }
-         }
-
-         // Odd filter sizes 3..31 with padding fh / 2, first on ih = 32 and
-         // then on ih = 31.
-         for (size_t ih : std::vector<size_t>{32, 31}) {
-             for (size_t fh = 3; fh <= 31; fh += 2) {
-                 run(4, 8, ih, fh, fh / 2, 1);
-             }
-         }
-     }
- }
-
- // Verifies RegionRestrictedConvolutionBackwardData on the CUDA handle when
- // both region masks are filled from the same single-value RNG (Int32 masks).
- TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_DATA_FP32_RIN_EQ_ROUT) {
-     using Param = RegionRestrictedConvolutionBackwardData::Param;
-     Checker<RegionRestrictedConvolutionBackwardData> checker(handle_cuda());
-
-     for (auto dt : std::vector<DType>{dtype::Int32()}) {
-         // Runs one grouped case with a single channel per group on square
-         // tensors: n = batch, g = groups, ih = grad (input) extent,
-         // fh = filter extent; padding and stride apply to both axes.
-         auto run = [&checker, &dt](
-                            size_t n, size_t g, size_t ih, size_t fh, size_t padding,
-                            size_t stride) {
-             Param param;
-             param.mode = Param::Mode::CROSS_CORRELATION;
-             param.compute_mode = Param::ComputeMode::DEFAULT;
-             param.sparse = Param::Sparse::GROUP;
-             param.pad_h = param.pad_w = padding;
-             param.stride_h = param.stride_w = stride;
-
-             // Pin filter/diff to Float32 explicitly, as
-             // REGION_RESTRICTED_CONV_BWD_DATA_FP32 does and as the test name
-             // states, rather than relying on the checker's default dtype.
-             checker.set_dtype(0, dtype::Float32())
-                     .set_dtype(1, dtype::Float32())
-                     .set_dtype(2, dt)
-                     .set_dtype(3, dt);
-
-             // The float value range shrinks as the filter grows.
-             float scale = 64.f / sqrt(fh * fh);
-             UniformFloatRNG rng(scale, 2 * scale);
-             // value 0 mask may cause unexpected behaviour.
-             UniformIntRNG r_rng{1, 1};
-             checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng(
-                     3, &r_rng);
-
-             // Standard convolution output extent. The previous form
-             // (ih + 2 * padding - fh + 1) / stride was only right for
-             // stride == 1, which is what every case below uses.
-             size_t oh = (ih + 2 * padding - fh) / stride + 1;
-             checker.set_param(param).execs(
-                     {/*filter*/ {g, 1, 1, fh, fh},
-                      /*diff*/ {n, g * 1, oh, oh},
-                      /*rin*/ {n, ih, ih},
-                      /*rout*/ {n, oh, oh},
-                      /*grad*/ {n, g * 1, ih, ih}});
-         };
-
-         // 2x2 filter, padding 1, single batch/group, for every ih in [3, 31]
-         // that is not a multiple of 4.
-         for (size_t ih = 3; ih <= 31; ++ih) {
-             if (ih % 4 != 0) {
-                 run(1, 1, ih, 2, 1, 1);
-             }
-         }
-
-         // Odd filter sizes 3..31 with padding fh / 2, first on ih = 32 and
-         // then on ih = 31.
-         for (size_t ih : std::vector<size_t>{32, 31}) {
-             for (size_t fh = 3; fh <= 31; fh += 2) {
-                 run(4, 8, ih, fh, fh / 2, 1);
-             }
-         }
-     }
- }
-
- // Verifies RegionRestrictedConvolutionBackwardFilter on the CUDA handle with
- // Float32 src/diff tensors, once per region-mask dtype (Int32 and Uint8).
- TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_FILTER_FP32) {
-     using Param = RegionRestrictedConvolutionBackwardFilter::Param;
-     require_compute_capability(6, 1);
-     Checker<RegionRestrictedConvolutionBackwardFilter> checker(handle_cuda());
-
-     for (auto dt : std::vector<DType>{dtype::Int32(), dtype::Uint8()}) {
-         // Runs one grouped case with a single channel per group on square
-         // tensors: n = batch, g = groups, ih = src extent, fh = filter
-         // (grad) extent; padding and stride apply to both axes.
-         auto run = [&checker, &dt](
-                            size_t n, size_t g, size_t ih, size_t fh, size_t padding,
-                            size_t stride) {
-             Param param;
-             param.mode = Param::Mode::CROSS_CORRELATION;
-             param.compute_mode = Param::ComputeMode::DEFAULT;
-             param.sparse = Param::Sparse::GROUP;
-             param.pad_h = param.pad_w = padding;
-             param.stride_h = param.stride_w = stride;
-
-             // Tensors 0/1 (src, diff) are Float32; tensors 2/3 (rin, rout)
-             // use the mask dtype under test.
-             checker.set_dtype(0, dtype::Float32())
-                     .set_dtype(1, dtype::Float32())
-                     .set_dtype(2, dt)
-                     .set_dtype(3, dt);
-
-             // The float value range shrinks as the filter grows.
-             float scale = 64.f / sqrt(fh * fh);
-             UniformFloatRNG rng(scale, 2 * scale);
-             UniformIntRNG r_rng{1, 2};
-             checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng(
-                     3, &r_rng);
-
-             // Standard convolution output extent. The previous form
-             // (ih + 2 * padding - fh + 1) / stride was only right for
-             // stride == 1, which is what every case below uses.
-             size_t oh = (ih + 2 * padding - fh) / stride + 1;
-             checker.set_param(param).execs({
-                     {n, g * 1, ih, ih},  // src
-                     {n, g * 1, oh, oh},  // diff
-                     {n, ih, ih},         // rin
-                     {n, oh, oh},         // rout
-                     {g, 1, 1, fh, fh}    // grad
-             });
-         };
-
-         run(4, 8, 32, 5, 5 / 2, 1);
-
-         // Unpadded cases where the filter is as large as the input (oh == 1).
-         for (size_t k = 2; k <= 7; ++k) {
-             run(1, 2, k, k, 0, 1);
-         }
-
-         // Odd filter sizes 7..31 on a 32x32 input with padding fh / 2.
-         for (size_t fh = 7; fh <= 31; fh += 2) {
-             run(4, 8, 32, fh, fh / 2, 1);
-             if (fh == 27) {
-                 // Additional single-group case, issued right after the
-                 // 8-group fh = 27 case.
-                 run(4, 1, 32, fh, fh / 2, 1);
-             }
-         }
-     }
- }
-
- // Verifies RegionRestrictedConvolutionBackwardFilter on the CUDA handle when
- // both region masks are filled from the same single-value RNG, once per mask
- // dtype (Int32 and Uint8).
- TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_FILTER_FP32_RIN_EQ_ROUT) {
-     using Param = RegionRestrictedConvolutionBackwardFilter::Param;
-     require_compute_capability(6, 1);
-     Checker<RegionRestrictedConvolutionBackwardFilter> checker(handle_cuda());
-
-     for (auto dt : std::vector<DType>{dtype::Int32(), dtype::Uint8()}) {
-         // Runs one grouped case with a single channel per group on square
-         // tensors: n = batch, g = groups, ih = src extent, fh = filter
-         // (grad) extent; padding and stride apply to both axes.
-         auto run = [&checker, &dt](
-                            size_t n, size_t g, size_t ih, size_t fh, size_t padding,
-                            size_t stride) {
-             Param param;
-             param.mode = Param::Mode::CROSS_CORRELATION;
-             param.compute_mode = Param::ComputeMode::DEFAULT;
-             param.sparse = Param::Sparse::GROUP;
-             param.pad_h = param.pad_w = padding;
-             param.stride_h = param.stride_w = stride;
-
-             // Tensors 0/1 (src, diff) are Float32; tensors 2/3 (rin, rout)
-             // use the mask dtype under test.
-             checker.set_dtype(0, dtype::Float32())
-                     .set_dtype(1, dtype::Float32())
-                     .set_dtype(2, dt)
-                     .set_dtype(3, dt);
-
-             // The float value range shrinks as the filter grows.
-             float scale = 64.f / sqrt(fh * fh);
-             UniformFloatRNG rng(scale, 2 * scale);
-             // Both masks share an RNG whose bounds are {1, 1}.
-             UniformIntRNG r_rng{1, 1};
-             checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng(
-                     3, &r_rng);
-
-             // Standard convolution output extent. The previous form
-             // (ih + 2 * padding - fh + 1) / stride was only right for
-             // stride == 1, which is what every case below uses.
-             size_t oh = (ih + 2 * padding - fh) / stride + 1;
-             checker.set_param(param).execs({
-                     {n, g * 1, ih, ih},  // src
-                     {n, g * 1, oh, oh},  // diff
-                     {n, ih, ih},         // rin
-                     {n, oh, oh},         // rout
-                     {g, 1, 1, fh, fh}    // grad
-             });
-         };
-
-         run(4, 8, 32, 5, 5 / 2, 1);
-
-         // Unpadded cases where the filter is as large as the input (oh == 1).
-         for (size_t k = 2; k <= 7; ++k) {
-             run(1, 2, k, k, 0, 1);
-         }
-
-         // Odd filter sizes 7..31 on a 32x32 input with padding fh / 2.
-         for (size_t fh = 7; fh <= 31; fh += 2) {
-             run(4, 8, 32, fh, fh / 2, 1);
-             if (fh == 21) {
-                 // Additional single-group case, issued right after the
-                 // 8-group fh = 21 case.
-                 run(4, 1, 32, fh, fh / 2, 1);
-             }
-         }
-     }
- }
- } // namespace test
- } // namespace megdnn
-
- // vim: syntax=cpp.doxygen
|