You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they may include dashes ('-') and can be up to 35 characters long.

pooling.cpp 19 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451
  1. #include "test/cuda/fixture.h"
  2. #include "megdnn/tensor_iter.h"
  3. #include "test/common/checker.h"
  4. #include "test/common/pooling.h"
  5. #include "src/common/utils.h"
  6. #include "test/cuda/utils.h"
  7. // to check cudnn version
  8. #include <cudnn.h>
  9. #include "test/cuda/benchmark.h"
namespace {
// Stringization helpers: V1 turns its argument into a string literal,
// V expands the argument first so that CUDNN_MAJOR etc. become their
// numeric values before stringization.
#define V1(v) #v
#define V(v) V1(v)
// Builds the algorithm name used by AlgoChecker, e.g. "cudnnForwardv8.0.4":
// the cuDNN-backed algorithm name embeds the cuDNN version this binary was
// compiled against (CUDNN_MAJOR.CUDNN_MINOR.CUDNN_PATCHLEVEL).
#define DEF_NAME(NAME) \
    #NAME "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)
}  // namespace
  16. namespace megdnn {
  17. namespace test {
  18. TEST_F(CUDA, POOLING_FORWARD) {
  19. auto args = pooling::get_args();
  20. using Format = param::Pooling::Format;
  21. std::vector<DType> dtypes{dtype::Float16(), dtype::BFloat16(), dtype::Float32()};
  22. if (check_compute_capability(6, 0)) {
  23. // int pooling is supported only for Pascal or higher
  24. dtypes.push_back(dtype::Int8());
  25. }
  26. for (auto dtype : dtypes)
  27. for (auto format : {Format::NCHW, Format::NHWC})
  28. for (auto&& arg : args) {
  29. auto param = arg.param;
  30. auto src = arg.ishape;
  31. param.format = format;
  32. if (param.format == Format::NHWC) {
  33. src = cvt_src_or_dst_nchw2nhwc(src);
  34. }
  35. Checker<Pooling> checker(handle_cuda());
  36. if (dtype == dtype::Int8()) {
  37. // different versions of cuDNN differs in rounding behavior;
  38. // setting eps to 1 to allow for rounding errors.
  39. checker.set_epsilon(1 + 1e-3);
  40. } else if (dtype == dtype::BFloat16()) {
  41. checker.set_epsilon(2e-2);
  42. } else {
  43. checker.set_epsilon(1e-2);
  44. }
  45. checker.set_param(param).set_dtype(0, dtype).set_dtype(1, dtype).exec(
  46. TensorShapeArray{src, {}});
  47. }
  48. /* add test for new Mode temporarily */
  49. for (auto dtype : dtypes)
  50. for (auto format : {Format::NCHW, Format::NHWC})
  51. for (auto&& arg : args) {
  52. auto param = arg.param;
  53. if (param.mode == Pooling::Mode::AVERAGE)
  54. param.mode = Pooling::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
  55. else
  56. continue;
  57. auto src = arg.ishape;
  58. param.format = format;
  59. if (param.format == Format::NHWC) {
  60. src = cvt_src_or_dst_nchw2nhwc(src);
  61. }
  62. Checker<Pooling> checker(handle_cuda());
  63. if (dtype == dtype::Int8()) {
  64. // different versions of cuDNN differs in rounding behavior;
  65. // setting eps to 1 to allow for rounding errors.
  66. checker.set_epsilon(1 + 1e-3);
  67. } else if (dtype == dtype::BFloat16()) {
  68. checker.set_epsilon(2e-2);
  69. } else {
  70. checker.set_epsilon(1e-2);
  71. }
  72. checker.set_param(param).set_dtype(0, dtype).set_dtype(1, dtype).exec(
  73. TensorShapeArray{src, {}});
  74. }
  75. }
  76. TEST_F(CUDA, POOLING_BACKWARD) {
  77. auto args = pooling::get_args();
  78. for (auto&& arg : args) {
  79. Checker<PoolingBackward> checker(handle_cuda());
  80. TensorLayout ilayout = TensorLayout(arg.ishape, dtype::Float32());
  81. TensorLayout olayout;
  82. auto constraint = [this, arg](CheckerHelper::TensorValueArray& tensors_orig) {
  83. megdnn_assert(tensors_orig.size() == 4);
  84. auto opr = handle_cuda()->create_operator<PoolingForward>();
  85. opr->param() = arg.param;
  86. auto tensors_cuda_storage = CheckerHelper::alloc_tensors(
  87. handle_cuda(), {tensors_orig[0].layout, tensors_orig[1].layout}, 0);
  88. auto&& tensors_cuda = *tensors_cuda_storage;
  89. auto span = tensors_cuda[0].layout.span();
  90. auto dst = static_cast<dt_byte*>(tensors_cuda[0].raw_ptr()) + span.low_byte;
  91. auto src = static_cast<const dt_byte*>(tensors_orig[0].raw_ptr()) +
  92. span.low_byte;
  93. megdnn_memcpy_H2D(handle_cuda(), dst, src, span.dist_byte());
  94. auto workspace_size = opr->get_workspace_in_bytes(
  95. tensors_cuda[0].layout, tensors_cuda[1].layout);
  96. auto workspace_cuda = megdnn_malloc(handle_cuda(), workspace_size);
  97. Workspace workspace{static_cast<dt_byte*>(workspace_cuda), workspace_size};
  98. opr->exec(tensors_cuda[0], tensors_cuda[1], workspace);
  99. megdnn_free(handle_cuda(), workspace_cuda);
  100. span = tensors_cuda[1].layout.span();
  101. dst = static_cast<dt_byte*>(tensors_orig[1].raw_ptr()) + span.low_byte;
  102. src = static_cast<const dt_byte*>(tensors_cuda[1].raw_ptr()) +
  103. span.low_byte;
  104. megdnn_memcpy_D2H(handle_cuda(), dst, src, span.dist_byte());
  105. };
  106. {
  107. auto opr = handle_cuda()->create_operator<PoolingForward>();
  108. opr->param() = arg.param;
  109. opr->deduce_layout(ilayout, olayout);
  110. }
  111. auto set_dtype = [&checker](DType dtype) {
  112. checker.set_dtype(0, dtype)
  113. .set_dtype(1, dtype)
  114. .set_dtype(2, dtype)
  115. .set_dtype(3, dtype);
  116. };
  117. checker.set_tensors_constraint(constraint);
  118. set_dtype(dtype::Float32());
  119. checker.set_param(arg.param).exec(
  120. TensorShapeArray{ilayout, olayout, olayout, ilayout});
  121. Float16PeriodicalRNG rng;
  122. set_dtype(dtype::Float16());
  123. checker.set_param(arg.param).set_rng(0, &rng).set_epsilon(1e-2).exec(
  124. TensorShapeArray{ilayout, olayout, olayout, ilayout});
  125. BFloat16PeriodicalRNG bf16_rng;
  126. set_dtype(dtype::BFloat16());
  127. checker.set_param(arg.param)
  128. .set_rng(0, &bf16_rng)
  129. .set_epsilon(1e-2)
  130. .exec(TensorShapeArray{ilayout, olayout, olayout, ilayout});
  131. }
  132. /* add test for new Mode temporarily */
  133. for (auto&& arg : args) {
  134. if (arg.param.mode == Pooling::Mode::AVERAGE)
  135. arg.param.mode = Pooling::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
  136. else
  137. continue;
  138. Checker<PoolingBackward> checker(handle_cuda());
  139. TensorLayout ilayout = TensorLayout(arg.ishape, dtype::Float32());
  140. TensorLayout olayout;
  141. auto constraint = [this, arg](CheckerHelper::TensorValueArray& tensors_orig) {
  142. megdnn_assert(tensors_orig.size() == 4);
  143. auto opr = handle_cuda()->create_operator<PoolingForward>();
  144. opr->param() = arg.param;
  145. auto tensors_cuda_storage = CheckerHelper::alloc_tensors(
  146. handle_cuda(), {tensors_orig[0].layout, tensors_orig[1].layout}, 0);
  147. auto&& tensors_cuda = *tensors_cuda_storage;
  148. auto span = tensors_cuda[0].layout.span();
  149. auto dst = static_cast<dt_byte*>(tensors_cuda[0].raw_ptr()) + span.low_byte;
  150. auto src = static_cast<const dt_byte*>(tensors_orig[0].raw_ptr()) +
  151. span.low_byte;
  152. megdnn_memcpy_H2D(handle_cuda(), dst, src, span.dist_byte());
  153. auto workspace_size = opr->get_workspace_in_bytes(
  154. tensors_cuda[0].layout, tensors_cuda[1].layout);
  155. auto workspace_cuda = megdnn_malloc(handle_cuda(), workspace_size);
  156. Workspace workspace{static_cast<dt_byte*>(workspace_cuda), workspace_size};
  157. opr->exec(tensors_cuda[0], tensors_cuda[1], workspace);
  158. megdnn_free(handle_cuda(), workspace_cuda);
  159. span = tensors_cuda[1].layout.span();
  160. dst = static_cast<dt_byte*>(tensors_orig[1].raw_ptr()) + span.low_byte;
  161. src = static_cast<const dt_byte*>(tensors_cuda[1].raw_ptr()) +
  162. span.low_byte;
  163. megdnn_memcpy_D2H(handle_cuda(), dst, src, span.dist_byte());
  164. };
  165. {
  166. auto opr = handle_cuda()->create_operator<PoolingForward>();
  167. opr->param() = arg.param;
  168. opr->deduce_layout(ilayout, olayout);
  169. }
  170. auto set_dtype = [&checker](DType dtype) {
  171. checker.set_dtype(0, dtype)
  172. .set_dtype(1, dtype)
  173. .set_dtype(2, dtype)
  174. .set_dtype(3, dtype);
  175. };
  176. checker.set_tensors_constraint(constraint);
  177. set_dtype(dtype::Float32());
  178. checker.set_param(arg.param).exec(
  179. TensorShapeArray{ilayout, olayout, olayout, ilayout});
  180. Float16PeriodicalRNG rng;
  181. set_dtype(dtype::Float16());
  182. checker.set_param(arg.param).set_rng(0, &rng).set_epsilon(1e-2).exec(
  183. TensorShapeArray{ilayout, olayout, olayout, ilayout});
  184. BFloat16PeriodicalRNG bf16_rng;
  185. set_dtype(dtype::BFloat16());
  186. checker.set_param(arg.param)
  187. .set_rng(0, &bf16_rng)
  188. .set_epsilon(1e-2)
  189. .exec(TensorShapeArray{ilayout, olayout, olayout, ilayout});
  190. }
  191. }
  192. TEST_F(CUDA, POOLING_FORWARD_NCHW_Q4) {
  193. require_compute_capability(7, 5);
  194. using Param = param::Pooling;
  195. Checker<Pooling> checker(handle_cuda());
  196. Param param{Param::Mode::MAX, 0, 0, 2, 2, 2, 2};
  197. checker.set_dtype(0, dtype::QuantizedS4(3.1415926f));
  198. param.format = Param::Format::NCHW;
  199. checker.set_param(param).exec({{20, 64, 22, 33}, {}});
  200. param.mode = Param::Mode::AVERAGE;
  201. checker.set_param(param).exec({{20, 96, 22, 33}, {}});
  202. param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
  203. checker.set_param(param).exec({{20, 24, 22, 33}, {}});
  204. checker.set_dtype(0, dtype::Quantized4Asymm(3.1415926f, 3));
  205. param.format = Param::Format::NCHW;
  206. checker.set_param(param).exec({{20, 64, 22, 33}, {}});
  207. param.mode = Param::Mode::AVERAGE;
  208. checker.set_param(param).exec({{20, 96, 22, 33}, {}});
  209. param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
  210. checker.set_param(param).exec({{20, 24, 22, 33}, {}});
  211. }
  212. TEST_F(CUDA, POOLING_FORWARD_NCHW4_NCHW32) {
  213. require_compute_capability(7, 5);
  214. using Param = param::Pooling;
  215. Checker<Pooling> checker(handle_cuda());
  216. Param param;
  217. checker.set_dtype(0, dtype::QuantizedS8(0.1f));
  218. checker.set_epsilon(1 + 1e-3);
  219. checker.set_before_exec_callback(
  220. AlgoChecker<PoolingForward>(DEF_NAME(cudnnForward)));
  221. for (auto format : {Param::Format::NCHW4, Param::Format::NCHW32}) {
  222. param.format = format;
  223. param.mode = Param::Mode::MAX;
  224. checker.set_param(param).exec({{4, 3, 28, 28, 32}, {}});
  225. param.mode = Param::Mode::AVERAGE;
  226. checker.set_param(param).exec({{4, 3, 28, 28, 64}, {}});
  227. param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
  228. checker.set_param(param).exec({{4, 3, 28, 28, 32}, {}});
  229. }
  230. }
  231. #if CUDNN_VERSION >= 7500
  232. TEST_F(CUDA, POOLING_FORWARD_NCHW32) {
  233. require_compute_capability(7, 5);
  234. using Param = param::Pooling;
  235. Checker<Pooling> checker(handle_cuda());
  236. Param param;
  237. auto i8_min = std::numeric_limits<int8_t>().min();
  238. auto i8_max = std::numeric_limits<int8_t>().max();
  239. UniformIntRNG int_rng{i8_min, i8_max};
  240. checker.set_dtype(0, dtype::QuantizedS8(0.1f));
  241. checker.set_before_exec_callback(AlgoChecker<PoolingForward>("CUDA_NCHW32"));
  242. param.format = Param::Format::NCHW32;
  243. checker.set_epsilon(1e-3).set_rng(0, &int_rng);
  244. checker.set_param(param).exec({{64, 8, 28, 28, 32}, {}});
  245. param.mode = Param::Mode::AVERAGE;
  246. checker.set_param(param).exec({{64, 8, 28, 28, 64}, {}});
  247. param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
  248. checker.set_param(param).exec({{64, 8, 28, 28, 64}, {}});
  249. }
  250. #endif
  251. TEST_F(CUDA, POOLING_FORWARD_NCHW64_Q4) {
  252. require_compute_capability(7, 5);
  253. using Param = param::Pooling;
  254. Checker<Pooling> checker(handle_cuda());
  255. Param param{Param::Mode::MAX, 1, 1, 2, 2, 2, 2};
  256. UniformIntRNG int_rng{-8, 7};
  257. checker.set_dtype(0, dtype::QuantizedS4(1.f));
  258. param.format = Param::Format::NCHW64;
  259. checker.set_epsilon(1e-3).set_rng(0, &int_rng);
  260. checker.set_param(param).exec({{4, 8, 28, 28, 64}, {}});
  261. param.mode = Param::Mode::AVERAGE;
  262. checker.set_param(param).exec({{4, 8, 28, 28, 64}, {}});
  263. param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
  264. checker.set_param(param).exec({{4, 8, 28, 28, 64}, {}});
  265. }
  266. TEST_F(CUDA, POOLING_FORWARD_NCHW64_U4) {
  267. require_compute_capability(7, 5);
  268. using Param = param::Pooling;
  269. Checker<Pooling> checker(handle_cuda());
  270. Param param{Param::Mode::MAX, 1, 1, 2, 2, 2, 2};
  271. UniformIntRNG int_rng{0, 15};
  272. checker.set_dtype(0, dtype::Quantized4Asymm(1.f, 3));
  273. param.format = Param::Format::NCHW64;
  274. checker.set_epsilon(1e-3).set_rng(0, &int_rng);
  275. checker.set_param(param).exec({{4, 8, 28, 28, 64}, {}});
  276. param.mode = Param::Mode::AVERAGE;
  277. checker.set_param(param).exec({{4, 8, 28, 28, 64}, {}});
  278. param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
  279. checker.set_param(param).exec({{4, 8, 28, 28, 64}, {}});
  280. }
  281. TEST_F(CUDA, POOLING_FORWARD_NHWC_Q4) {
  282. require_compute_capability(7, 5);
  283. using Param = param::Pooling;
  284. Checker<Pooling> checker(handle_cuda());
  285. Param param{Param::Mode::MAX, 1, 1, 2, 2, 2, 2};
  286. UniformIntRNG int_rng{-8, 7};
  287. checker.set_dtype(0, dtype::QuantizedS4(1.f));
  288. param.format = Param::Format::NHWC;
  289. checker.set_epsilon(1e-3).set_rng(0, &int_rng);
  290. checker.set_param(param).exec({{2, 28, 28, 16}, {}});
  291. checker.set_param(param).exec({{2, 177, 233, 16}, {}});
  292. param.mode = Param::Mode::AVERAGE;
  293. checker.set_param(param).exec({{3, 13, 28, 32}, {}});
  294. param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
  295. checker.set_param(param).exec({{4, 29, 28, 64}, {}});
  296. }
  297. TEST_F(CUDA, POOLING_FORWARD_NHWC_U4) {
  298. require_compute_capability(7, 5);
  299. using Param = param::Pooling;
  300. Checker<Pooling> checker(handle_cuda());
  301. Param param{Param::Mode::MAX, 1, 1, 2, 2, 2, 2};
  302. UniformIntRNG int_rng{0, 15};
  303. checker.set_dtype(0, dtype::Quantized4Asymm(1.f, 3));
  304. param.format = Param::Format::NHWC;
  305. checker.set_epsilon(1e-3).set_rng(0, &int_rng);
  306. checker.set_param(param).exec({{2, 28, 28, 16}, {}});
  307. checker.set_param(param).exec({{2, 177, 233, 16}, {}});
  308. param.mode = Param::Mode::AVERAGE;
  309. checker.set_param(param).exec({{3, 13, 28, 32}, {}});
  310. param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
  311. checker.set_param(param).exec({{4, 29, 28, 64}, {}});
  312. }
  313. TEST_F(CUDA, POOLING_FORWARD_CHWN4) {
  314. require_compute_capability(6, 1);
  315. using Param = param::Pooling;
  316. Checker<Pooling> checker(handle_cuda());
  317. Param param;
  318. auto i8_min = std::numeric_limits<int8_t>().min();
  319. auto i8_max = std::numeric_limits<int8_t>().max();
  320. UniformIntRNG int_rng{i8_min, i8_max};
  321. checker.set_dtype(0, dtype::QuantizedS8(0.1f));
  322. param.format = Param::Format::CHWN4;
  323. for (auto mode :
  324. {Param::Mode::MAX, Param::Mode::AVERAGE,
  325. Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING}) {
  326. param.mode = mode;
  327. checker.set_epsilon(1e-3).set_rng(0, &int_rng);
  328. checker.set_param(param).exec({{8, 28, 28, 64, 4}, {}});
  329. checker.set_param(param).exec({{8, 28, 28, 15, 4}, {}});
  330. checker.set_param(param).exec({{8, 28, 28, 30, 4}, {}});
  331. }
  332. }
  333. TEST_F(CUDA, POOLING_FORWARD_INT8_NCHW4) {
  334. require_compute_capability(6, 1);
  335. using Param = param::Pooling;
  336. Checker<Pooling> checker(handle_cuda());
  337. Param param;
  338. auto i8_min = std::numeric_limits<int8_t>().min();
  339. auto i8_max = std::numeric_limits<int8_t>().max();
  340. UniformIntRNG int_rng{i8_min, i8_max};
  341. checker.set_dtype(0, dtype::QuantizedS8(0.1f));
  342. param.format = Param::Format::NCHW4;
  343. checker.set_before_exec_callback(AlgoChecker<PoolingForward>("CUDA_NCHW4"));
  344. for (auto mode :
  345. {Param::Mode::MAX, Param::Mode::AVERAGE,
  346. Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING}) {
  347. param.mode = mode;
  348. checker.set_epsilon(1e-3).set_rng(0, &int_rng);
  349. checker.set_param(param).exec({{64, 8, 28, 28, 4}, {}});
  350. checker.set_param(param).exec({{15, 8, 28, 28, 4}, {}});
  351. checker.set_param(param).exec({{30, 8, 28, 28, 4}, {}});
  352. }
  353. }
  354. TEST_F(CUDA, POOLING_FORWARD_INT8_NCHW32) {
  355. require_compute_capability(6, 1);
  356. using Param = param::Pooling;
  357. Checker<Pooling> checker(handle_cuda());
  358. Param param;
  359. auto i8_min = std::numeric_limits<int8_t>().min();
  360. auto i8_max = std::numeric_limits<int8_t>().max();
  361. UniformIntRNG int_rng{i8_min, i8_max};
  362. checker.set_dtype(0, dtype::QuantizedS8(0.1f));
  363. checker.set_before_exec_callback(AlgoChecker<PoolingForward>("CUDA_NCHW32"));
  364. param.format = Param::Format::NCHW32;
  365. for (auto mode :
  366. {Param::Mode::MAX, Param::Mode::AVERAGE,
  367. Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING}) {
  368. param.mode = mode;
  369. checker.set_epsilon(1e-3).set_rng(0, &int_rng);
  370. checker.set_param(param).exec({{64, 8, 28, 28, 32}, {}});
  371. checker.set_param(param).exec({{15, 8, 28, 28, 32}, {}});
  372. checker.set_param(param).exec({{30, 8, 28, 28, 32}, {}});
  373. }
  374. }
  375. #if MEGDNN_WITH_BENCHMARK
  376. TEST_F(CUDA, BENCHMARK_POOLING_CHWN4) {
  377. CUBenchmarker<Pooling> bencher(handle_cuda());
  378. size_t nr_times = 1000;
  379. bencher.set_times(nr_times);
  380. using Param = param::Pooling;
  381. Param param;
  382. auto run_bench = [&](size_t N, size_t C, size_t H, size_t W, size_t stride,
  383. size_t padding, size_t window,
  384. Param::Mode mode = Param::Mode::MAX) {
  385. param.mode = mode;
  386. param.pad_h = param.pad_w = padding;
  387. param.window_h = param.window_w = window;
  388. param.stride_h = param.stride_w = stride;
  389. param.format = Param::Format::NCHW4;
  390. bencher.set_dtype(0, dtype::QuantizedS8{0.1f});
  391. bencher.set_param(param);
  392. auto time_cudnn = bencher.execs({{N, C / 4, H, W, 4}, {}}) / nr_times;
  393. param.format = Param::Format::CHWN4;
  394. bencher.set_param(param);
  395. auto time_chwn4 = bencher.execs({{C / 4, H, W, N, 4}, {}}) / nr_times;
  396. auto time_nchw32 = bencher.execs({{N, C / 32, H, W, 32}, {}}) / nr_times;
  397. size_t oh = infer_conv_shape(H, window, stride, padding),
  398. ow = infer_conv_shape(W, window, stride, padding);
  399. float io = (N * C * H * W + N * C * oh * ow) * sizeof(int8_t);
  400. printf("time(cudnn)=%.2f ms, time(chwn4)=%.2f ms, time(nchw32)=%.2f "
  401. "ms, "
  402. "bandwidth(cudnn)=%.2f Gb/s, bandwidth(chwn4)=%.2f Gb/s, "
  403. "bandwidth(nchw32)=%.2f Gb/s\n",
  404. time_cudnn, time_chwn4, time_nchw32, io / (1e6 * time_cudnn),
  405. io / (1e6 * time_chwn4), io / (1e6 * time_nchw32));
  406. };
  407. run_bench(64, 64, 112, 112, 2, 1, 2);
  408. run_bench(256, 64, 112, 112, 2, 1, 2);
  409. run_bench(64, 64, 112, 112, 2, 1, 2, Param::Mode::AVERAGE);
  410. run_bench(256, 64, 112, 112, 2, 1, 2, Param::Mode::AVERAGE);
  411. run_bench(64, 64, 112, 112, 2, 1, 2, Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING);
  412. run_bench(256, 64, 112, 112, 2, 1, 2, Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING);
  413. }
  414. #endif
  415. } // namespace test
  416. } // namespace megdnn
  417. // vim: syntax=cpp.doxygen