You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

elemwise.cpp 16 kB


  1. /**
  2. * \file dnn/test/fallback/elemwise.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "test/fallback/fixture.h"
  12. #include <ctime>
  13. #include "test/common/checker.h"
  14. #include "test/common/elemwise.h"
  15. #include "test/common/task_record_check.h"
  16. #include "test/common/tensor.h"
  17. using namespace megdnn;
  18. using namespace test;
  19. template <typename tag>
  20. class FALLBACK_ELEMWISE : public FALLBACK {};
  21. TYPED_TEST_CASE(FALLBACK_ELEMWISE, elemwise::test_types);
  22. TYPED_TEST(FALLBACK_ELEMWISE, run) {
  23. elemwise::run_test<TypeParam>(this->handle());
  24. }
  25. TEST_F(FALLBACK, ELEMWISE_RECORD) {
  26. TaskRecordChecker<Elemwise> checker{1};
  27. checker.set_param({Elemwise::Mode::ADD});
  28. checker.set_dtype(0, dtype::Float32());
  29. checker.set_dtype(1, dtype::Float32());
  30. checker.set_dtype(2, dtype::Float32());
  31. UniformIntRNG rng{-100, 100};
  32. checker.set_rng(0, &rng);
  33. checker.set_rng(1, &rng);
  34. checker.set_rng(2, &rng);
  35. checker.execs({{10, 10, 32}, {10, 10, 32}, {}});
  36. }
  37. TEST_F(FALLBACK, ELEMWISE_FORWARD_TERNARY) {
  38. using Mode = ElemwiseForward::Param::Mode;
  39. Checker<ElemwiseForward> checker(handle());
  40. checker.set_param(Mode::FUSE_MUL_ADD3);
  41. auto run = [&] {
  42. //! nchw44
  43. checker.execs({{1, 3, 1, 1, 4}, {1, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  44. checker.execs({{1, 3, 1, 1, 4}, {2, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  45. checker.execs({{1, 8, 1, 1, 4}, {3, 8, 5, 3, 4}, {1, 8, 1, 1, 4}, {}});
  46. checker.execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  47. checker.execs({{1, 2, 1, 1, 4}, {1, 2, 5, 7, 4}, {1, 2, 1, 1, 4}, {}});
  48. //! nchw44
  49. checker.execs({{1, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {1, 3, 2, 2, 4}, {}});
  50. checker.execs({{2, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {2, 3, 2, 2, 4}, {}});
  51. checker.execs({{3, 8, 5, 3, 4}, {1, 8, 1, 1, 4}, {3, 8, 5, 3, 4}, {}});
  52. checker.execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  53. checker.execs({{1, 2, 5, 7, 4}, {1, 2, 1, 1, 4}, {1, 2, 5, 7, 4}, {}});
  54. //! nchw88
  55. checker.execs({{1, 3, 1, 1, 8}, {1, 3, 2, 2, 8}, {1, 3, 1, 1, 8}, {}});
  56. checker.execs({{1, 3, 1, 1, 8}, {2, 3, 2, 2, 8}, {1, 3, 1, 1, 8}, {}});
  57. checker.execs({{1, 8, 1, 1, 8}, {3, 8, 5, 3, 8}, {1, 8, 1, 1, 8}, {}});
  58. checker.execs({{3, 4, 5, 7, 8}, {3, 4, 5, 7, 8}, {3, 4, 5, 7, 8}, {}});
  59. checker.execs({{1, 2, 1, 1, 8}, {1, 2, 5, 7, 8}, {1, 2, 1, 1, 8}, {}});
  60. //! nchw88
  61. checker.execs({{1, 3, 2, 2, 8}, {1, 3, 1, 1, 8}, {1, 3, 2, 2, 8}, {}});
  62. checker.execs({{2, 3, 2, 2, 8}, {1, 3, 1, 1, 8}, {2, 3, 2, 2, 8}, {}});
  63. checker.execs({{3, 8, 5, 3, 8}, {1, 8, 1, 1, 8}, {3, 8, 5, 3, 8}, {}});
  64. checker.execs({{3, 4, 5, 7, 8}, {3, 4, 5, 7, 8}, {3, 4, 5, 7, 8}, {}});
  65. checker.execs({{1, 2, 5, 7, 8}, {1, 2, 1, 1, 8}, {1, 2, 5, 7, 8}, {}});
  66. checker.execs({{3, 4, 7}, {3, 4, 7}, {3, 4, 7}, {}});
  67. checker.execs({{1, 4, 1, 1}, {3, 4, 5, 7}, {1, 4, 1, 1}, {}});
  68. checker.execs({{1, 4, 1}, {3, 4, 7}, {1, 4, 1}, {}});
  69. checker.execs({{3, 4, 5, 7}, {3, 4, 5, 7}, {1, 1, 1, 1}, {}});
  70. checker.execs({{1, 7}, {1, 7}, {1, 7}, {}});
  71. checker.execs({{1, 2, 1}, {1, 2, 2}, {1, 2, 1}, {}});
  72. checker.execs({{1, 2, 2}, {1, 2, 2}, {1, 1, 1}, {}});
  73. checker.execs({{3, 4, 1}, {3, 4, 1}, {3, 4, 1}, {}});
  74. checker.execs({{3, 4, 5}, {1}, {1}, {}});
  75. checker.execs({{1}, {3, 4, 5}, {1}, {}});
  76. };
  77. // case int
  78. checker.set_dtype(0, dtype::Int8());
  79. checker.set_dtype(1, dtype::Int8());
  80. checker.set_dtype(2, dtype::Int8());
  81. run();
  82. checker.set_dtype(0, dtype::Int16());
  83. checker.set_dtype(1, dtype::Int16());
  84. checker.set_dtype(2, dtype::Int16());
  85. run();
  86. checker.set_dtype(0, dtype::Int32());
  87. checker.set_dtype(1, dtype::Int32());
  88. checker.set_dtype(2, dtype::Int32());
  89. run();
  90. // case float
  91. UniformFloatRNG rng(1e-5, 7e1);
  92. checker.set_rng(0, &rng);
  93. checker.set_epsilon(1e-5);
  94. checker.set_dtype(0, dtype::Float32());
  95. checker.set_dtype(1, dtype::Float32());
  96. checker.set_dtype(2, dtype::Float32());
  97. run();
  98. }
  99. TEST_F(FALLBACK, ELEMWISE_FORWARD_NCHW44_INT8_INT16_INT32) {
  100. using Mode = ElemwiseForward::Param::Mode;
  101. Checker<ElemwiseForward> checker(handle());
  102. auto run = [&]() {
  103. // VEC_BCAST101x not PowOp
  104. checker.set_param(Mode::ADD).execs({{1, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  105. checker.set_param(Mode::ADD).execs({{2, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  106. checker.set_param(Mode::ADD).execs({{3, 8, 5, 3, 4}, {1, 8, 1, 1, 4}, {}});
  107. checker.set_param(Mode::ADD).execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  108. checker.set_param(Mode::ADD).execs({{1, 2, 5, 7, 4}, {1, 2, 1, 1, 4}, {}});
  109. checker.set_param(Mode::RMULH).execs({{1, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  110. checker.set_param(Mode::RMULH).execs({{2, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  111. checker.set_param(Mode::RMULH).execs({{3, 8, 5, 3, 4}, {1, 8, 1, 1, 4}, {}});
  112. checker.set_param(Mode::RMULH).execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  113. checker.set_param(Mode::RMULH).execs({{1, 2, 5, 7, 4}, {1, 2, 1, 1, 4}, {}});
  114. checker.set_param(Mode::FUSE_ADD_RELU)
  115. .execs({{1, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  116. checker.set_param(Mode::FUSE_ADD_RELU)
  117. .execs({{2, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  118. checker.set_param(Mode::FUSE_ADD_RELU)
  119. .execs({{3, 8, 5, 3, 4}, {1, 8, 1, 1, 4}, {}});
  120. checker.set_param(Mode::FUSE_ADD_RELU)
  121. .execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  122. checker.set_param(Mode::FUSE_ADD_RELU)
  123. .execs({{1, 2, 5, 7, 4}, {1, 2, 1, 1, 4}, {}});
  124. // BCAST101x_VEC not PowOp
  125. checker.set_param(Mode::ADD).execs({{1, 3, 1, 1, 4}, {1, 3, 2, 2, 4}, {}});
  126. checker.set_param(Mode::ADD).execs({{1, 3, 1, 1, 4}, {2, 3, 2, 2, 4}, {}});
  127. checker.set_param(Mode::ADD).execs({{1, 8, 1, 1, 4}, {3, 8, 5, 3, 4}, {}});
  128. checker.set_param(Mode::ADD).execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  129. checker.set_param(Mode::ADD).execs({{1, 2, 1, 1, 4}, {1, 2, 5, 7, 4}, {}});
  130. checker.set_param(Mode::FUSE_ADD_RELU)
  131. .execs({{1, 3, 1, 1, 4}, {1, 3, 2, 2, 4}, {}});
  132. checker.set_param(Mode::FUSE_ADD_RELU)
  133. .execs({{1, 3, 1, 1, 4}, {2, 3, 2, 2, 4}, {}});
  134. checker.set_param(Mode::FUSE_ADD_RELU)
  135. .execs({{1, 8, 1, 1, 4}, {3, 8, 5, 3, 4}, {}});
  136. checker.set_param(Mode::FUSE_ADD_RELU)
  137. .execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  138. checker.set_param(Mode::FUSE_ADD_RELU)
  139. .execs({{1, 2, 1, 1, 4}, {1, 2, 5, 7, 4}, {}});
  140. };
  141. checker.set_dtype(0, dtype::Int8());
  142. checker.set_dtype(1, dtype::Int8());
  143. run();
  144. checker.set_dtype(0, dtype::Int16());
  145. checker.set_dtype(1, dtype::Int16());
  146. run();
  147. checker.set_dtype(0, dtype::Int32());
  148. checker.set_dtype(1, dtype::Int32());
  149. run();
  150. }
  151. TEST_F(FALLBACK, ELEMWISE_FORWARD_NCHW44_FP32) {
  152. using Mode = ElemwiseForward::Param::Mode;
  153. Checker<ElemwiseForward> checker(handle());
  154. UniformFloatRNG rng(1e-5, 7e1);
  155. checker.set_rng(0, &rng);
  156. checker.set_epsilon(1e-5);
  157. checker.set_dtype(0, dtype::Float32());
  158. checker.set_dtype(1, dtype::Float32());
  159. checker.set_param(Mode::FUSE_ADD_RELU)
  160. .execs({{1, 3, 1, 1, 4}, {1, 3, 2, 2, 4}, {}});
  161. checker.set_param(Mode::FUSE_ADD_RELU)
  162. .execs({{1, 3, 1, 1, 4}, {2, 3, 2, 2, 4}, {}});
  163. checker.set_param(Mode::FUSE_ADD_RELU)
  164. .execs({{1, 8, 1, 1, 4}, {3, 8, 5, 3, 4}, {}});
  165. checker.set_param(Mode::FUSE_ADD_RELU)
  166. .execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  167. checker.set_param(Mode::FUSE_ADD_RELU)
  168. .execs({{1, 2, 1, 1, 4}, {1, 2, 5, 7, 4}, {}});
  169. checker.set_param(Mode::FUSE_ADD_RELU)
  170. .execs({{1, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  171. checker.set_param(Mode::FUSE_ADD_RELU)
  172. .execs({{2, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  173. checker.set_param(Mode::FUSE_ADD_RELU)
  174. .execs({{3, 8, 5, 3, 4}, {1, 8, 1, 1, 4}, {}});
  175. checker.set_param(Mode::FUSE_ADD_RELU)
  176. .execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  177. checker.set_param(Mode::FUSE_ADD_RELU)
  178. .execs({{1, 2, 5, 7, 4}, {1, 2, 1, 1, 4}, {}});
  179. auto run = [&](Mode mode) {
  180. // VEC_BCAST101x
  181. checker.set_param(mode).execs({{1, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  182. checker.set_param(mode).execs({{2, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  183. checker.set_param(mode).execs({{3, 8, 5, 3, 4}, {1, 8, 1, 1, 4}, {}});
  184. checker.set_param(mode).execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  185. checker.set_param(mode).execs({{1, 2, 5, 7, 4}, {1, 2, 1, 1, 4}, {}});
  186. // BCAST101x_VEC not powOp
  187. checker.set_param(mode).execs({{1, 3, 1, 1, 4}, {1, 3, 2, 2, 4}, {}});
  188. checker.set_param(mode).execs({{1, 3, 1, 1, 4}, {2, 3, 2, 2, 4}, {}});
  189. checker.set_param(mode).execs({{1, 8, 1, 1, 4}, {3, 8, 5, 3, 4}, {}});
  190. checker.set_param(mode).execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  191. checker.set_param(mode).execs({{1, 2, 1, 1, 4}, {1, 2, 5, 7, 4}, {}});
  192. };
  193. run(Mode::ADD);
  194. run(Mode::FUSE_ADD_H_SWISH);
  195. run(Mode::FUSE_ADD_RELU);
  196. run(Mode::MAX);
  197. run(Mode::MIN);
  198. run(Mode::MUL);
  199. run(Mode::SUB);
  200. run(Mode::TRUE_DIV);
  201. run(Mode::POW);
  202. }
  203. TEST_F(FALLBACK, ELEMWISE_FORWARD_NCHW88_FP) {
  204. using Mode = ElemwiseForward::Param::Mode;
  205. Checker<ElemwiseForward> checker(handle());
  206. checker.set_param(Mode::FUSE_ADD_RELU)
  207. .execs({{1, 3, 1, 1, 8}, {1, 3, 2, 2, 8}, {}});
  208. checker.set_param(Mode::FUSE_ADD_RELU)
  209. .execs({{1, 3, 1, 1, 8}, {2, 3, 2, 2, 8}, {}});
  210. checker.set_param(Mode::FUSE_ADD_RELU)
  211. .execs({{1, 8, 1, 1, 8}, {3, 8, 5, 3, 8}, {}});
  212. checker.set_param(Mode::FUSE_ADD_RELU)
  213. .execs({{3, 4, 5, 7, 8}, {3, 4, 5, 7, 8}, {}});
  214. checker.set_param(Mode::FUSE_ADD_RELU)
  215. .execs({{1, 2, 1, 1, 8}, {1, 2, 5, 7, 8}, {}});
  216. checker.set_param(Mode::FUSE_ADD_RELU)
  217. .execs({{1, 3, 2, 2, 8}, {1, 3, 1, 1, 8}, {}});
  218. checker.set_param(Mode::FUSE_ADD_RELU)
  219. .execs({{2, 3, 2, 2, 8}, {1, 3, 1, 1, 8}, {}});
  220. checker.set_param(Mode::FUSE_ADD_RELU)
  221. .execs({{3, 8, 5, 3, 8}, {1, 8, 1, 1, 8}, {}});
  222. checker.set_param(Mode::FUSE_ADD_RELU)
  223. .execs({{3, 4, 5, 7, 8}, {3, 4, 5, 7, 8}, {}});
  224. checker.set_param(Mode::FUSE_ADD_RELU)
  225. .execs({{1, 2, 5, 7, 8}, {1, 2, 1, 1, 8}, {}});
  226. auto run = [&](Mode mode) {
  227. // VEC_BCAST101x
  228. checker.set_param(mode).execs({{1, 3, 2, 2, 8}, {1, 3, 1, 1, 8}, {}});
  229. checker.set_param(mode).execs({{2, 3, 2, 2, 8}, {1, 3, 1, 1, 8}, {}});
  230. checker.set_param(mode).execs({{3, 8, 5, 3, 8}, {1, 8, 1, 1, 8}, {}});
  231. checker.set_param(mode).execs({{3, 4, 5, 7, 8}, {3, 4, 5, 7, 8}, {}});
  232. checker.set_param(mode).execs({{1, 2, 5, 7, 8}, {1, 2, 1, 1, 8}, {}});
  233. // BCAST101x_VEC not powOp
  234. checker.set_param(mode).execs({{1, 3, 1, 1, 8}, {1, 3, 2, 2, 8}, {}});
  235. checker.set_param(mode).execs({{1, 3, 1, 1, 8}, {2, 3, 2, 2, 8}, {}});
  236. checker.set_param(mode).execs({{1, 8, 1, 1, 8}, {3, 8, 5, 3, 8}, {}});
  237. checker.set_param(mode).execs({{3, 4, 5, 7, 8}, {3, 4, 5, 7, 8}, {}});
  238. checker.set_param(mode).execs({{1, 2, 1, 1, 8}, {1, 2, 5, 7, 8}, {}});
  239. };
  240. auto run_all = [&]() {
  241. run(Mode::ADD);
  242. run(Mode::FUSE_ADD_H_SWISH);
  243. run(Mode::FUSE_ADD_RELU);
  244. run(Mode::MAX);
  245. run(Mode::MIN);
  246. run(Mode::MUL);
  247. run(Mode::SUB);
  248. run(Mode::TRUE_DIV);
  249. run(Mode::POW);
  250. };
  251. {
  252. UniformFloatRNG rng(1e-5, 7e1);
  253. checker.set_rng(0, &rng);
  254. checker.set_epsilon(1e-5);
  255. checker.set_dtype(0, dtype::Float32());
  256. checker.set_dtype(1, dtype::Float32());
  257. run_all();
  258. }
  259. }
  260. TEST_F(FALLBACK, ELEMWISE_FORWARD_N1HW_FP32_BCAST) {
  261. using Mode = ElemwiseForward::Param::Mode;
  262. Checker<ElemwiseForward> checker(handle());
  263. UniformFloatRNG rng(1e-5, 7e1);
  264. checker.set_rng(0, &rng);
  265. checker.set_epsilon(1e-5);
  266. checker.set_dtype(0, dtype::Float32());
  267. checker.set_dtype(1, dtype::Float32());
  268. //! 2 dim
  269. auto run = [&](Mode mode) {
  270. // VEC_BCASTX0X
  271. checker.set_param(mode).execs({{2, 8, 4, 4}, {2, 1, 4, 4}, {}});
  272. checker.set_param(mode).execs({{4, 21, 78}, {4, 1, 78}, {}});
  273. // BCASTX0X_VEC
  274. checker.set_param(mode).execs({{2, 1, 4, 4}, {2, 8, 4, 4}, {}});
  275. checker.set_param(mode).execs({{4, 1, 78}, {4, 21, 78}, {}});
  276. };
  277. run(Mode::ADD);
  278. run(Mode::MUL);
  279. run(Mode::SUB);
  280. }
  281. TEST_F(FALLBACK, ELEMWISE_FORWARD_TERNARY_RECORD) {
  282. using Mode = ElemwiseForward::Param::Mode;
  283. TaskRecordChecker<ElemwiseForward> checker(0);
  284. checker.set_param(Mode::FUSE_MUL_ADD3);
  285. auto run = [&] {
  286. //! nchw44
  287. checker.execs({{1, 3, 1, 1, 4}, {1, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  288. checker.execs({{1, 3, 1, 1, 4}, {2, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  289. //! nchw88
  290. checker.execs({{1, 3, 1, 1, 8}, {1, 3, 2, 2, 8}, {1, 3, 1, 1, 8}, {}});
  291. checker.execs({{1, 3, 1, 1, 8}, {2, 3, 2, 2, 8}, {1, 3, 1, 1, 8}, {}});
  292. checker.execs({{3, 4, 7}, {3, 4, 7}, {3, 4, 7}, {}});
  293. checker.execs({{1, 4, 1, 1}, {3, 4, 5, 7}, {1, 4, 1, 1}, {}});
  294. };
  295. // case int
  296. checker.set_dtype(0, dtype::Int32());
  297. checker.set_dtype(1, dtype::Int32());
  298. checker.set_dtype(2, dtype::Int32());
  299. run();
  300. // case float
  301. UniformFloatRNG rng(1e-5, 7e1);
  302. checker.set_rng(0, &rng);
  303. checker.set_epsilon(1e-5);
  304. checker.set_dtype(0, dtype::Float32());
  305. checker.set_dtype(1, dtype::Float32());
  306. checker.set_dtype(2, dtype::Float32());
  307. run();
  308. }
  309. #if MEGDNN_WITH_BENCHMARK
  310. TEST_F(FALLBACK, BENCHMARK_ELEMWISE) {
  311. auto naive_handle = create_cpu_handle(2);
  312. auto run = [&](const TensorShape& shp0, const TensorShape& shp1) {
  313. TensorShape shpo;
  314. Elemwise::deduce_shape({shp0, shp1}, shpo);
  315. Tensor<> op0(handle(), {shp0, dtype::Float32()}),
  316. op1(handle(), {shp1, dtype::Float32()}),
  317. out(handle(), {shpo, dtype::Float32()});
  318. auto opr_cur = handle()->create_operator<Elemwise>();
  319. auto opr_naive = naive_handle->create_operator<Elemwise>();
  320. opr_cur->param() = {Elemwise::Mode::ADD};
  321. opr_naive->param() = {Elemwise::Mode::ADD};
  322. auto timeit = [&](Elemwise* opr) {
  323. opr->exec({op0.tensornd(), op1.tensornd()}, out.tensornd());
  324. auto start = clock();
  325. opr->exec({op0.tensornd(), op1.tensornd()}, out.tensornd());
  326. auto stop = clock();
  327. return (stop - start) * 1e3 / CLOCKS_PER_SEC;
  328. };
  329. auto t0 = timeit(opr_cur.get()), t1 = timeit(opr_naive.get());
  330. double tot_size_gb_ms =
  331. (op0.layout().span().dist_byte() + op1.layout().span().dist_byte() +
  332. out.layout().span().dist_byte()) /
  333. 1024.0 / 1024.0 / 1024.0 * 1e3;
  334. printf("%15s+%-15s: fallback=%7.3fms,%5.2fGiB/s "
  335. "naive=%7.3fms,%5.2fGiB/s\n",
  336. shp0.to_string().c_str(), shp1.to_string().c_str(), t0,
  337. tot_size_gb_ms / t0, t1, tot_size_gb_ms / t1);
  338. };
  339. // contig
  340. run({1024, 1024, 32}, {1024, 1024, 32});
  341. // bcast 101
  342. run({1024, 1024, 32}, {1, 1024, 1});
  343. // bcast 01
  344. run({4096 * 4, 1024}, {4096 * 4, 1});
  345. // bcast 10
  346. run({4096 * 4, 1024}, {1, 1024});
  347. // non-contig, fallback to naive
  348. run({1024, 1024, 32}, {1024, 1, 32});
  349. }
  350. #endif
  351. // vim: syntax=cpp.doxygen