You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

reduce.cpp 16 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429
  1. #include "test/fallback/fixture.h"
  2. #include "megdnn/oprs.h"
  3. #include "test/common/benchmarker.h"
  4. #include "test/common/checker.h"
  5. #include "test/common/task_record_check.h"
  6. #include "test/common/tensor.h"
  7. #include "test/common/workspace_wrapper.h"
  8. using namespace megdnn;
  9. using namespace test;
  10. TEST_F(FALLBACK, REDUCE_FULL) {
  11. using Param = Reduce::Param;
  12. using Mode = Param::Mode;
  13. Checker<Reduce> checker(handle());
  14. UniformIntRNG rng{INT8_MIN >> 1, INT8_MAX >> 1};
  15. checker.set_rng(0, &rng);
  16. struct Config {
  17. Param param;
  18. DType dtype;
  19. TensorShape shape;
  20. Config(Param param, DType dtype, TensorShape shape)
  21. : param(param), dtype(dtype), shape(shape) {}
  22. };
  23. std::vector<Config> configs;
  24. for (auto mode : {Mode::MEAN, Mode::MAX, Mode::MIN})
  25. for (auto dtype : std::vector<DType>{
  26. dtype::Float32(), dtype::Float16(), dtype::QuantizedS8(1.3f),
  27. dtype::Quantized8Asymm(1.3f, static_cast<uint8_t>(3))})
  28. for (int32_t axis : {0, 1, 2}) {
  29. for (size_t A : {1, 3, 5, 20}) {
  30. for (size_t B : {4, 6, 9, 16, 33, 45}) {
  31. for (size_t C : {2, 3, 4, 6, 9, 16, 33, 45}) {
  32. TensorShape shape{A, B, C};
  33. Param param(mode, axis);
  34. Config config(param, dtype, shape);
  35. configs.push_back(config);
  36. }
  37. }
  38. }
  39. }
  40. for (auto&& config : configs) {
  41. auto&& dtype = config.dtype;
  42. auto&& param = config.param;
  43. auto&& shape = config.shape;
  44. checker.set_dtype(0, dtype).set_param(param).execs({shape, {}});
  45. }
  46. configs.clear();
  47. for (auto mode : {Mode::SUM, Mode::PRODUCT, Mode::SUM_SQR})
  48. for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()})
  49. for (int32_t axis : {0, 1, 2}) {
  50. for (size_t A : {1, 3, 5, 20}) {
  51. for (size_t B : {4, 6, 9, 16, 33, 45}) {
  52. for (size_t C : {2, 3, 4, 6, 9, 16, 33, 45}) {
  53. TensorShape shape{A, B, C};
  54. Param param(mode, axis);
  55. Config config(param, dtype, shape);
  56. configs.push_back(config);
  57. }
  58. }
  59. }
  60. }
  61. UniformFloatRNG rng_float(-2, 2);
  62. checker.set_rng(0, &rng_float);
  63. checker.set_epsilon(1e-1);
  64. for (auto&& config : configs) {
  65. auto&& dtype = config.dtype;
  66. auto&& param = config.param;
  67. auto&& shape = config.shape;
  68. if (dtype == dtype::Float16())
  69. checker.set_epsilon(1e-1);
  70. else
  71. checker.set_epsilon(1e-3);
  72. checker.set_dtype(0, dtype).set_param(param).execs({shape, {}});
  73. }
  74. }
  75. TEST_F(FALLBACK, REDUCE) {
  76. using Param = Reduce::Param;
  77. using Mode = Param::Mode;
  78. using DataType = Param::DataType;
  79. Checker<Reduce> checker(handle());
  80. struct Config {
  81. Param param;
  82. DType dtype;
  83. TensorShape shape;
  84. Config(Param param, DType dtype, TensorShape shape)
  85. : param(param), dtype(dtype), shape(shape) {}
  86. };
  87. std::vector<Config> configs;
  88. // general
  89. for (auto mode :
  90. {Mode::SUM, Mode::MEAN, Mode::SUM_SQR, Mode::PRODUCT, Mode::MIN, Mode::MAX})
  91. for (auto dtype : std::vector<DType>{
  92. dtype::Float16(), dtype::Float32(), dtype::Int32(), dtype::Int16(),
  93. dtype::Int8(), dtype::Uint8()})
  94. for (int32_t axis : {0, 1, 2, 3}) {
  95. TensorShape shape{2, 3, 20, 5};
  96. Param param(mode, axis);
  97. Config config(param, dtype, shape);
  98. configs.push_back(config);
  99. if (dtype.category() == DTypeCategory::FLOAT) {
  100. Param param(mode, axis, DataType::FLOAT_O16xC32);
  101. Config config(param, dtype, shape);
  102. configs.push_back(config);
  103. param.data_type = DataType::FLOAT_O32xC32;
  104. config = Config(param, dtype, shape);
  105. configs.push_back(config);
  106. } else if (dtype == dtype::Int32()) {
  107. Param param(mode, axis, DataType::FLOAT_O32xC32);
  108. Config config(param, dtype, shape);
  109. configs.push_back(config);
  110. }
  111. }
  112. // large (ABC) -> (A1C) case
  113. for (auto mode : {Mode::SUM_SQR})
  114. for (auto dtype : std::vector<DType>{dtype::Int32()})
  115. for (int32_t axis : {0, 1, 2, 3}) {
  116. TensorShape shape{2, 3, 10000, 5};
  117. Param param(mode, axis);
  118. Config config(param, dtype, shape);
  119. configs.push_back(config);
  120. }
  121. // large (AB) -> (A1) case
  122. for (auto mode : {Mode::SUM_SQR})
  123. for (auto dtype : std::vector<DType>{dtype::Int32()})
  124. for (int32_t axis : {0, 1, 2, 3}) {
  125. TensorShape shape{2, 3, 5, 10000};
  126. Param param(mode, axis);
  127. Config config(param, dtype, shape);
  128. configs.push_back(config);
  129. }
  130. {
  131. // large reduce_mean for O16C32
  132. TensorShape shape{1, 65536, 5};
  133. Param param(Mode::MEAN, 1, DataType::FLOAT_O16xC32);
  134. Config config(param, dtype::Float16(), shape);
  135. configs.push_back(config);
  136. }
  137. for (auto&& config : configs) {
  138. auto&& dtype = config.dtype;
  139. auto&& param = config.param;
  140. auto&& mode = config.param.mode;
  141. auto&& shape = config.shape;
  142. auto&& data_type = config.param.data_type;
  143. // when input/output both float16, the internal compute is float16, mode
  144. // is SUM or SUM_SQR, need set epsilon to 1e-2 to pass test
  145. if (dtype == dtype::Float16() && data_type == DataType::DEFAULT &&
  146. (mode == Mode::SUM || mode == Mode::SUM_SQR)) {
  147. checker.set_epsilon(1e-2);
  148. }
  149. checker.set_dtype(0, dtype).set_param(param).execs({shape, {}});
  150. }
  151. {
  152. static size_t N = 1 << 26;
  153. {
  154. // cpu vs naive
  155. Checker<Reduce> checker(handle());
  156. Reduce::Param param;
  157. param.axis = 0;
  158. UniformFloatRNG rng(1, 1);
  159. checker.set_param(param);
  160. checker.set_rng(0, &rng);
  161. checker.execs({{N}, {}});
  162. }
  163. {
  164. // naive vs groundtruth
  165. TensorLayout layoutN(TensorShape{N}, dtype::Float32()),
  166. layout1(TensorShape{1}, dtype::Float32());
  167. auto handle = this->handle();
  168. Tensor<float> src(handle, layoutN), dst(handle, layout1);
  169. float* ptr = src.ptr();
  170. for (size_t i = 0; i < N; ++i)
  171. ptr[i] = 1;
  172. auto opr = handle->create_operator<Reduce>();
  173. opr->param().axis = 0;
  174. auto wsize = opr->get_workspace_in_bytes(layoutN, layout1);
  175. WorkspaceWrapper workspace(handle, wsize);
  176. opr->exec(src.tensornd(), dst.tensornd(), workspace.workspace());
  177. megdnn_sync(handle);
  178. ASSERT_EQ(N, dst.ptr()[0]);
  179. }
  180. }
  181. }
  182. TEST_F(FALLBACK, REDUCE_RECORD) {
  183. using Param = Reduce::Param;
  184. using Mode = Param::Mode;
  185. using DataType = Param::DataType;
  186. TaskRecordChecker<Reduce> checker(1);
  187. struct Config {
  188. Param param;
  189. DType dtype;
  190. TensorShape shape;
  191. Config(Param param, DType dtype, TensorShape shape)
  192. : param(param), dtype(dtype), shape(shape) {}
  193. };
  194. std::vector<Config> configs;
  195. // general
  196. for (auto mode :
  197. {Mode::SUM, Mode::MEAN, Mode::SUM_SQR, Mode::PRODUCT, Mode::MIN, Mode::MAX})
  198. for (auto dtype : std::vector<DType>{
  199. dtype::Float16(), dtype::Float32(), dtype::Int32(), dtype::Int16(),
  200. dtype::Int8(), dtype::Uint8()})
  201. for (int32_t axis : {0, 1, 2, 3}) {
  202. TensorShape shape{2, 3, 20, 5};
  203. Param param(mode, axis);
  204. Config config(param, dtype, shape);
  205. configs.push_back(config);
  206. if (dtype.category() == DTypeCategory::FLOAT) {
  207. Param param(mode, axis, DataType::FLOAT_O16xC32);
  208. Config config(param, dtype, shape);
  209. configs.push_back(config);
  210. param.data_type = DataType::FLOAT_O32xC32;
  211. config = Config(param, dtype, shape);
  212. configs.push_back(config);
  213. } else if (dtype == dtype::Int32()) {
  214. Param param(mode, axis, DataType::FLOAT_O32xC32);
  215. Config config(param, dtype, shape);
  216. configs.push_back(config);
  217. }
  218. }
  219. // large (ABC) -> (A1C) case
  220. for (auto mode : {Mode::SUM_SQR})
  221. for (auto dtype : std::vector<DType>{dtype::Int32()})
  222. for (int32_t axis : {0, 1, 2, 3}) {
  223. TensorShape shape{2, 3, 10000, 5};
  224. Param param(mode, axis);
  225. Config config(param, dtype, shape);
  226. configs.push_back(config);
  227. }
  228. // large (AB) -> (A1) case
  229. for (auto mode : {Mode::SUM_SQR})
  230. for (auto dtype : std::vector<DType>{dtype::Int32()})
  231. for (int32_t axis : {0, 1, 2, 3}) {
  232. TensorShape shape{2, 3, 5, 10000};
  233. Param param(mode, axis);
  234. Config config(param, dtype, shape);
  235. configs.push_back(config);
  236. }
  237. {
  238. // large reduce_mean for O16C32
  239. TensorShape shape{1, 65536, 5};
  240. Param param(Mode::MEAN, 1, DataType::FLOAT_O16xC32);
  241. Config config(param, dtype::Float16(), shape);
  242. configs.push_back(config);
  243. }
  244. for (auto&& config : configs) {
  245. auto&& dtype = config.dtype;
  246. auto&& param = config.param;
  247. auto&& mode = config.param.mode;
  248. auto&& shape = config.shape;
  249. auto&& data_type = config.param.data_type;
  250. // when input/output both float16, the internal compute is float16, mode
  251. // is SUM or SUM_SQR, need set epsilon to 1e-2 to pass test
  252. if (dtype == dtype::Float16() && data_type == DataType::DEFAULT &&
  253. (mode == Mode::SUM || mode == Mode::SUM_SQR)) {
  254. checker.set_epsilon(1e-2);
  255. }
  256. checker.set_dtype(0, dtype).set_param(param).execs({shape, {}});
  257. }
  258. {
  259. static size_t N = 1 << 26;
  260. {
  261. // cpu vs naive
  262. TaskRecordChecker<Reduce> checker(1);
  263. Reduce::Param param;
  264. param.axis = 0;
  265. UniformFloatRNG rng(1, 1);
  266. checker.set_param(param);
  267. checker.set_rng(0, &rng);
  268. checker.execs({{N}, {}});
  269. }
  270. {
  271. // naive vs groundtruth
  272. TensorLayout layoutN(TensorShape{N}, dtype::Float32()),
  273. layout1(TensorShape{1}, dtype::Float32());
  274. auto handle = this->handle();
  275. Tensor<float> src(handle, layoutN), dst(handle, layout1);
  276. float* ptr = src.ptr();
  277. for (size_t i = 0; i < N; ++i)
  278. ptr[i] = 1;
  279. auto opr = handle->create_operator<Reduce>();
  280. opr->param().axis = 0;
  281. auto wsize = opr->get_workspace_in_bytes(layoutN, layout1);
  282. WorkspaceWrapper workspace(handle, wsize);
  283. opr->exec(src.tensornd(), dst.tensornd(), workspace.workspace());
  284. megdnn_sync(handle);
  285. ASSERT_EQ(N, dst.ptr()[0]);
  286. }
  287. }
  288. }
  289. #if MEGDNN_WITH_BENCHMARK
  290. TEST_F(FALLBACK, BENCHMARK_REDUCE_VS_CONV) {
  291. auto run = [&]() {
  292. Benchmarker<Reduce> benchmarker_reduce(handle());
  293. Benchmarker<Convolution> benchmarker_conv(handle());
  294. benchmarker_reduce.set_display(false);
  295. benchmarker_conv.set_display(false);
  296. constexpr size_t RUNS = 50;
  297. benchmarker_reduce.set_times(RUNS);
  298. benchmarker_conv.set_times(RUNS);
  299. param::Reduce param;
  300. param.axis = 3;
  301. param.mode = param::Reduce::Mode::SUM;
  302. benchmarker_reduce.set_param(param);
  303. param::Convolution param_conv;
  304. benchmarker_conv.set_param(param_conv);
  305. {
  306. TensorLayout src({24, 240, 128, 2}, dtype::Float32());
  307. auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;
  308. TensorLayout conv_src({24, 2, 240, 128}, dtype::Float32());
  309. TensorLayout conv_weight({1, 2, 1, 1}, dtype::Float32());
  310. auto conv = benchmarker_conv.execs({conv_src, conv_weight, {}}) / RUNS;
  311. printf("case 1: reduce use time %fms, convolution use time %fms\n", reduce,
  312. conv);
  313. }
  314. {
  315. TensorLayout src({24, 240, 128, 3}, dtype::Float32());
  316. auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;
  317. TensorLayout conv_src({24, 3, 240, 128}, dtype::Float32());
  318. TensorLayout conv_weight({1, 3, 1, 1}, dtype::Float32());
  319. auto conv = benchmarker_conv.execs({conv_src, conv_weight, {}}) / RUNS;
  320. printf("case 2: reduce use time %fms, convolution use time %fms\n", reduce,
  321. conv);
  322. }
  323. {
  324. TensorLayout src({24, 240, 128, 4}, dtype::Float32());
  325. auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;
  326. TensorLayout conv_src({24, 4, 240, 128}, dtype::Float32());
  327. TensorLayout conv_weight({1, 4, 1, 1}, dtype::Float32());
  328. auto conv = benchmarker_conv.execs({conv_src, conv_weight, {}}) / RUNS;
  329. printf("case 3: reduce use time %fms, convolution use time %fms\n", reduce,
  330. conv);
  331. }
  332. };
  333. run();
  334. }
  335. TEST_F(FALLBACK, BENCHMARK_REDUCE) {
  336. auto run = [&]() {
  337. Benchmarker<Reduce> benchmarker_reduce(handle());
  338. benchmarker_reduce.set_display(false);
  339. using Mode = param::Reduce::Mode;
  340. constexpr size_t RUNS = 100;
  341. benchmarker_reduce.set_times(RUNS);
  342. TensorShape small{3 * 224 * 224};
  343. TensorShape large{3 * 224 * 224 * 100};
  344. param::Reduce param;
  345. param.axis = 0;
  346. for (auto i = 224; i < 224 * 2; i++) {
  347. for (auto mode : {Mode::SUM, Mode::MEAN, Mode::SUM_SQR}) {
  348. param.mode = mode;
  349. benchmarker_reduce.set_param(param);
  350. auto reduce = benchmarker_reduce.execs({{3 * 224 * i}, {}}) / RUNS;
  351. }
  352. }
  353. param.mode = param::Reduce::Mode::SUM;
  354. benchmarker_reduce.set_param(param);
  355. printf("SUM\n");
  356. {
  357. TensorLayout src(small, dtype::Float32());
  358. auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;
  359. printf("case 1: reduce use time %fms\n", reduce);
  360. }
  361. {
  362. TensorLayout src(large, dtype::Float32());
  363. auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;
  364. printf("case 1: reduce use time %fms\n", reduce);
  365. }
  366. param.mode = param::Reduce::Mode::MEAN;
  367. benchmarker_reduce.set_param(param);
  368. printf("MEAN\n");
  369. {
  370. TensorLayout src(small, dtype::Float32());
  371. auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;
  372. printf("case 2: reduce use time %fms\n", reduce);
  373. }
  374. {
  375. TensorLayout src(large, dtype::Float32());
  376. auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;
  377. printf("case 2: reduce use time %fms\n", reduce);
  378. }
  379. param.mode = param::Reduce::Mode::SUM_SQR;
  380. benchmarker_reduce.set_param(param);
  381. printf("SUM_SQR\n");
  382. {
  383. TensorLayout src(small, dtype::Float32());
  384. auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;
  385. printf("case 3: reduce use time %fms\n", reduce);
  386. }
  387. {
  388. TensorLayout src(large, dtype::Float32());
  389. auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;
  390. printf("case 3: reduce use time %fms\n", reduce);
  391. }
  392. };
  393. run();
  394. }
  395. #endif
  396. // vim: syntax=cpp.doxygen