You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

relayout_format.cpp 8.2 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216
  1. /**
  2. * \file dnn/test/cuda/relayout_format.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #include "megdnn/dtype.h"
  13. #include "megdnn/oprs.h"
  14. #include "test/common/benchmarker.h"
  15. #include "test/common/checker.h"
  16. #include "test/common/rng.h"
  17. #include "test/cuda/fixture.h"
  18. using namespace megdnn;
  19. using namespace test;
  20. TEST_F(CUDA, RELAYOUT_FORMAT) {
  21. Checker<RelayoutFormat> checker(handle_cuda());
  22. UniformIntRNG rng{-50, 50};
  23. param::RelayoutFormat param;
  24. param.mode = param::RelayoutFormat::Mode::NCHW4_CHWN4;
  25. checker.set_dtype(0, dtype::QuantizedS8{0.1f})
  26. .set_dtype(1, dtype::QuantizedS8{0.1f})
  27. .set_rng(0, &rng)
  28. .set_param(param)
  29. .execs({{22, 23, 24, 25, 4}, {}});
  30. param.mode = param::RelayoutFormat::Mode::CHWN4_NCHW4;
  31. checker.execs({{22, 23, 24, 25, 4}, {}});
  32. }
  33. TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4) {
  34. Checker<RelayoutFormat> checker(handle_cuda());
  35. UniformIntRNG rng{0, 50};
  36. param::RelayoutFormat param;
  37. param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4;
  38. for (size_t n : {1, 3}) {
  39. for (size_t c : {1, 2, 3, 4, 8, 9, 11, 16}) {
  40. for (size_t h : {3, 7, 12, 16, 22, 59, 83}) {
  41. for (size_t w : {3, 22, 63, 128, 256}) {
  42. checker.set_dtype(0, dtype::QuantizedS8{1.f})
  43. .set_dtype(1, dtype::QuantizedS8{1.f})
  44. .set_rng(0, &rng)
  45. .set_param(param)
  46. .execs({{n, c, h, w}, {}});
  47. checker.set_dtype(0, dtype::QuantizedS8{1.f})
  48. .set_dtype(1, dtype::QuantizedS8{2.f})
  49. .set_rng(0, &rng)
  50. .set_param(param)
  51. .execs({{n, c, h, w}, {}});
  52. }
  53. }
  54. }
  55. }
  56. checker.set_dtype(0, dtype::QuantizedS8{1.f})
  57. .set_dtype(1, dtype::QuantizedS8{1.f})
  58. .set_rng(0, &rng)
  59. .set_param(param)
  60. .execs({{8, 3, 224, 224}, {}});
  61. checker.set_dtype(0, dtype::QuantizedS8{1.f})
  62. .set_dtype(1, dtype::QuantizedS8{1.f})
  63. .set_rng(0, &rng)
  64. .set_param(param)
  65. .execs({{8, 3, 600, 600}, {}});
  66. checker.set_dtype(0, dtype::QuantizedS8{1.f})
  67. .set_dtype(1, dtype::QuantizedS8{1.f})
  68. .set_rng(0, &rng)
  69. .set_param(param)
  70. .execs({{1, 6, 768, 1280}, {}});
  71. }
  72. TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4_DEFAULT) {
  73. Checker<RelayoutFormat> checker(handle_cuda());
  74. UniformIntRNG rng{0, 50};
  75. param::RelayoutFormat param;
  76. param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4;
  77. for (size_t n : {1, 3}) {
  78. for (size_t c : {1, 2, 3, 4, 8, 9, 11, 16}) {
  79. for (size_t h : {3, 7, 12, 16, 59, 83}) {
  80. for (size_t w : {3, 63, 128, 256}) {
  81. checker.set_dtype(0, dtype::Quantized8Asymm{1.f, 128})
  82. .set_dtype(1, dtype::QuantizedS8{1.f})
  83. .set_rng(0, &rng)
  84. .set_param(param)
  85. .execs({{n, c, h, w}, {}});
  86. }
  87. }
  88. }
  89. }
  90. }
  91. TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4_U8) {
  92. Checker<RelayoutFormat> checker(handle_cuda());
  93. UniformIntRNG rng{0, 255};
  94. param::RelayoutFormat param;
  95. param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4;
  96. for (size_t n : {1, 3}) {
  97. for (size_t c : {1, 2, 3, 4, 8, 9, 11, 16}) {
  98. for (size_t h : {3, 7, 12, 16, 59, 83}) {
  99. for (size_t w : {3, 13, 3 * 4, 63 * 4, 128 * 4, 256 * 4}) {
  100. checker.set_dtype(0, dtype::Uint8())
  101. .set_dtype(1, dtype::QuantizedS8{1.f})
  102. .set_rng(0, &rng)
  103. .set_param(param)
  104. .execs({{n, c, h, w}, {}});
  105. checker.set_dtype(0, dtype::Quantized8Asymm{1.f, 128})
  106. .set_dtype(1, dtype::QuantizedS8{1.f})
  107. .set_rng(0, &rng)
  108. .set_param(param)
  109. .execs({{n, c, h, w}, {}});
  110. checker.set_dtype(0, dtype::Uint8())
  111. .set_dtype(1, dtype::QuantizedS8{2.5f})
  112. .set_rng(0, &rng)
  113. .set_param(param)
  114. .execs({{n, c, h, w}, {}});
  115. }
  116. }
  117. }
  118. }
  119. }
  120. TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4_IC_SMALL) {
  121. Checker<RelayoutFormat> checker(handle_cuda());
  122. UniformIntRNG rng{0, 50};
  123. param::RelayoutFormat param;
  124. param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4_IC_SMALL;
  125. checker.set_dtype(0, dtype::QuantizedS8{1.f})
  126. .set_dtype(1, dtype::QuantizedS8{1.f})
  127. .set_rng(0, &rng)
  128. .set_param(param)
  129. .execs({{8, 3, 768, 1280}, {}});
  130. }
  131. #if MEGDNN_WITH_BENCHMARK
  132. TEST_F(CUDA, BENCHMARK_RELAYOUT_FORMAT) {
  133. using Param = RelayoutFormat::Param;
  134. auto run = [&](const TensorShapeArray& shapes, Param param,
  135. Param default_param) {
  136. Benchmarker<RelayoutFormat> benchmarker(handle_cuda());
  137. benchmarker.set_param(param);
  138. benchmarker.set_dtype(0, dtype::QuantizedS8{1.f})
  139. .set_dtype(1, dtype::QuantizedS8{1.f});
  140. Benchmarker<RelayoutFormat> benchmarker_default(handle_cuda());
  141. benchmarker_default.set_param(default_param);
  142. benchmarker_default.set_dtype(0, dtype::QuantizedS8{1.f})
  143. .set_dtype(1, dtype::QuantizedS8{1.f});
  144. for (auto&& shape : shapes) {
  145. double memaccess = (double(shape.total_nr_elems()) +
  146. double(shape[0]) * ((shape[1] + 3) / 4 * 4) *
  147. shape[2] * shape[3]) *
  148. 1e-6;
  149. auto time_ms = benchmarker.execs({shape, {}});
  150. if (shape[1] <= 4) {
  151. auto time_default_ms = benchmarker_default.execs({shape, {}});
  152. printf("execute %s, time %.4f ms, %.4f GB/s, default %.4f "
  153. "GB/s\n",
  154. shape.to_string().c_str(), time_ms, memaccess / time_ms,
  155. memaccess / time_default_ms);
  156. } else {
  157. printf("execute %s, time %.4f ms, %.4f GB/s\n",
  158. shape.to_string().c_str(), time_ms, memaccess / time_ms);
  159. }
  160. }
  161. };
  162. TensorShapeArray shapes = {
  163. {8, 1, 768, 1280}, {8, 3, 768, 1280}, {8, 3, 224, 224},
  164. {8, 4, 768, 1280}, {64, 3, 768, 1280},
  165. };
  166. {
  167. Param param;
  168. param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4;
  169. Param default_param;
  170. default_param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4_IC_SMALL;
  171. run(shapes, param, default_param);
  172. }
  173. }
  174. #endif
  175. TEST_F(CUDA, RELAYOUT_FORMAT_NCHW4) {
  176. Checker<RelayoutFormat> checker(handle_cuda());
  177. UniformIntRNG rng{-50, 50};
  178. param::RelayoutFormat param;
  179. param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4_IC_SMALL;
  180. for (DType dtype :
  181. std::vector<DType>({dtype::QuantizedS8{0.1f}, dtype::Float32{}})) {
  182. checker.set_dtype(0, dtype).set_dtype(1, dtype).set_rng(0, &rng);
  183. checker.set_param(param).execs({{2, 4, 35, 36}, {}});
  184. checker.set_param(param).execs({{2, 3, 35, 36}, {}});
  185. checker.set_param(param).execs({{2, 1, 35, 36}, {}});
  186. param.mode = param::RelayoutFormat::Mode::
  187. NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT;
  188. checker.set_param(param).execs({{4, 3, 3, 3}, {}});
  189. checker.set_param(param).execs({{4, 4, 3, 3}, {}});
  190. checker.set_param(param).execs({{1, 4, 3, 3}, {}});
  191. }
  192. }
  193. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台