
batch_conv_bias.cpp 17 kB

/**
 * \file dnn/test/cuda/batch_conv_bias.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#include "megdnn/basic_types.h"
#include "megdnn/dtype.h"
#include "megdnn/opr_param_defs.h"
#include "megdnn/oprs/nn.h"
#include "src/common/utils.h"
#include "src/cuda/cudnn_with_check.h"
#include "test/common/checker.h"
#include "test/common/conv_bias.h"
#include "test/cuda/benchmark.h"
#include "test/cuda/fixture.h"
#include "test/cuda/utils.h"
namespace megdnn {
namespace test {
namespace {

struct TestArg {
    param::BatchConvBias param;
    TensorShape src, filter, bias;
    TestArg(param::BatchConvBias param, TensorShape src, TensorShape filter,
            TensorShape bias)
            : param{param}, src{src}, filter{filter}, bias{bias} {}
};
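
// Sweep nonlinearity, batch, channels, spatial sizes, padding and stride to
// build int8 NCHW4 test cases for a square kernel of the given size.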
std::vector<TestArg> get_int8_nchw4_args(size_t kernel_size = 1) {
    std::vector<TestArg> args;
    using NLMode = param::BatchConvBias::NonlineMode;
    for (auto nlmode : {NLMode::IDENTITY, NLMode::RELU, NLMode::H_SWISH}) {
        for (size_t b : {1, 2}) {
            for (size_t ic : {4, 8, 16}) {
                for (size_t oc : {4, 44, 84, 132}) {
                    for (size_t h : {8, 16}) {
                        for (size_t w : {4, 8}) {
                            for (int p : {0, static_cast<int>(kernel_size / 2)}) {
                                for (size_t s : {1, 2}) {
                                    size_t f = kernel_size;
                                    param::BatchConvBias param;
                                    param.nonlineMode = nlmode;
                                    param.format =
                                            param::BatchConvBias::Format::NCHW4;
                                    param.sparse =
                                            param::BatchConvBias::Sparse::DENSE;
                                    param.pad_h = param.pad_w = p;
                                    param.stride_h = param.stride_w = s;
                                    args.emplace_back(
                                            param,
                                            TensorShape{b, ic / 4, h, w, 4},
                                            TensorShape{b, oc, ic / 4, f, f, 4},
                                            TensorShape{1, oc / 4, 1, 1, 4});
                                }
                            }
                        }
                    }
                }
            }
        }
    }
    return args;
}
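
// Cases for the GEMM-based algorithm only: 1x1 kernel, stride 1, no padding.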
std::vector<TestArg> get_int8_nchw4_args_gemm() {
    std::vector<TestArg> args;
    using NLMode = param::BatchConvBias::NonlineMode;
    for (auto nlmode : {NLMode::IDENTITY, NLMode::RELU, NLMode::H_SWISH}) {
        for (size_t b : {1, 2}) {
            for (size_t ic : {4, 8, 16}) {
                for (size_t oc : {32, 64, 128}) {
                    for (size_t h : {8, 16}) {
                        for (size_t w : {4, 8}) {
                            size_t s = 1;
                            size_t p = 0;
                            size_t f = 1;
                            param::BatchConvBias param;
                            param.nonlineMode = nlmode;
                            param.format = param::BatchConvBias::Format::NCHW4;
                            param.sparse = param::BatchConvBias::Sparse::DENSE;
                            param.pad_h = param.pad_w = p;
                            param.stride_h = param.stride_w = s;
                            args.emplace_back(
                                    param, TensorShape{b, ic / 4, h, w, 4},
                                    TensorShape{b, oc, ic / 4, f, f, 4},
                                    TensorShape{1, oc / 4, 1, 1, 4});
                        }
                    }
                }
            }
        }
    }
    return args;
}
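
// Same 1x1/stride-1 setup, but with ragged channel and spatial sizes to
// exercise the kernel's bounds checking.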
std::vector<TestArg> get_int8_nchw4_args_gemm_check_bounds() {
    std::vector<TestArg> args;
    using NLMode = param::BatchConvBias::NonlineMode;
    for (auto nlmode : {NLMode::IDENTITY, NLMode::RELU, NLMode::H_SWISH}) {
        for (size_t b : {1, 2}) {
            for (size_t ic : {4, 8, 16}) {
                for (size_t oc : {4, 40, 80}) {
                    for (size_t h : {7, 15}) {
                        for (size_t w : {3, 7}) {
                            size_t s = 1;
                            size_t p = 0;
                            size_t f = 1;
                            param::BatchConvBias param;
                            param.nonlineMode = nlmode;
                            param.format = param::BatchConvBias::Format::NCHW4;
                            param.sparse = param::BatchConvBias::Sparse::DENSE;
                            param.pad_h = param.pad_w = p;
                            param.stride_h = param.stride_w = s;
                            args.emplace_back(
                                    param, TensorShape{b, ic / 4, h, w, 4},
                                    TensorShape{b, oc, ic / 4, f, f, 4},
                                    TensorShape{1, oc / 4, 1, 1, 4});
                        }
                    }
                }
            }
        }
    }
    return args;
}
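
// Run BatchConvBiasForward over `args` with the given algorithm (or the
// default dispatch when `algo` is null), picking RNG ranges and error
// tolerances according to the source dtype.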
void check_batch_conv_bias(DType src_dtype, DType filter_dtype, DType bias_dtype,
                           DType dst_dtype, Handle* handle, const char* algo,
                           const std::vector<TestArg>& args) {
    megdnn_assert(src_dtype.enumv() == filter_dtype.enumv());
    Checker<BatchConvBiasForward> checker(handle);
    if (algo) {
        checker.set_before_exec_callback(
                AlgoChecker<BatchConvBiasForward>(algo));
    }
    std::unique_ptr<RNG> rng;
    std::unique_ptr<RNG> bias_rng;
    std::unique_ptr<RNG> const_rng;
    // TODO: check range of rng
    if (src_dtype.enumv() == DTypeEnum::QuantizedS8) {
        rng = std::make_unique<UniformIntRNG>(-3, 3);
        const_rng = std::make_unique<UniformIntRNG>(1, 1);
        megdnn_assert(bias_dtype.enumv() == DTypeEnum::QuantizedS32);
        bias_rng = std::make_unique<UniformIntRNG>(-50, 50);
        checker.set_epsilon(1 + 1e-3)
                .set_max_avg_error(1e-1)
                .set_max_avg_biased_error(1e-1);
    } else if (src_dtype.enumv() == DTypeEnum::Float16) {
        rng = std::make_unique<NormalRNG>(2.f);
        megdnn_assert(bias_dtype.enumv() == DTypeEnum::Float16);
        bias_rng = std::make_unique<NormalRNG>(2.f);
        checker.set_epsilon(1e-2);
    } else if (src_dtype.enumv() == DTypeEnum::Float32) {
        rng = std::make_unique<NormalRNG>(2.f);
        megdnn_assert(bias_dtype.enumv() == DTypeEnum::Float32);
        bias_rng = std::make_unique<NormalRNG>(2.f);
    }
    megdnn_assert(rng != nullptr && bias_rng != nullptr);
    checker.set_rng(0, rng.get())
            .set_rng(1, rng.get())
            .set_rng(2, rng.get())
            .set_rng(3, rng.get());
    for (auto&& arg : args) {
        checker.set_dtype(0, src_dtype)
                .set_dtype(1, filter_dtype)
                .set_dtype(2, bias_dtype)
                .set_dtype(4, dst_dtype)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, arg.bias, {}, {}});
    }
}

#if MEGDNN_WITH_BENCHMARK
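// n: batch; ci/co: input/output channels; hi/wi: input height/width;
// f: filter (kernel) size; s: stride.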
struct BenchArgs {
    size_t n, ci, hi, wi, co, f, s;
};
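
// Benchmark shapes drawn from a face-recognition workload; all convolutions
// are 1x1, with strides of 1 or 2.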
std::vector<BenchArgs> get_facerec_bench_args(size_t batch = 64) {
    std::vector<BenchArgs> args;
    args.emplace_back(BenchArgs{1, 4096, 64, 64, 4096, 1, 1});
    args.emplace_back(BenchArgs{batch, 128, 24, 24, 128, 1, 1});
    args.emplace_back(BenchArgs{batch, 256, 12, 12, 256, 1, 1});
    args.emplace_back(BenchArgs{batch, 512, 6, 6, 512, 1, 1});
    args.emplace_back(BenchArgs{batch, 1024, 4, 2, 1024, 1, 1});
    args.emplace_back(BenchArgs{batch, 108, 32, 32, 192, 1, 1});
    args.emplace_back(BenchArgs{batch, 192, 16, 16, 384, 1, 1});
    args.emplace_back(BenchArgs{batch, 384, 8, 8, 640, 1, 1});
    args.emplace_back(BenchArgs{batch, 108, 32, 32, 192, 1, 2});
    args.emplace_back(BenchArgs{batch, 192, 16, 16, 192, 1, 1});
    args.emplace_back(BenchArgs{batch, 192, 16, 16, 384, 1, 2});
    args.emplace_back(BenchArgs{batch, 384, 8, 8, 384, 1, 1});
    args.emplace_back(BenchArgs{batch, 384, 8, 8, 640, 1, 2});
    args.emplace_back(BenchArgs{batch, 640, 4, 4, 640, 1, 1});
    return args;
}
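
// Benchmark BatchConvBias against cuDNN's implicit-GEMM ConvBias and a
// cuBLAS batched matmul on equivalent shapes, printing per-run time and
// throughput (Tops) plus the speedup ratios.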
void benchmark_target_algo(Handle* handle, const std::vector<BenchArgs>& args,
                           DType src_dtype, DType filter_dtype, DType bias_dtype,
                           DType dst_dtype, const char* algo = nullptr,
                           param::BatchConvBias::Format format =
                                   param::BatchConvBias::Format::NCHW4) {
    megdnn_assert(src_dtype.enumv() == filter_dtype.enumv());
    megdnn_assert(format == param::BatchConvBias::Format::NCHW4);
    CUBenchmarker<BatchConvBiasForward> benchmarker(handle);
    CUBenchmarker<ConvBiasForward> benchmarker_cudnn(handle);
    CUBenchmarker<BatchedMatrixMul> benchmarker_matmul(handle);
    size_t RUNS = 1000;
    benchmarker.set_display(false).set_times(RUNS);
    benchmarker_cudnn.set_display(false).set_times(RUNS);
    benchmarker_matmul.set_display(false).set_times(RUNS);
    std::unique_ptr<OprProxy<BatchConvBiasForward>> proxy{
            new OprProxy<BatchConvBiasForward>{true}};
    if (algo) {
        benchmarker.set_before_exec_callback(
                AlgoChecker<BatchConvBiasForward>(algo));
    } else {
        benchmarker.set_proxy(proxy);
    }
#define V1(x) #x
#define V(x) V1(x)
#define CUDNN_VERSION_STRING \
    "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)
    benchmarker_cudnn.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_"
                    "GEMM" CUDNN_VERSION_STRING));
    benchmarker_matmul.set_before_exec_callback(
            AlgoChecker<BatchedMatrixMul>("BRUTE_FORCE-CUBLAS"));
    benchmarker.set_dtype(0, src_dtype)
            .set_dtype(1, filter_dtype)
            .set_dtype(2, bias_dtype)
            .set_dtype(3, dst_dtype)
            .set_dtype(4, dst_dtype);
    benchmarker_cudnn.set_dtype(0, src_dtype)
            .set_dtype(1, filter_dtype)
            .set_dtype(2, bias_dtype)
            .set_dtype(3, dst_dtype)
            .set_dtype(4, dst_dtype);
    benchmarker_matmul.set_dtype(0, src_dtype)
            .set_dtype(1, filter_dtype)
            .set_dtype(2, bias_dtype);
    using Param = ConvBias::Param;
    using Format = Param::Format;
    if (format == Format::NCHW4) {
        for (auto&& arg : args) {
            ConvBias::Param param;
            param.pad_h = param.pad_w = arg.f / 2;
            param.stride_h = param.stride_w = arg.s;
            param.format = Format::NCHW4;
            BatchConvBias::Param bparam;
            bparam.pad_h = bparam.pad_w = arg.f / 2;
            bparam.stride_h = bparam.stride_w = arg.s;
            bparam.format = Format::NCHW4;
            size_t ho = infer_conv_shape(arg.hi, arg.f, arg.s, arg.f / 2);
            size_t wo = infer_conv_shape(arg.wi, arg.f, arg.s, arg.f / 2);
            benchmarker.set_param(bparam);
            if (!algo) {
                benchmarker.proxy()->target_execution_policy.algo.reset();
            }
            auto time_in_ms =
                    benchmarker.execs(
                            {{arg.n, arg.ci / 4, arg.hi, arg.wi, 4},
                             {arg.n, arg.co, arg.ci / 4, arg.f, arg.f, 4},
                             {1, arg.co / 4, 1, 1, 4},
                             {},
                             {}}) /
                    RUNS;
            benchmarker_cudnn.set_param(param);
            auto time_in_ms_cudnn =
                    benchmarker_cudnn.execs(
                            {{arg.n, arg.ci / 4, arg.hi, arg.wi, 4},
                             {arg.co, arg.ci / 4, arg.f, arg.f, 4},
                             {1, arg.co / 4, 1, 1, 4},
                             {},
                             {}}) /
                    RUNS;
            auto time_in_ms_matmul =
                    benchmarker_matmul.execs(
                            {{arg.n, arg.co, arg.ci * arg.f * arg.f},
                             {arg.n, arg.ci * arg.f * arg.f, ho * wo},
                             {}}) /
                    RUNS;
            float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f *
                        arg.f / (1e12);
            TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
                    filter{arg.co, arg.ci, arg.f, arg.f};
            if (!algo) {
                algo = "no_name";
            }
            printf("src=%s, filter=%s, time(algo=%s)=%.2f %.2fTops, "
                   "time(cudnn)=%.2f %.2fTops, time(batched_matmul)=%.2f "
                   "%.2fTops, "
                   "perf(algo=%s)/perf(cudnn)=%.2f, "
                   "perf(algo=%s)/perf(batched_matmul)=%.2f\n",
                   src.to_string().c_str(), filter.to_string().c_str(), algo,
                   time_in_ms, (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
                   (flo / (time_in_ms_cudnn * 1e-3)), time_in_ms_matmul,
                   (flo / (time_in_ms_matmul * 1e-3)), algo,
                   time_in_ms_cudnn / time_in_ms, algo,
                   time_in_ms_matmul / time_in_ms);
        }
    }
}
#endif
}  // namespace
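
// Correctness tests. The int8 dot-product algorithms need compute
// capability 6.1 (dp4a), hence the require_compute_capability(6, 1) guards.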
TEST_F(CUDA, BATCH_CONV_BIAS_QS8) {
    require_compute_capability(6, 1);
    Checker<BatchConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<BatchConvBiasForward>(
            "BATCH_CONV_BIAS_INT8_NCHW4_IMPLICIT_GEMM_PRECOMP_DOTPROD"));
    UniformIntRNG const_rng{1, 1};
    UniformIntRNG rng{-5, 5};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(3, dtype::QuantizedS8{1.1f})
            .set_dtype(4, dtype::QuantizedS8{1.1f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::BatchConvBias param;
    param.pad_h = 2, param.pad_w = 1;
    param.stride_h = 1, param.stride_w = 2;
    param.format = param::BatchConvBias::Format::NCHW4;
    checker.set_param(param).execs({{32, 4, 24, 24, 4},
                                    {32, 32, 4, 1, 1, 4},
                                    {1, 8, 1, 1, 4},
                                    {},
                                    {}});
}
TEST_F(CUDA, BATCH_CONV_BIAS_QS8_GEMM) {
    require_compute_capability(6, 1);
    check_batch_conv_bias(dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
                          dtype::QuantizedS32{1.2f * 1.3f},
                          dtype::QuantizedS8{1.1f}, handle_cuda(),
                          "BATCH_CONV_BIAS_INT8_NCHW4_GEMM_DOTPROD",
                          get_int8_nchw4_args_gemm());
}

TEST_F(CUDA, BATCH_CONV_BIAS_QS8_GEMM_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    check_batch_conv_bias(dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
                          dtype::QuantizedS32{1.2f * 1.3f},
                          dtype::QuantizedS8{1.1f}, handle_cuda(),
                          "BATCH_CONV_BIAS_INT8_NCHW4_GEMM_DOTPROD",
                          get_int8_nchw4_args_gemm_check_bounds());
}

TEST_F(CUDA, BATCH_CONV_BIAS_QS8_IMPLICIT_GEMM) {
    require_compute_capability(6, 1);
    check_batch_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "BATCH_CONV_BIAS_INT8_NCHW4_IMPLICIT_GEMM_PRECOMP_DOTPROD",
            get_int8_nchw4_args(1));
}

#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, BENCHMARK_BATCH_CONV_BIAS_QS8) {
    require_compute_capability(6, 1);
    benchmark_target_algo(handle_cuda(), get_facerec_bench_args(128),
                          dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
                          dtype::QuantizedS32{1.2f * 1.3f},
                          dtype::QuantizedS8{1.0f}, nullptr,
                          param::ConvBias::Format::NCHW4);
}
#endif

}  // namespace test
}  // namespace megdnn

// vim: syntax=cpp.doxygen
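To run only these cases, a gtest name filter along the following lines should work; the binary name `megdnn_test` is an assumption about the local megdnn build, not something this file specifies:

    ./megdnn_test --gtest_filter='CUDA.BATCH_CONV_BIAS*'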

The MegEngine installation package ships with the CUDA environment needed to run code on a GPU, so there is no separate CPU/GPU build to choose between. To run GPU programs, make sure the machine has GPU hardware with the driver properly installed. If you would like to try deep-learning development on a cloud GPU compute platform, you are welcome to visit the MegStudio platform.