
batch_conv_bias.cpp

/**
 * \file dnn/test/cuda/batch_conv_bias.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#include "megdnn/basic_types.h"
#include "megdnn/dtype.h"
#include "megdnn/opr_param_defs.h"
#include "megdnn/oprs/nn.h"
#include "src/common/utils.h"
#include "src/cuda/cudnn_with_check.h"
#include "test/common/checker.h"
#include "test/common/conv_bias.h"
#include "test/cuda/benchmark.h"
#include "test/cuda/fixture.h"
#include "test/cuda/utils.h"

namespace megdnn {
namespace test {
namespace {

struct TestArg {
    param::BatchConvBias param;
    TensorShape src, filter, bias;
    TestArg(param::BatchConvBias param, TensorShape src, TensorShape filter,
            TensorShape bias)
            : param{param}, src{src}, filter{filter}, bias{bias} {}
};
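
// The generators below enumerate NCHW4 test cases: tensors pack channels in
// groups of four (src is {N, C/4, H, W, 4}), and the filter carries a leading
// batch dimension ({N, OC, IC/4, FH, FW, 4}) because BatchConvBias applies a
// distinct filter to every sample in the batch.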
std::vector<TestArg> get_int8_nchw4_args(size_t kernel_size = 1) {
    std::vector<TestArg> args;
    using NLMode = param::BatchConvBias::NonlineMode;
    for (auto nlmode : {NLMode::IDENTITY, NLMode::RELU, NLMode::H_SWISH}) {
        for (size_t b : {1, 2}) {
            for (size_t ic : {4, 8, 16}) {
                for (size_t oc : {4, 44, 84, 132}) {
                    for (size_t h : {8, 16}) {
                        for (size_t w : {4, 8}) {
                            for (int p : {0, static_cast<int>(kernel_size / 2)}) {
                                for (size_t s : {1, 2}) {
                                    size_t f = kernel_size;
                                    param::BatchConvBias param;
                                    param.nonlineMode = nlmode;
                                    param.format =
                                            param::BatchConvBias::Format::NCHW4;
                                    param.sparse =
                                            param::BatchConvBias::Sparse::DENSE;
                                    param.pad_h = param.pad_w = p;
                                    param.stride_h = param.stride_w = s;
                                    args.emplace_back(
                                            param,
                                            TensorShape{b, ic / 4, h, w, 4},
                                            TensorShape{b, oc, ic / 4, f, f, 4},
                                            TensorShape{1, oc / 4, 1, 1, 4});
                                }
                            }
                        }
                    }
                }
            }
        }
    }
    return args;
}
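
// The GEMM variant fixes f = 1, s = 1, p = 0, the shape family the GEMM-based
// kernel targets; the *_check_bounds variant additionally picks channel and
// spatial sizes that are not friendly multiples (e.g. oc = 40, h = 15) so the
// kernel's out-of-bounds handling gets exercised.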
std::vector<TestArg> get_int8_nchw4_args_gemm() {
    std::vector<TestArg> args;
    using NLMode = param::BatchConvBias::NonlineMode;
    for (auto nlmode : {NLMode::IDENTITY, NLMode::RELU, NLMode::H_SWISH}) {
        for (size_t b : {1, 2}) {
            for (size_t ic : {4, 8, 16}) {
                for (size_t oc : {32, 64, 128}) {
                    for (size_t h : {8, 16}) {
                        for (size_t w : {4, 8}) {
                            size_t s = 1;
                            size_t p = 0;
                            size_t f = 1;
                            param::BatchConvBias param;
                            param.nonlineMode = nlmode;
                            param.format = param::BatchConvBias::Format::NCHW4;
                            param.sparse = param::BatchConvBias::Sparse::DENSE;
                            param.pad_h = param.pad_w = p;
                            param.stride_h = param.stride_w = s;
                            args.emplace_back(
                                    param, TensorShape{b, ic / 4, h, w, 4},
                                    TensorShape{b, oc, ic / 4, f, f, 4},
                                    TensorShape{1, oc / 4, 1, 1, 4});
                        }
                    }
                }
            }
        }
    }
    return args;
}

std::vector<TestArg> get_int8_nchw4_args_gemm_check_bounds() {
    std::vector<TestArg> args;
    using NLMode = param::BatchConvBias::NonlineMode;
    for (auto nlmode : {NLMode::IDENTITY, NLMode::RELU, NLMode::H_SWISH}) {
        for (size_t b : {1, 2}) {
            for (size_t ic : {4, 8, 16}) {
                for (size_t oc : {4, 40, 80}) {
                    for (size_t h : {7, 15}) {
                        for (size_t w : {3, 7}) {
                            size_t s = 1;
                            size_t p = 0;
                            size_t f = 1;
                            param::BatchConvBias param;
                            param.nonlineMode = nlmode;
                            param.format = param::BatchConvBias::Format::NCHW4;
                            param.sparse = param::BatchConvBias::Sparse::DENSE;
                            param.pad_h = param.pad_w = p;
                            param.stride_h = param.stride_w = s;
                            args.emplace_back(
                                    param, TensorShape{b, ic / 4, h, w, 4},
                                    TensorShape{b, oc, ic / 4, f, f, 4},
                                    TensorShape{1, oc / 4, 1, 1, 4});
                        }
                    }
                }
            }
        }
    }
    return args;
}
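
// Runs every arg through the Checker against the reference implementation.
// Tensor indices follow the BatchConvBias execution order: 0 = src,
// 1 = filter, 2 = bias, 3 = z (residual input, unused here), 4 = dst.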
void check_batch_conv_bias(
        DType src_dtype, DType filter_dtype, DType bias_dtype, DType dst_dtype,
        Handle* handle, const char* algo, const std::vector<TestArg>& args) {
    megdnn_assert(src_dtype.enumv() == filter_dtype.enumv());
    Checker<BatchConvBiasForward> checker(handle);
    if (algo) {
        checker.set_before_exec_callback(AlgoChecker<BatchConvBiasForward>(algo));
    }
    std::unique_ptr<RNG> rng;
    std::unique_ptr<RNG> bias_rng;
    std::unique_ptr<RNG> const_rng;
    // TODO: check range of rng
    if (src_dtype.enumv() == DTypeEnum::QuantizedS8) {
        rng = std::make_unique<UniformIntRNG>(-3, 3);
        const_rng = std::make_unique<UniformIntRNG>(1, 1);
        megdnn_assert(bias_dtype.enumv() == DTypeEnum::QuantizedS32);
        bias_rng = std::make_unique<UniformIntRNG>(-50, 50);
        checker.set_epsilon(1 + 1e-3)
                .set_max_avg_error(1e-1)
                .set_max_avg_biased_error(1e-1);
    } else if (src_dtype.enumv() == DTypeEnum::Float16) {
        rng = std::make_unique<NormalRNG>(2.f);
        megdnn_assert(bias_dtype.enumv() == DTypeEnum::Float16);
        bias_rng = std::make_unique<NormalRNG>(2.f);
        checker.set_epsilon(1e-2);
    } else if (src_dtype.enumv() == DTypeEnum::Float32) {
        rng = std::make_unique<NormalRNG>(2.f);
        megdnn_assert(bias_dtype.enumv() == DTypeEnum::Float32);
        bias_rng = std::make_unique<NormalRNG>(2.f);
    }
    megdnn_assert(rng != nullptr && bias_rng != nullptr);
    checker.set_rng(0, rng.get())
            .set_rng(1, rng.get())
            .set_rng(2, rng.get())
            .set_rng(3, rng.get());
    for (auto&& arg : args) {
        checker.set_dtype(0, src_dtype)
                .set_dtype(1, filter_dtype)
                .set_dtype(2, bias_dtype)
                .set_dtype(4, dst_dtype)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, arg.bias, {}, {}});
    }
}
#if MEGDNN_WITH_BENCHMARK
struct BenchArgs {
    size_t n, ci, hi, wi, co, f, s;
};
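
// n/ci/hi/wi describe the input batch, channels, height and width; co is the
// output channel count, f the (square) filter size and s the stride.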
std::vector<BenchArgs> get_facerec_bench_args(size_t batch = 64) {
    std::vector<BenchArgs> args;
    args.emplace_back(BenchArgs{1, 4096, 64, 64, 4096, 1, 1});
    args.emplace_back(BenchArgs{batch, 128, 24, 24, 128, 1, 1});
    args.emplace_back(BenchArgs{batch, 256, 12, 12, 256, 1, 1});
    args.emplace_back(BenchArgs{batch, 512, 6, 6, 512, 1, 1});
    args.emplace_back(BenchArgs{batch, 1024, 4, 2, 1024, 1, 1});
    args.emplace_back(BenchArgs{batch, 108, 32, 32, 192, 1, 1});
    args.emplace_back(BenchArgs{batch, 192, 16, 16, 384, 1, 1});
    args.emplace_back(BenchArgs{batch, 384, 8, 8, 640, 1, 1});
    args.emplace_back(BenchArgs{batch, 108, 32, 32, 192, 1, 2});
    args.emplace_back(BenchArgs{batch, 192, 16, 16, 192, 1, 1});
    args.emplace_back(BenchArgs{batch, 192, 16, 16, 384, 1, 2});
    args.emplace_back(BenchArgs{batch, 384, 8, 8, 384, 1, 1});
    args.emplace_back(BenchArgs{batch, 384, 8, 8, 640, 1, 2});
    args.emplace_back(BenchArgs{batch, 640, 4, 4, 640, 1, 1});
    return args;
}
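
// Benchmarks the target BatchConvBias algorithm against two baselines: cuDNN's
// fused conv-bias-activation kernel and a CUBLAS batched matrix multiply with
// the same FLOP count, reporting achieved TOps and the speedup ratios.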
void benchmark_target_algo(
        Handle* handle, const std::vector<BenchArgs>& args, DType src_dtype,
        DType filter_dtype, DType bias_dtype, DType dst_dtype,
        const char* algo = nullptr,
        param::BatchConvBias::Format format = param::BatchConvBias::Format::NCHW4) {
    megdnn_assert(src_dtype.enumv() == filter_dtype.enumv());
    megdnn_assert(format == param::BatchConvBias::Format::NCHW4);
    CUBenchmarker<BatchConvBiasForward> benchmarker(handle);
    CUBenchmarker<ConvBiasForward> benchmarker_cudnn(handle);
    CUBenchmarker<BatchedMatrixMul> benchmarker_matmul(handle);
    size_t RUNS = 1000;
    benchmarker.set_display(false).set_times(RUNS);
    benchmarker_cudnn.set_display(false).set_times(RUNS);
    benchmarker_matmul.set_display(false).set_times(RUNS);
    std::unique_ptr<OprProxy<BatchConvBiasForward>> proxy{
            new OprProxy<BatchConvBiasForward>{true}};
    if (algo) {
        benchmarker.set_before_exec_callback(
                AlgoChecker<BatchConvBiasForward>(algo));
    } else {
        benchmarker.set_proxy(proxy);
    }
#define V1(x) #x
#define V(x)  V1(x)
#define CUDNN_VERSION_STRING \
    "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)
    benchmarker_cudnn.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    ConvBiasForward::algo_name<ConvBias::DefaultParam>(
                            "CUDNN:ConvBiasActivation:"
                            "CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_"
                            "GEMM" CUDNN_VERSION_STRING,
                            {})
                            .c_str()));
    benchmarker_matmul.set_before_exec_callback(AlgoChecker<BatchedMatrixMul>(
            ExecutionPolicyAlgoName{"BRUTE_FORCE", {{"CUBLAS", {}}}}));
    benchmarker.set_dtype(0, src_dtype)
            .set_dtype(1, filter_dtype)
            .set_dtype(2, bias_dtype)
            .set_dtype(3, dst_dtype)
            .set_dtype(4, dst_dtype);
    benchmarker_cudnn.set_dtype(0, src_dtype)
            .set_dtype(1, filter_dtype)
            .set_dtype(2, bias_dtype)
            .set_dtype(3, dst_dtype)
            .set_dtype(4, dst_dtype);
    benchmarker_matmul.set_dtype(0, src_dtype)
            .set_dtype(1, filter_dtype)
            .set_dtype(2, bias_dtype);
    using Param = ConvBias::Param;
    using Format = Param::Format;
    if (format == Format::NCHW4) {
        for (auto&& arg : args) {
            ConvBias::Param param;
            param.pad_h = param.pad_w = arg.f / 2;
            param.stride_h = param.stride_w = arg.s;
            param.format = Format::NCHW4;
            BatchConvBias::Param bparam;
            bparam.pad_h = bparam.pad_w = arg.f / 2;
            bparam.stride_h = bparam.stride_w = arg.s;
            bparam.format = Format::NCHW4;
            size_t ho = infer_conv_shape(arg.hi, arg.f, arg.s, arg.f / 2);
            size_t wo = infer_conv_shape(arg.wi, arg.f, arg.s, arg.f / 2);
            benchmarker.set_param(bparam);
            if (!algo) {
                benchmarker.proxy()->target_execution_policy.algo.reset();
            }
            auto time_in_ms =
                    benchmarker.execs(
                            {{arg.n, arg.ci / 4, arg.hi, arg.wi, 4},
                             {arg.n, arg.co, arg.ci / 4, arg.f, arg.f, 4},
                             {1, arg.co / 4, 1, 1, 4},
                             {},
                             {}}) /
                    RUNS;
            benchmarker_cudnn.set_param(param);
            auto time_in_ms_cudnn =
                    benchmarker_cudnn.execs(
                            {{arg.n, arg.ci / 4, arg.hi, arg.wi, 4},
                             {arg.co, arg.ci / 4, arg.f, arg.f, 4},
                             {1, arg.co / 4, 1, 1, 4},
                             {},
                             {}}) /
                    RUNS;
            auto time_in_ms_matmul =
                    benchmarker_matmul.execs(
                            {{arg.n, arg.co, arg.ci * arg.f * arg.f},
                             {arg.n, arg.ci * arg.f * arg.f, ho * wo},
                             {}}) /
                    RUNS;
            // total work in Tera-ops: 2 * N * OC * HO * WO * IC * FH * FW
            float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f * arg.f /
                        (1e12);
            TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
                    filter{arg.co, arg.ci, arg.f, arg.f};
            if (!algo) {
                algo = "no_name";
            }
            printf("src=%s, filter=%s, time(algo=%s)=%.2f %.2fTops, "
                   "time(cudnn)=%.2f %.2fTops, time(batched_matmul)=%.2f "
                   "%.2fTops, "
                   "perf(algo=%s)/perf(cudnn)=%.2f, "
                   "perf(algo=%s)/perf(batched_matmul)=%.2f\n",
                   src.to_string().c_str(), filter.to_string().c_str(), algo,
                   time_in_ms, (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
                   (flo / (time_in_ms_cudnn * 1e-3)), time_in_ms_matmul,
                   (flo / (time_in_ms_matmul * 1e-3)), algo,
                   time_in_ms_cudnn / time_in_ms, algo,
                   time_in_ms_matmul / time_in_ms);
        }
    }
}
#endif
}  // namespace
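
// The algorithms below rely on int8 dot-product instructions (dp4a), hence
// the compute capability 6.1 requirement in each test.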
TEST_F(CUDA, BATCH_CONV_BIAS_QS8) {
    require_compute_capability(6, 1);
    Checker<BatchConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<BatchConvBiasForward>(
            "BATCH_CONV_BIAS_INT8_NCHW4_IMPLICIT_GEMM_PRECOMP_DOTPROD"));
    UniformIntRNG const_rng{1, 1};
    UniformIntRNG rng{-5, 5};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(3, dtype::QuantizedS8{1.1f})
            .set_dtype(4, dtype::QuantizedS8{1.1f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::BatchConvBias param;
    param.pad_h = 2, param.pad_w = 1;
    param.stride_h = 1, param.stride_w = 2;
    param.format = param::BatchConvBias::Format::NCHW4;
    checker.set_param(param).execs(
            {{32, 4, 24, 24, 4}, {32, 32, 4, 1, 1, 4}, {1, 8, 1, 1, 4}, {}, {}});
}

TEST_F(CUDA, BATCH_CONV_BIAS_QS8_GEMM) {
    require_compute_capability(6, 1);
    check_batch_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(), "BATCH_CONV_BIAS_INT8_NCHW4_GEMM_DOTPROD",
            get_int8_nchw4_args_gemm());
}

TEST_F(CUDA, BATCH_CONV_BIAS_QS8_GEMM_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    check_batch_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(), "BATCH_CONV_BIAS_INT8_NCHW4_GEMM_DOTPROD",
            get_int8_nchw4_args_gemm_check_bounds());
}

TEST_F(CUDA, BATCH_CONV_BIAS_QS8_IMPLICIT_GEMM) {
    require_compute_capability(6, 1);
    check_batch_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "BATCH_CONV_BIAS_INT8_NCHW4_IMPLICIT_GEMM_PRECOMP_DOTPROD",
            get_int8_nchw4_args(1));
}

#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, BENCHMARK_BATCH_CONV_BIAS_QS8) {
    require_compute_capability(6, 1);
    benchmark_target_algo(
            handle_cuda(), get_facerec_bench_args(128), dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, nullptr, param::ConvBias::Format::NCHW4);
}
#endif

}  // namespace test
}  // namespace megdnn

// vim: syntax=cpp.doxygen

The MegEngine installation package already bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU or GPU build to choose between. To run GPU programs, make sure the machine actually has a GPU and that the driver is installed. If you would like to try deep-learning development on cloud GPU resources, you are welcome to visit the MegStudio platform.