batch_conv_bias.cpp

#include "megdnn/basic_types.h"
#include "megdnn/dtype.h"
#include "megdnn/opr_param_defs.h"
#include "megdnn/oprs/nn.h"
#include "src/common/utils.h"
#include "src/cuda/cudnn_with_check.h"
#include "test/common/checker.h"
#include "test/common/conv_bias.h"
#include "test/cuda/benchmark.h"
#include "test/cuda/fixture.h"
#include "test/cuda/utils.h"

namespace megdnn {
namespace test {
namespace {

struct TestArg {
    param::BatchConvBias param;
    TensorShape src, filter, bias;

    TestArg(param::BatchConvBias param, TensorShape src, TensorShape filter,
            TensorShape bias)
            : param{param}, src{src}, filter{filter}, bias{bias} {}
};
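
// Generate int8 NCHW4 test cases for the given kernel size, sweeping
// nonlinearity mode, batch, channels, spatial size, padding and stride.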
std::vector<TestArg> get_int8_nchw4_args(size_t kernel_size = 1) {
    std::vector<TestArg> args;
    using NLMode = param::BatchConvBias::NonlineMode;
    for (auto nlmode : {NLMode::IDENTITY, NLMode::RELU, NLMode::H_SWISH}) {
        for (size_t b : {1, 2}) {
            for (size_t ic : {4, 8, 16}) {
                for (size_t oc : {4, 44, 84, 132}) {
                    for (size_t h : {8, 16}) {
                        for (size_t w : {4, 8}) {
                            for (int p : {0, static_cast<int>(kernel_size / 2)}) {
                                for (size_t s : {1, 2}) {
                                    size_t f = kernel_size;
                                    param::BatchConvBias param;
                                    param.nonlineMode = nlmode;
                                    param.format =
                                            param::BatchConvBias::Format::NCHW4;
                                    param.sparse =
                                            param::BatchConvBias::Sparse::DENSE;
                                    param.pad_h = param.pad_w = p;
                                    param.stride_h = param.stride_w = s;
                                    args.emplace_back(
                                            param,
                                            TensorShape{b, ic / 4, h, w, 4},
                                            TensorShape{b, oc, ic / 4, f, f, 4},
                                            TensorShape{1, oc / 4, 1, 1, 4});
                                }
                            }
                        }
                    }
                }
            }
        }
    }
    return args;
}
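
// GEMM-like cases: 1x1 kernel, unit stride, no padding, and output channel
// counts that are multiples of 32.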
std::vector<TestArg> get_int8_nchw4_args_gemm() {
    std::vector<TestArg> args;
    using NLMode = param::BatchConvBias::NonlineMode;
    for (auto nlmode : {NLMode::IDENTITY, NLMode::RELU, NLMode::H_SWISH}) {
        for (size_t b : {1, 2}) {
            for (size_t ic : {4, 8, 16}) {
                for (size_t oc : {32, 64, 128}) {
                    for (size_t h : {8, 16}) {
                        for (size_t w : {4, 8}) {
                            size_t s = 1;
                            size_t p = 0;
                            size_t f = 1;
                            param::BatchConvBias param;
                            param.nonlineMode = nlmode;
                            param.format = param::BatchConvBias::Format::NCHW4;
                            param.sparse = param::BatchConvBias::Sparse::DENSE;
                            param.pad_h = param.pad_w = p;
                            param.stride_h = param.stride_w = s;
                            args.emplace_back(
                                    param, TensorShape{b, ic / 4, h, w, 4},
                                    TensorShape{b, oc, ic / 4, f, f, 4},
                                    TensorShape{1, oc / 4, 1, 1, 4});
                        }
                    }
                }
            }
        }
    }
    return args;
}
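
// Bounds-checking cases: channel counts and spatial sizes that do not divide
// the tile sizes evenly, to exercise out-of-bounds handling in the kernels.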
std::vector<TestArg> get_int8_nchw4_args_gemm_check_bounds() {
    std::vector<TestArg> args;
    using NLMode = param::BatchConvBias::NonlineMode;
    for (auto nlmode : {NLMode::IDENTITY, NLMode::RELU, NLMode::H_SWISH}) {
        for (size_t b : {1, 2}) {
            for (size_t ic : {4, 8, 16}) {
                for (size_t oc : {4, 40, 80}) {
                    for (size_t h : {7, 15}) {
                        for (size_t w : {3, 7}) {
                            size_t s = 1;
                            size_t p = 0;
                            size_t f = 1;
                            param::BatchConvBias param;
                            param.nonlineMode = nlmode;
                            param.format = param::BatchConvBias::Format::NCHW4;
                            param.sparse = param::BatchConvBias::Sparse::DENSE;
                            param.pad_h = param.pad_w = p;
                            param.stride_h = param.stride_w = s;
                            args.emplace_back(
                                    param, TensorShape{b, ic / 4, h, w, 4},
                                    TensorShape{b, oc, ic / 4, f, f, 4},
                                    TensorShape{1, oc / 4, 1, 1, 4});
                        }
                    }
                }
            }
        }
    }
    return args;
}
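
// Run the checker over all test cases, picking dtype-appropriate RNGs and
// error tolerances; `algo` (optional) pins a specific algorithm by name.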
void check_batch_conv_bias(
        DType src_dtype, DType filter_dtype, DType bias_dtype, DType dst_dtype,
        Handle* handle, const char* algo, const std::vector<TestArg>& args) {
    megdnn_assert(src_dtype.enumv() == filter_dtype.enumv());
    Checker<BatchConvBiasForward> checker(handle);
    if (algo) {
        checker.set_before_exec_callback(AlgoChecker<BatchConvBiasForward>(algo));
    }
    std::unique_ptr<RNG> rng;
    std::unique_ptr<RNG> bias_rng;
    std::unique_ptr<RNG> const_rng;
    // TODO: check range of rng
    if (src_dtype.enumv() == DTypeEnum::QuantizedS8) {
        rng = std::make_unique<UniformIntRNG>(-3, 3);
        const_rng = std::make_unique<UniformIntRNG>(1, 1);
        megdnn_assert(bias_dtype.enumv() == DTypeEnum::QuantizedS32);
        bias_rng = std::make_unique<UniformIntRNG>(-50, 50);
        checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1).set_max_avg_biased_error(
                1e-1);
    } else if (src_dtype.enumv() == DTypeEnum::Float16) {
        rng = std::make_unique<NormalRNG>(2.f);
        megdnn_assert(bias_dtype.enumv() == DTypeEnum::Float16);
        bias_rng = std::make_unique<NormalRNG>(2.f);
        checker.set_epsilon(1e-2);
    } else if (src_dtype.enumv() == DTypeEnum::Float32) {
        rng = std::make_unique<NormalRNG>(2.f);
        megdnn_assert(bias_dtype.enumv() == DTypeEnum::Float32);
        bias_rng = std::make_unique<NormalRNG>(2.f);
    }
    megdnn_assert(rng != nullptr && bias_rng != nullptr);
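    // Tensor 2 is the bias (exec order: src, filter, bias, z, dst), so it is
    // fed from bias_rng; src, filter and z share the input rng.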
    checker.set_rng(0, rng.get())
            .set_rng(1, rng.get())
            .set_rng(2, bias_rng.get())
            .set_rng(3, rng.get());
    for (auto&& arg : args) {
        checker.set_dtype(0, src_dtype)
                .set_dtype(1, filter_dtype)
                .set_dtype(2, bias_dtype)
                .set_dtype(4, dst_dtype)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, arg.bias, {}, {}});
    }
}

#if MEGDNN_WITH_BENCHMARK
struct BenchArgs {
    size_t n, ci, hi, wi, co, f, s;
};
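
// Layer shapes taken from a face-recognition network; `batch` scales every
// layer except the first, which stays at batch 1.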
std::vector<BenchArgs> get_facerec_bench_args(size_t batch = 64) {
    std::vector<BenchArgs> args;
    args.emplace_back(BenchArgs{1, 4096, 64, 64, 4096, 1, 1});
    args.emplace_back(BenchArgs{batch, 128, 24, 24, 128, 1, 1});
    args.emplace_back(BenchArgs{batch, 256, 12, 12, 256, 1, 1});
    args.emplace_back(BenchArgs{batch, 512, 6, 6, 512, 1, 1});
    args.emplace_back(BenchArgs{batch, 1024, 4, 2, 1024, 1, 1});
    args.emplace_back(BenchArgs{batch, 108, 32, 32, 192, 1, 1});
    args.emplace_back(BenchArgs{batch, 192, 16, 16, 384, 1, 1});
    args.emplace_back(BenchArgs{batch, 384, 8, 8, 640, 1, 1});
    args.emplace_back(BenchArgs{batch, 108, 32, 32, 192, 1, 2});
    args.emplace_back(BenchArgs{batch, 192, 16, 16, 192, 1, 1});
    args.emplace_back(BenchArgs{batch, 192, 16, 16, 384, 1, 2});
    args.emplace_back(BenchArgs{batch, 384, 8, 8, 384, 1, 1});
    args.emplace_back(BenchArgs{batch, 384, 8, 8, 640, 1, 2});
    args.emplace_back(BenchArgs{batch, 640, 4, 4, 640, 1, 1});
    return args;
}
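
// Benchmark the batched conv-bias kernel against cuDNN's fused
// conv-bias-activation and a cuBLAS batched matmul of the equivalent im2col
// problem ([co, ci*f*f] x [ci*f*f, ho*wo] per batch), reporting TOps and
// speedup ratios.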
void benchmark_target_algo(
        Handle* handle, const std::vector<BenchArgs>& args, DType src_dtype,
        DType filter_dtype, DType bias_dtype, DType dst_dtype,
        const char* algo = nullptr,
        param::BatchConvBias::Format format = param::BatchConvBias::Format::NCHW4) {
    megdnn_assert(src_dtype.enumv() == filter_dtype.enumv());
    megdnn_assert(format == param::BatchConvBias::Format::NCHW4);
    CUBenchmarker<BatchConvBiasForward> benchmarker(handle);
    CUBenchmarker<ConvBiasForward> benchmarker_cudnn(handle);
    CUBenchmarker<BatchedMatrixMul> benchmarker_matmul(handle);
    size_t RUNS = 1000;
    benchmarker.set_display(false).set_times(RUNS);
    benchmarker_cudnn.set_display(false).set_times(RUNS);
    benchmarker_matmul.set_display(false).set_times(RUNS);
    std::unique_ptr<OprProxy<BatchConvBiasForward>> proxy{
            new OprProxy<BatchConvBiasForward>{true}};
    if (algo) {
        benchmarker.set_before_exec_callback(AlgoChecker<BatchConvBiasForward>(algo));
    } else {
        benchmarker.set_proxy(proxy);
    }
#define V1(x) #x
#define V(x)  V1(x)
#define CUDNN_VERSION_STRING \
    "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)
    benchmarker_cudnn.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    ConvBiasForward::algo_name<ConvBias::DefaultParam>(
                            "CUDNN:ConvBiasActivation:"
                            "CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_"
                            "GEMM" CUDNN_VERSION_STRING,
                            {})
                            .c_str()));
    benchmarker_matmul.set_before_exec_callback(AlgoChecker<BatchedMatrixMul>(
            ExecutionPolicyAlgoName{"BRUTE_FORCE", {{"CUBLAS", {}}}}));
    benchmarker.set_dtype(0, src_dtype)
            .set_dtype(1, filter_dtype)
            .set_dtype(2, bias_dtype)
            .set_dtype(3, dst_dtype)
            .set_dtype(4, dst_dtype);
    benchmarker_cudnn.set_dtype(0, src_dtype)
            .set_dtype(1, filter_dtype)
            .set_dtype(2, bias_dtype)
            .set_dtype(3, dst_dtype)
            .set_dtype(4, dst_dtype);
    benchmarker_matmul.set_dtype(0, src_dtype)
            .set_dtype(1, filter_dtype)
            .set_dtype(2, bias_dtype);
    using Param = ConvBias::Param;
    using Format = Param::Format;
    if (format == Format::NCHW4) {
        for (auto&& arg : args) {
            ConvBias::Param param;
            param.pad_h = param.pad_w = arg.f / 2;
            param.stride_h = param.stride_w = arg.s;
            param.format = Format::NCHW4;
            BatchConvBias::Param bparam;
            bparam.pad_h = bparam.pad_w = arg.f / 2;
            bparam.stride_h = bparam.stride_w = arg.s;
            bparam.format = Format::NCHW4;
            size_t ho = infer_conv_shape(arg.hi, arg.f, arg.s, arg.f / 2);
            size_t wo = infer_conv_shape(arg.wi, arg.f, arg.s, arg.f / 2);
            benchmarker.set_param(bparam);
            if (!algo) {
                benchmarker.proxy()->target_execution_policy.algo.reset();
            }
            auto time_in_ms = benchmarker.execs(
                                      {{arg.n, arg.ci / 4, arg.hi, arg.wi, 4},
                                       {arg.n, arg.co, arg.ci / 4, arg.f, arg.f, 4},
                                       {1, arg.co / 4, 1, 1, 4},
                                       {},
                                       {}}) /
                              RUNS;
            benchmarker_cudnn.set_param(param);
            auto time_in_ms_cudnn = benchmarker_cudnn.execs(
                                            {{arg.n, arg.ci / 4, arg.hi, arg.wi, 4},
                                             {arg.co, arg.ci / 4, arg.f, arg.f, 4},
                                             {1, arg.co / 4, 1, 1, 4},
                                             {},
                                             {}}) /
                                    RUNS;
            auto time_in_ms_matmul =
                    benchmarker_matmul.execs(
                            {{arg.n, arg.co, arg.ci * arg.f * arg.f},
                             {arg.n, arg.ci * arg.f * arg.f, ho * wo},
                             {}}) /
                    RUNS;
            float flo =
                    2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f * arg.f / (1e12);
            TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
                    filter{arg.co, arg.ci, arg.f, arg.f};
            if (!algo) {
                algo = "no_name";
            }
            printf("src=%s, filter=%s, time(algo=%s)=%.2f %.2fTops, "
                   "time(cudnn)=%.2f %.2fTops, time(batched_matmul)=%.2f "
                   "%.2fTops, "
                   "perf(algo=%s)/perf(cudnn)=%.2f, "
                   "perf(algo=%s)/perf(batched_matmul)=%.2f\n",
                   src.to_string().c_str(), filter.to_string().c_str(), algo,
                   time_in_ms, (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
                   (flo / (time_in_ms_cudnn * 1e-3)), time_in_ms_matmul,
                   (flo / (time_in_ms_matmul * 1e-3)), algo,
                   time_in_ms_cudnn / time_in_ms, algo, time_in_ms_matmul / time_in_ms);
        }
    }
}
#endif

} // namespace
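
// Correctness tests. All int8 NCHW4 batched conv-bias algorithms here use
// dot-product (dp4a) instructions, hence the compute capability 6.1
// requirement.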
TEST_F(CUDA, BATCH_CONV_BIAS_QS8) {
    require_compute_capability(6, 1);
    Checker<BatchConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<BatchConvBiasForward>(
            "BATCH_CONV_BIAS_INT8_NCHW4_IMPLICIT_GEMM_PRECOMP_DOTPROD"));
    UniformIntRNG const_rng{1, 1};
    UniformIntRNG rng{-5, 5};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(3, dtype::QuantizedS8{1.1f})
            .set_dtype(4, dtype::QuantizedS8{1.1f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::BatchConvBias param;
    param.pad_h = 2, param.pad_w = 1;
    param.stride_h = 1, param.stride_w = 2;
    param.format = param::BatchConvBias::Format::NCHW4;
    checker.set_param(param).execs(
            {{32, 4, 24, 24, 4}, {32, 32, 4, 1, 1, 4}, {1, 8, 1, 1, 4}, {}, {}});
}

TEST_F(CUDA, BATCH_CONV_BIAS_QS8_GEMM) {
    require_compute_capability(6, 1);
    check_batch_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f}, handle_cuda(),
            "BATCH_CONV_BIAS_INT8_NCHW4_GEMM_DOTPROD", get_int8_nchw4_args_gemm());
}

TEST_F(CUDA, BATCH_CONV_BIAS_QS8_GEMM_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    check_batch_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f}, handle_cuda(),
            "BATCH_CONV_BIAS_INT8_NCHW4_GEMM_DOTPROD",
            get_int8_nchw4_args_gemm_check_bounds());
}

TEST_F(CUDA, BATCH_CONV_BIAS_QS8_IMPLICIT_GEMM) {
    require_compute_capability(6, 1);
    check_batch_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f}, handle_cuda(),
            "BATCH_CONV_BIAS_INT8_NCHW4_IMPLICIT_GEMM_PRECOMP_DOTPROD",
            get_int8_nchw4_args(1));
}

#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, BENCHMARK_BATCH_CONV_BIAS_QS8) {
    require_compute_capability(6, 1);
    benchmark_target_algo(
            handle_cuda(), get_facerec_bench_args(128), dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, nullptr, param::ConvBias::Format::NCHW4);
}
#endif

} // namespace test
} // namespace megdnn

// vim: syntax=cpp.doxygen