
algo.cpp

/**
 * \file dnn/src/cuda/conv_bias/algo.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "src/cuda/conv_bias/algo.h"
#include "src/cuda/utils.h"

using namespace megdnn;
using namespace cuda;
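
//! AlgoPack collects every conv_bias algorithm implementation available in the
//! CUDA backend. The constructor below registers each implementation into
//! all_algos (and into non_cudnn_algos for the non-cuDNN subset) so they can be
//! enumerated and selected at runtime.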
ConvBiasForwardImpl::AlgoPack::AlgoPack() {
    non_cudnn_algos.push_back(&chanwise);
    non_cudnn_algos.push_back(&chanwise_small);
    non_cudnn_algos.push_back(&inplace_matmul);
    non_cudnn_algos.push_back(&matmul);
    non_cudnn_algos.push_back(&matmul8x8x32);
    non_cudnn_algos.push_back(&batched_matmul);
    non_cudnn_algos.push_back(&a1x1);

    fill_cudnn_algos();
    for (auto&& algo : cudnn_conv_bias_activations) {
        all_algos.push_back(&algo);
    }

    //! add conv+nonlinear algos
    std::vector<AlgoBase*> conv_algos;
    conv_algos.push_back(&chanwise);
    conv_algos.push_back(&chanwise_small);
    conv_algos.push_back(&chanwise8x8x32);
    for (auto&& algo : cudnn_convs) {
        conv_algos.push_back(&algo);
    }
    conv_algos.push_back(&inplace_matmul);
    conv_algos.push_back(&matmul);
    conv_algos.push_back(&matmul8x8x32);
    conv_algos.push_back(&batched_matmul);
    conv_algos.push_back(&a1x1);
    conv_algos.reserve(conv_algos.size() * 2);

    //! add gconv algos by AlgoGroupConvGeneral
    size_t algo_size = conv_algos.size();
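    //! note: the loop starts at index 3, skipping the three channel-wise
    //! entries above, presumably because channel-wise convolution already
    //! operates per group and does not need the AlgoGroupConvGeneral wrapper.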
    for (size_t i = 3; i < algo_size; ++i) {
        gconv_refhold.emplace_back(new AlgoGroupConvGeneral(conv_algos[i]));
        algo2gconv[conv_algos[i]] = gconv_refhold.back().get();
        conv_algos.push_back(gconv_refhold.back().get());
    }
    for (auto&& algo : conv_algos) {
        all_algos.push_back(algo);
    }
    non_cudnn_algos.push_back(all_algos.rbegin()[4]);  // group inplace_matmul
    non_cudnn_algos.push_back(all_algos.rbegin()[3]);  // group matmul
    non_cudnn_algos.push_back(all_algos.rbegin()[2]);  // group matmul_8x8x32
    non_cudnn_algos.push_back(all_algos.rbegin()[1]);  // group batched_matmul
    non_cudnn_algos.push_back(all_algos.rbegin()[0]);  // group 1x1
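
    //! wrap every algorithm registered so far with an AlgoBFloat16 adapter; the
    //! wrapper is expected to convert BFloat16 tensors to Float32, run the
    //! underlying algorithm, and convert the result back.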
    algo_size = all_algos.size();
    for (size_t i = 0; i < algo_size; ++i) {
        bfloat16_refhold.emplace_back(new AlgoBFloat16(all_algos[i]));
        all_algos.push_back(bfloat16_refhold.back().get());
        bfloat16_algos.push_back(bfloat16_refhold.back().get());
    }

    size_t all_algo_size = all_algos.size();
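    //! algorithms appended from this point on (the tensor-core IMMA variants
    //! when CUDA >= 10.0, plus the int8 dp4a implicit-GEMM algorithms) are also
    //! collected into non_cudnn_algos by the loop at the end of this constructor.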
#if CUDA_VERSION >= 10000
    fill_imma_algos();
    all_algos.push_back(&wmma_quint4x4x32);
    for (auto&& algo : int8_nchw4_imma) {
        all_algos.push_back(&algo);
    }
    for (auto&& algo : int8_chwn4_imma) {
        all_algos.push_back(&algo);
    }
    for (auto&& algo : int8_chwn4_imma_reorder_filter) {
        all_algos.push_back(&algo);
    }
    for (auto&& algo : int8_chwn4_imma_unroll_width) {
        all_algos.push_back(&algo);
    }
#if CUDA_VERSION >= 10020
    for (auto&& algo : int8_nchw32_imma) {
        all_algos.push_back(&algo);
    }
#endif
#endif
    fill_dp4a_algos();
    for (auto&& algo : int8_nchw4_dotprod) {
        all_algos.push_back(&algo);
    }
    all_algos.push_back(&int8_chwn4_dotprod);
    for (size_t i = all_algo_size; i < all_algos.size(); ++i) {
        non_cudnn_algos.push_back(all_algos[i]);
    }
}

ConvBiasForwardImpl::AlgoPack ConvBiasForwardImpl::sm_algo_pack;

ConvBiasForwardImpl::AlgoBase::SizeArgs::SizeArgs(
        ConvBiasForwardImpl* o, const TensorLayout& src,
        const TensorLayout& filter, const TensorLayout& bias,
        const TensorLayout& z, const TensorLayout& dst,
        const PreprocessedFilter* preprocessed_filter)
        : SizeArgs(o, src, filter, o->check_layout_fwd(src, filter, dst), bias,
                   z, dst, preprocessed_filter) {}

ConvBiasForwardImpl::AlgoBase::SizeArgs::SizeArgs(
        ConvBiasForwardImpl* o, const TensorLayout& src,
        const TensorLayout& filter, const CanonizedFilterMeta& filter_meta,
        const TensorLayout& bias, const TensorLayout& z,
        const TensorLayout& dst, const PreprocessedFilter* preprocessed_filter)
        : BiasForwardSizeArgs{concrete_handle(o->handle()),
                              &src,
                              &filter,
                              &bias,
                              &z,
                              filter_meta,
                              &dst,
                              o->param().nonlineMode},
          opr{o},
          preprocessed_filter{preprocessed_filter} {}

ConvBiasForwardImpl::AlgoBase::ExecArgs::ExecArgs(
        ConvBiasForwardImpl* opr, _megdnn_tensor_in src,
        _megdnn_tensor_in filter, _megdnn_tensor_in bias, _megdnn_tensor_in z,
        _megdnn_tensor_out dst, _megdnn_workspace workspace,
        const PreprocessedFilter* preprocessed_filter)
        : SizeArgs(opr, src.layout, filter.layout, bias.layout, z.layout,
                   dst.layout, preprocessed_filter),
          src_tensor{&src},
          filter_tensor{&filter},
          bias_tensor{&bias},
          z_tensor{&z},
          dst_tensor{&dst},
          workspace{workspace} {}

std::string ConvBiasForwardImpl::AlgoBase::SizeArgs::to_string() const {
    auto&& fm = filter_meta;
    MEGDNN_MARK_USED_VAR(fm);
    std::string nonlinear_mode_str;
    switch (nonlinear_mode) {
        case param::ConvBias::NonlineMode::RELU:
            nonlinear_mode_str = "RELU";
            break;
        case param::ConvBias::NonlineMode::SIGMOID:
            nonlinear_mode_str = "SIGMOID";
            break;
        case param::ConvBias::NonlineMode::IDENTITY:
            nonlinear_mode_str = "IDENTITY";
            break;
        default:
            megdnn_throw("invalid conv bias nonlinear mode");
    }
    return megdnn_mangle(ssprintf(
            "src=%s, filter=%u{%u,%u,%u,%u}, dst=%s, "
            "pad=%ux%u, stride=%ux%u, dilate=%ux%u, xcorr=%d, dtype=%s,%s, "
            "nonlinear_mode=%s",
            src_layout->to_string().c_str(), fm.group, fm.ocpg, fm.icpg,
            fm.spatial[0], fm.spatial[1], dst_layout->to_string().c_str(),
            fm.padding[0], fm.padding[1], fm.stride[0], fm.stride[1],
            fm.dilation[0], fm.dilation[1], !fm.should_flip,
            src_layout->dtype.name(), dst_layout->dtype.name(),
            nonlinear_mode_str.c_str()));
}
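
//! DEF_ALGO registers each cuDNN forward algorithm enum twice: once as a fused
//! ConvBiasActivation algorithm and once as a plain Convolution algorithm. The
//! cuDNN version is embedded in the algorithm name, presumably so that cached
//! algorithm selections are not reused across different cuDNN releases.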
void ConvBiasForwardImpl::AlgoPack::fill_cudnn_algos() {
#define V1(v) #v
#define V(v) V1(v)
#define DEF_ALGO(NAME, REPROD)                                              \
    cudnn_conv_bias_activations.push_back(                                  \
            {REPROD,                                                        \
             "CUDNN:ConvBiasActivation:" #NAME                              \
             "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL), \
             NAME});                                                        \
    cudnn_convs.push_back(                                                  \
            {REPROD,                                                        \
             "CUDNN:Convolution:" #NAME                                     \
             "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL), \
             NAME})

    DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, true);
    DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, true);
    DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_GEMM, true);
    DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_DIRECT, true);
    DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_FFT, true);
    DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING, true);
#if CUDNN_MAJOR >= 5
    DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD, true);
#if CUDNN_MAJOR >= 6 || CUDNN_MINOR >= 1
    DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED, true);
#endif
#endif

#if !(CUDNN_MAJOR >= 6 || CUDNN_MINOR >= 1)
#pragma message "not latest cudnn"
#endif

#undef DEF_ALGO
#undef V
#undef V1
}

#if CUDA_VERSION >= 10000
void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() {
    int8_chwn4_imma.push_back(
            {AlgoInt8CHWN4IMMAImplicitGemm::MMATileSize::IMMA16x16x16});
    int8_chwn4_imma.push_back(
            {AlgoInt8CHWN4IMMAImplicitGemm::MMATileSize::IMMA32x8x16});
    int8_chwn4_imma.push_back(
            {AlgoInt8CHWN4IMMAImplicitGemm::MMATileSize::IMMA8x32x16});
    int8_nchw4_imma.push_back(
            {AlgoInt8NCHW4IMMAImplicitGemm::MMATileSize::IMMA16x16x16});
    int8_nchw4_imma.push_back(
            {AlgoInt8NCHW4IMMAImplicitGemm::MMATileSize::IMMA32x8x16});
    int8_nchw4_imma.push_back(
            {AlgoInt8NCHW4IMMAImplicitGemm::MMATileSize::IMMA8x32x16});
    int8_chwn4_imma_reorder_filter.push_back(
            {AlgoInt8CHWN4IMMAImplicitGemmReorderFilter::MMATileSize::
                     IMMA16x16x16});
    int8_chwn4_imma_reorder_filter.push_back(
            {AlgoInt8CHWN4IMMAImplicitGemmReorderFilter::MMATileSize::
                     IMMA32x8x16});
    int8_chwn4_imma_reorder_filter.push_back(
            {AlgoInt8CHWN4IMMAImplicitGemmReorderFilter::MMATileSize::
                     IMMA8x32x16});
    int8_chwn4_imma_unroll_width.push_back(
            {AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth::MMATileSize::
                     IMMA16x16x16});
    int8_chwn4_imma_unroll_width.push_back(
            {AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth::MMATileSize::
                     IMMA32x8x16});
    int8_chwn4_imma_unroll_width.push_back(
            {AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth::MMATileSize::
                     IMMA8x32x16});
#if CUDA_VERSION >= 10020
    {
        using AlgoParam = AlgoInt8NCHW32IMMAImplicitGemm::AlgoParam;
        int8_nchw32_imma.emplace_back(AlgoParam{128, 256, 64, 64, 64, 64});
        int8_nchw32_imma.emplace_back(AlgoParam{256, 128, 64, 64, 64, 64});
        int8_nchw32_imma.emplace_back(AlgoParam{128, 128, 64, 64, 64, 64});
        int8_nchw32_imma.emplace_back(AlgoParam{64, 128, 64, 32, 64, 64});
        int8_nchw32_imma.emplace_back(AlgoParam{128, 64, 64, 64, 32, 64});
        int8_nchw32_imma.emplace_back(AlgoParam{64, 64, 64, 32, 32, 64});
        int8_nchw32_imma.emplace_back(AlgoParam{32, 64, 64, 32, 16, 64});
    }
#endif
}
#endif
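
//! each AlgoParam below is a tile-size configuration for the int8 dp4a
//! implicit-GEMM kernels; the six values are assumed to be threadblock and warp
//! tile shapes (the exact field order is defined by
//! AlgoInt8NCHW4DotProdImplicitGemm::AlgoParam in algo.h).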
void ConvBiasForwardImpl::AlgoPack::fill_dp4a_algos() {
    using AlgoParam = AlgoInt8NCHW4DotProdImplicitGemm::AlgoParam;
    int8_nchw4_dotprod.emplace_back(AlgoParam{128, 128, 32, 64, 32, 32});
    int8_nchw4_dotprod.emplace_back(AlgoParam{128, 64, 32, 64, 32, 32});
    int8_nchw4_dotprod.emplace_back(AlgoParam{64, 128, 32, 64, 32, 32});
    int8_nchw4_dotprod.emplace_back(AlgoParam{32, 128, 32, 32, 64, 32});
    int8_nchw4_dotprod.emplace_back(AlgoParam{128, 32, 32, 64, 32, 32});
    int8_nchw4_dotprod.emplace_back(AlgoParam{64, 64, 32, 64, 32, 32});
    int8_nchw4_dotprod.emplace_back(AlgoParam{32, 64, 32, 32, 64, 32});
    int8_nchw4_dotprod.emplace_back(AlgoParam{64, 32, 32, 64, 32, 32});
    int8_nchw4_dotprod.emplace_back(AlgoParam{32, 32, 32, 32, 32, 32});
    int8_nchw4_dotprod.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8});
}

ConvBiasForwardImpl::AlgoBase*
ConvBiasForwardImpl::AlgoPack::cudnn_conv_from_enum(
        cudnnConvolutionFwdAlgo_t algo) {
    for (auto&& i : cudnn_convs) {
        if (i.cudnn_enum() == algo)
            return &i;
    }
    megdnn_throw(
            megdnn_mangle(ssprintf("can not find cudnn conv fwd algorithm %d",
                                   static_cast<int>(algo))));
}

ConvBiasForwardImpl::AlgoBase*
ConvBiasForwardImpl::AlgoPack::cudnn_conv_bias_act_from_enum(
        cudnnConvolutionFwdAlgo_t algo) {
    for (auto&& i : cudnn_conv_bias_activations) {
        if (i.cudnn_enum() == algo)
            return &i;
    }
    megdnn_throw(megdnn_mangle(
            ssprintf("can not find cudnn conv bias act algorithm %d",
                     static_cast<int>(algo))));
}

// vim: syntax=cpp.doxygen

The MegEngine installation package bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU or GPU build. If you want to run GPU programs, make sure the machine has a GPU installed along with a working driver. If you would like to try deep-learning development on a cloud GPU platform, you are welcome to visit the MegStudio platform.
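
As a quick sanity check that a GPU and driver are actually visible before running GPU programs, the following is a minimal standalone sketch using the raw CUDA runtime API (not MegEngine's own check; the file name check_cuda.cu is arbitrary, and it is built with nvcc):

// check_cuda.cu -- asks the CUDA runtime how many devices are visible.
// Build: nvcc check_cuda.cu -o check_cuda
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    int count = 0;
    cudaError_t err = cudaGetDeviceCount(&count);
    if (err != cudaSuccess || count == 0) {
        // either the driver/runtime is missing or no device is present
        std::printf("no usable CUDA device: %s\n", cudaGetErrorString(err));
        return 1;
    }
    for (int i = 0; i < count; ++i) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        std::printf("device %d: %s (compute capability %d.%d)\n", i, prop.name,
                    prop.major, prop.minor);
    }
    return 0;
}

If this prints at least one device, the driver side is in place and MegEngine's GPU build should be able to use it.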