
algos.cpp

/**
 * \file dnn/src/arm_common/conv_bias/f16/algos.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#include "src/arm_common/conv_bias/f16/algos.h"
#include "src/arm_common/conv_bias/direct/multi_thread_common.h"
#include "src/arm_common/conv_bias/f16/direct.h"
#include "src/arm_common/conv_bias/f16/do_conv_stride1.h"
#include "src/arm_common/conv_bias/f16/strategy.h"
#include "src/arm_common/conv_bias/img2col_helper.h"
#include "src/arm_common/conv_bias/postprocess_helper.h"
#include "src/common/opr_delegate.h"
#include "src/fallback/conv_bias/common.h"

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include "midout.h"
MIDOUT_DECL(megdnn_arm_common_winograd_fp16)

using namespace megdnn;
using namespace arm_common;

/* ======================= AlgoFP16WinogradF23 ======================== */
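//! F(2, 3): each 4x4 input tile (2 + 3 - 1 = 4) yields a 2x2 output tile
//! from a 3x3 filter using 16 multiplications instead of the direct 36.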
bool ConvBiasImpl::AlgoFP16WinogradF23::usable(
        fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param,
        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
    MEGDNN_MARK_USED_VAR(param);
    MEGDNN_MARK_USED_VAR(opr);
    MIDOUT_BEGIN(megdnn_arm_common_winograd_fp16, 0, 0) {
        using Strategy = winograd::winograd_2x3_4x4_f16;
        Strategy strategy(param.src_type, param.filter_type, param.dst_type);
        auto&& matmul_param =
                megdnn::winograd::ConvBias<Strategy>(
                        strategy, m_tile_size, param.nr_threads, param.osz[0],
                        param.osz[1], param.filter_meta.ocpg)
                        .get_matmul_kern_param(param);
        return m_matmul_algo->usable(matmul_param) &&
               (opr->param().format == param::ConvBias::Format::NCHW ||
                (opr->param().format ==
                         param::ConvBias::Format::NCHW_WINOGRAD &&
                 opr->param().output_block_size == 2 &&
                 param.winograd_matmul_format ==
                         param::MatrixMul::Format::DEFAULT)) &&
               opr->param().mode == param::ConvBias::Mode::CROSS_CORRELATION &&
               (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
                param.filter_meta.spatial[0] == 3) &&
               (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
                param.filter_meta.stride[0] == 1) &&
               (param.filter_meta.dilation[0] ==
                        param.filter_meta.dilation[1] &&
                param.filter_meta.dilation[0] == 1) &&
               param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
               param.src_type.enumv() == DTypeEnum::Float16 &&
               param.filter_meta.icpg % 4 == 0 &&
               param.filter_meta.ocpg % 4 == 0;
    }
    MIDOUT_END();
    return false;
}
size_t ConvBiasImpl::AlgoFP16WinogradF23::get_workspace(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
    MEGDNN_MARK_USED_VAR(param);
    MIDOUT_BEGIN(megdnn_arm_common_winograd_fp16, 0, 1) {
        winograd::winograd_2x3_4x4_f16 strategy(
                param.src_type, param.filter_type, param.dst_type);
        return megdnn::winograd::ConvBias<winograd::winograd_2x3_4x4_f16>(
                       strategy, m_tile_size, param.nr_threads, param.osz[0],
                       param.osz[1], param.filter_meta.ocpg)
                .get_workspace_size(param, m_matmul_algo);
    }
    MIDOUT_END();
    return 0;
}

SmallVector<ConvBiasImpl::NCBKern>
ConvBiasImpl::AlgoFP16WinogradF23::dispatch_kerns(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
    MEGDNN_MARK_USED_VAR(param);
    MIDOUT_BEGIN(megdnn_arm_common_winograd_fp16, 0, 2) {
        winograd::winograd_2x3_4x4_f16 strategy(
                param.src_type, param.filter_type, param.dst_type);
        auto winograd_impl =
                megdnn::winograd::ConvBias<winograd::winograd_2x3_4x4_f16>(
                        strategy, m_tile_size, param.nr_threads, param.osz[0],
                        param.osz[1], param.filter_meta.ocpg);
        return winograd_impl.get_kerns(param, m_matmul_algo);
    }
    MIDOUT_END();
    return {};
}
/* ======================= AlgoFP16WinogradF45 ======================== */
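//! F(4, 5): a 5x5 filter yields a 4x4 output tile from an 8x8 input tile
//! (4 + 5 - 1 = 8), amortizing the transforms over more outputs.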
bool ConvBiasImpl::AlgoFP16WinogradF45::usable(
        fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param,
        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
    MEGDNN_MARK_USED_VAR(param);
    MEGDNN_MARK_USED_VAR(opr);
    MIDOUT_BEGIN(megdnn_arm_common_winograd_fp16, 1, 0) {
        using Strategy = winograd::winograd_4x5_1x1_f16;
        Strategy strategy(param.src_type, param.filter_type, param.dst_type);
        auto&& matmul_param =
                megdnn::winograd::ConvBias<Strategy>(
                        strategy, m_tile_size, param.nr_threads, param.osz[0],
                        param.osz[1], param.filter_meta.ocpg)
                        .get_matmul_kern_param(param);
        return m_matmul_algo->usable(matmul_param) &&
               (opr->param().format == param::ConvBias::Format::NCHW ||
                (opr->param().format ==
                         param::ConvBias::Format::NCHW_WINOGRAD &&
                 opr->param().output_block_size == 4 &&
                 param.winograd_matmul_format ==
                         param::MatrixMul::Format::DEFAULT)) &&
               opr->param().mode == param::ConvBias::Mode::CROSS_CORRELATION &&
               (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
                param.filter_meta.spatial[0] == 5) &&
               (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
                param.filter_meta.stride[0] == 1) &&
               (param.filter_meta.dilation[0] ==
                        param.filter_meta.dilation[1] &&
                param.filter_meta.dilation[0] == 1) &&
               param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
               param.src_type.enumv() == DTypeEnum::Float16;
    }
    MIDOUT_END();
    return false;
}

size_t ConvBiasImpl::AlgoFP16WinogradF45::get_workspace(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
    MEGDNN_MARK_USED_VAR(param);
    winograd::winograd_4x5_1x1_f16 strategy(param.src_type, param.filter_type,
                                            param.dst_type);
    MIDOUT_BEGIN(megdnn_arm_common_winograd_fp16, 1, 1) {
        return megdnn::winograd::ConvBias<winograd::winograd_4x5_1x1_f16>(
                       strategy, m_tile_size, param.nr_threads, param.osz[0],
                       param.osz[1], param.filter_meta.ocpg)
                .get_workspace_size(param, m_matmul_algo);
    }
    MIDOUT_END();
    return 0;
}

SmallVector<ConvBiasImpl::NCBKern>
ConvBiasImpl::AlgoFP16WinogradF45::dispatch_kerns(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
    MEGDNN_MARK_USED_VAR(param);
    MIDOUT_BEGIN(megdnn_arm_common_winograd_fp16, 1, 2) {
        winograd::winograd_4x5_1x1_f16 strategy(
                param.src_type, param.filter_type, param.dst_type);
        auto winograd_impl =
                megdnn::winograd::ConvBias<winograd::winograd_4x5_1x1_f16>(
                        strategy, m_tile_size, param.nr_threads, param.osz[0],
                        param.osz[1], param.filter_meta.ocpg);
        return winograd_impl.get_kerns(param, m_matmul_algo);
    }
    MIDOUT_END();
    return {};
}
/* ======================= AlgoFP16WinogradF63 ======================== */
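//! F(6, 3): a 3x3 filter yields a 6x6 output tile from an 8x8 input tile
//! (6 + 3 - 1 = 8); larger tiles cut multiplications further but are
//! numerically less forgiving in fp16.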
bool ConvBiasImpl::AlgoFP16WinogradF63::usable(
        fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param,
        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
    MEGDNN_MARK_USED_VAR(param);
    MEGDNN_MARK_USED_VAR(opr);
    MIDOUT_BEGIN(megdnn_arm_common_winograd_fp16, 2, 0) {
        using Strategy = winograd::winograd_6x3_1x1_f16;
        Strategy strategy(param.src_type, param.filter_type, param.dst_type);
        auto&& matmul_param =
                megdnn::winograd::ConvBias<Strategy>(
                        strategy, m_tile_size, param.nr_threads, param.osz[0],
                        param.osz[1], param.filter_meta.ocpg)
                        .get_matmul_kern_param(param);
        return m_matmul_algo->usable(matmul_param) &&
               (opr->param().format == param::ConvBias::Format::NCHW ||
                (opr->param().format ==
                         param::ConvBias::Format::NCHW_WINOGRAD &&
                 opr->param().output_block_size == 6 &&
                 param.winograd_matmul_format ==
                         param::MatrixMul::Format::DEFAULT)) &&
               opr->param().mode == param::ConvBias::Mode::CROSS_CORRELATION &&
               (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
                param.filter_meta.spatial[0] == 3) &&
               (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
                param.filter_meta.stride[0] == 1) &&
               (param.filter_meta.dilation[0] ==
                        param.filter_meta.dilation[1] &&
                param.filter_meta.dilation[0] == 1) &&
               param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
               param.src_type.enumv() == DTypeEnum::Float16;
    }
    MIDOUT_END();
    return false;
}

size_t ConvBiasImpl::AlgoFP16WinogradF63::get_workspace(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
    MEGDNN_MARK_USED_VAR(param);
    winograd::winograd_6x3_1x1_f16 strategy(param.src_type, param.filter_type,
                                            param.dst_type);
    MIDOUT_BEGIN(megdnn_arm_common_winograd_fp16, 2, 1) {
        return megdnn::winograd::ConvBias<winograd::winograd_6x3_1x1_f16>(
                       strategy, m_tile_size, param.nr_threads, param.osz[0],
                       param.osz[1], param.filter_meta.ocpg)
                .get_workspace_size(param, m_matmul_algo);
    }
    MIDOUT_END();
    return 0;
}

SmallVector<ConvBiasImpl::NCBKern>
ConvBiasImpl::AlgoFP16WinogradF63::dispatch_kerns(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
    MEGDNN_MARK_USED_VAR(param);
    MIDOUT_BEGIN(megdnn_arm_common_winograd_fp16, 2, 2) {
        winograd::winograd_6x3_1x1_f16 strategy(
                param.src_type, param.filter_type, param.dst_type);
        auto winograd_impl =
                megdnn::winograd::ConvBias<winograd::winograd_6x3_1x1_f16>(
                        strategy, m_tile_size, param.nr_threads, param.osz[0],
                        param.osz[1], param.filter_meta.ocpg);
        return winograd_impl.get_kerns(param, m_matmul_algo);
    }
    MIDOUT_END();
    return {};
}
/* ======================= AlgoFP16WinogradF23_8x8 ======================== */
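//! Same F(2, 3) tiling as above, but the batched matmul uses
//! param::MatrixMul::Format::MK8 (operands packed in blocks of 8 channels),
//! hence the icpg % 8 == 0 and ocpg % 8 == 0 requirement below.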
bool ConvBiasImpl::AlgoFP16WinogradF23_8x8::usable(
        fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param,
        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
    MEGDNN_MARK_USED_VAR(param);
    MEGDNN_MARK_USED_VAR(opr);
    MIDOUT_BEGIN(megdnn_arm_common_winograd_fp16, 3, 0) {
        if (param.filter_meta.icpg % 8 != 0 || param.filter_meta.ocpg % 8 != 0)
            return false;
        using Strategy = winograd::winograd_2x3_8x8_f16;
        Strategy strategy(param.src_type, param.filter_type, param.dst_type);
        auto&& matmul_param =
                megdnn::winograd::ConvBias<Strategy,
                                           param::MatrixMul::Format::MK8>(
                        strategy, m_tile_size, param.nr_threads, param.osz[0],
                        param.osz[1], param.filter_meta.ocpg)
                        .get_matmul_kern_param(param);
        return m_matmul_algo->usable(matmul_param) &&
               (opr->param().format == param::ConvBias::Format::NCHW ||
                (opr->param().format ==
                         param::ConvBias::Format::NCHW_WINOGRAD &&
                 opr->param().output_block_size == 2 &&
                 param.winograd_matmul_format ==
                         param::MatrixMul::Format::MK8)) &&
               opr->param().mode == param::ConvBias::Mode::CROSS_CORRELATION &&
               (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
                param.filter_meta.spatial[0] == 3) &&
               (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
                param.filter_meta.stride[0] == 1) &&
               (param.filter_meta.dilation[0] ==
                        param.filter_meta.dilation[1] &&
                param.filter_meta.dilation[0] == 1) &&
               param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
               param.src_type.enumv() == DTypeEnum::Float16;
    }
    MIDOUT_END();
    return false;
}

size_t ConvBiasImpl::AlgoFP16WinogradF23_8x8::get_workspace(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
    MEGDNN_MARK_USED_VAR(param);
    MIDOUT_BEGIN(megdnn_arm_common_winograd_fp16, 3, 1) {
        winograd::winograd_2x3_8x8_f16 strategy(
                param.src_type, param.filter_type, param.dst_type);
        return megdnn::winograd::ConvBias<winograd::winograd_2x3_8x8_f16,
                                          param::MatrixMul::Format::MK8>(
                       strategy, m_tile_size, param.nr_threads, param.osz[0],
                       param.osz[1], param.filter_meta.ocpg)
                .get_workspace_size(param, m_matmul_algo);
    }
    MIDOUT_END();
    return 0;
}

SmallVector<ConvBiasImpl::NCBKern>
ConvBiasImpl::AlgoFP16WinogradF23_8x8::dispatch_kerns(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
    MEGDNN_MARK_USED_VAR(param);
    MIDOUT_BEGIN(megdnn_arm_common_winograd_fp16, 3, 2) {
        winograd::winograd_2x3_8x8_f16 strategy(
                param.src_type, param.filter_type, param.dst_type);
        auto winograd_impl =
                megdnn::winograd::ConvBias<winograd::winograd_2x3_8x8_f16,
                                           param::MatrixMul::Format::MK8>(
                        strategy, m_tile_size, param.nr_threads, param.osz[0],
                        param.osz[1], param.filter_meta.ocpg);
        return winograd_impl.get_kerns(param, m_matmul_algo);
    }
    MIDOUT_END();
    return {};
}
/* ======================== from Convolution ============================= */
MIDOUT_DECL(megdnn_arm_common_conv_bias_fp16_kimpl)
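//! midout wraps each kernel in MIDOUT_BEGIN/MIDOUT_END so that trimmed
//! builds, driven by a recorded trace, can drop kernels that never ran.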
bool ConvBiasImpl::AlgoF16Direct::usable(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param,
        AlgoSelectionStrategy algo_selection_strategy) const {
    MIDOUT_BEGIN(megdnn_arm_common_conv_bias_fp16_kimpl, 0, 0) {
        auto&& fm = param.filter_meta;
        auto FH = fm.spatial[0];
        auto SH = fm.stride[0], SW = fm.stride[1];
        // the conditions ``param.isz[0] * param.isz[1] >= 8'' and
        // ``param.osz[0] * param.osz[1] >= 8'' come from the fact that the
        // kernel may read up to 8 fp16 values past the end of the memory
        // chunk.
        bool available = fm.format == param::ConvBias::Format::NCHW &&
                         param.src_type.enumv() == DTypeEnum::Float16 &&
                         param.filter_type.enumv() == DTypeEnum::Float16 &&
                         param.dst_type.enumv() == DTypeEnum::Float16 &&
                         fm.spatial_ndim == 2 && fm.dilation[0] == 1 &&
                         fm.dilation[1] == 1 &&
                         param.isz[0] * param.isz[1] >= 8 &&
                         param.osz[0] * param.osz[1] >= 8 && FH <= 7 &&
                         SH == 1 && SW == 1;
        if (algo_selection_strategy == AlgoSelectionStrategy::HEURISTIC) {
            bool large_group = param.filter_meta.group >= param.nr_threads;
            available &= (large_group == m_large_group);
        }
        return available;
    }
    MIDOUT_END();
    return false;
}
size_t ConvBiasImpl::AlgoF16Direct::get_workspace(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
    MIDOUT_BEGIN(megdnn_arm_common_conv_bias_fp16_kimpl, 0, 1) {
        auto wbundle =
                MultithreadDirectConvCommon<dt_float16, __fp16>::get_bundle(
                        param, m_large_group);
        return wbundle.total_size_in_bytes();
    }
    MIDOUT_END();
    return 0;
}
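//! The bundle above groups the scratch buffers (e.g. the padded input) that
//! the kernels need; total_size_in_bytes() is what callers must allocate.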
SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16Direct::get_kimpls(
        const NCBKernSizeParam& param) const {
    auto fm = param.filter_meta;
    size_t N = param.n;
    size_t IC = param.filter_meta.icpg;
    size_t OC = param.filter_meta.ocpg;
    size_t group = fm.group;
    WorkspaceBundle wbundle =
            MultithreadDirectConvCommon<dt_float16, __fp16>::get_bundle(
                    param, m_large_group);
    SmallVector<NCBKern> ret_kerns;
    //! When group >= nr_threads, treat it as large_group: each thread
    //! processes one group for better performance.
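    //! In the large_group branch below, the dispatch range {group, N, 1}
    //! launches one kernel invocation per (group, batch) pair; weight flip,
    //! padding copy and the convolution all run inside that invocation.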
    if (m_large_group) {
        //! Channel-wise conv and big groups
        auto exec_one_group = [wbundle](const NCBKernParam& kern_param,
                                        const NCBKernIndex& ncb_index) {
            auto fm = kern_param.filter_meta;
            size_t IC = fm.icpg;
            size_t OC = fm.ocpg;
            WorkspaceBundle bundle = wbundle;
            if (fm.should_flip) {
                for (size_t oc = 0; oc < OC; oc++) {
                    MultithreadDirectConvCommon<dt_float16, __fp16>::
                            weight_flip_kern(bundle, kern_param, ncb_index,
                                             {ncb_index.thread_id, 0, oc});
                }
            }
            for (size_t ic = 0; ic < IC; ic++) {
                MultithreadDirectConvCommon<dt_float16, __fp16>::
                        copy_padding_kern(bundle, kern_param, ncb_index,
                                          {ncb_index.thread_id, 0, ic});
            }
            for (size_t oc = 0; oc < OC; oc++) {
                MultithreadDirectConvCommon<dt_float16, __fp16>::do_conv_kern(
                        bundle, kern_param, ncb_index,
                        fp16::conv_bias::kern_direct_f16,
                        {ncb_index.thread_id, 0, oc});
            }
        };
        ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
    } else {
        WorkspaceBundle bundle = wbundle;
        if (fm.should_flip) {
            auto weight_flip = [bundle](const NCBKernParam& kern_param,
                                        const NCBKernIndex& ncb_index) {
                MultithreadDirectConvCommon<dt_float16, __fp16>::
                        weight_flip_kern(bundle, kern_param, ncb_index,
                                         ncb_index.ndrange_id);
            };
            ret_kerns.push_back({weight_flip, {group, 1_z, OC}});
        }
        auto copy_padding = [bundle](const NCBKernParam& kern_param,
                                     const NCBKernIndex& ncb_index) {
            MultithreadDirectConvCommon<dt_float16, __fp16>::copy_padding_kern(
                    bundle, kern_param, ncb_index, ncb_index.ndrange_id);
        };
        ret_kerns.push_back({copy_padding, {group, N, IC}});
        auto do_conv = [bundle](const NCBKernParam& kern_param,
                                const NCBKernIndex& ncb_index) {
            MultithreadDirectConvCommon<dt_float16, __fp16>::do_conv_kern(
                    bundle, kern_param, ncb_index,
                    fp16::conv_bias::kern_direct_f16, ncb_index.ndrange_id);
        };
        ret_kerns.push_back({do_conv, {group, N, OC}});
    }
    return ret_kerns;
}

SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16Direct::dispatch_kerns(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
    MIDOUT_BEGIN(megdnn_arm_common_conv_bias_fp16_kimpl, 0, 1) {
        return get_kimpls(param);
    }
    MIDOUT_END();
    return {};
}
/* ===================== stride-1 algo ===================== */
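//! Direct convolution specialized for stride 1 with square 2x2/3x3/5x5
//! filters; SWITCH_KERN() in get_kimpls() picks the matching kernel.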
bool ConvBiasImpl::AlgoF16DirectStride1::usable(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param,
        AlgoSelectionStrategy algo_selection_strategy) const {
    MIDOUT_BEGIN(megdnn_arm_common_conv_bias_fp16_kimpl, 1, 0) {
        auto&& fm = param.filter_meta;
        auto FH = fm.spatial[0];
        bool available =
                param.filter_meta.format == param::ConvBias::Format::NCHW &&
                param.src_type.enumv() == DTypeEnum::Float16 &&
                param.filter_type.enumv() == DTypeEnum::Float16 &&
                param.dst_type.enumv() == DTypeEnum::Float16 &&
                !fm.should_flip && fm.spatial_ndim == 2 &&
                fm.dilation[0] == 1 && fm.dilation[1] == 1 &&
                fm.stride[0] == 1 && fm.stride[1] == 1 && FH == fm.spatial[1] &&
                (FH == 2 || FH == 3 || FH == 5);
        if (algo_selection_strategy ==
            ConvBiasImpl::AlgoSelectionStrategy::HEURISTIC) {
            bool large_group = param.filter_meta.group >= param.nr_threads;
            available &= (large_group == m_large_group);
        }
        return available;
    }
    MIDOUT_END();
    return false;
}
SmallVector<ConvBiasImpl::NCBKern>
ConvBiasImpl::AlgoF16DirectStride1::get_kimpls(
        const NCBKernSizeParam& param) const {
    auto fm = param.filter_meta;
    auto FH = fm.spatial[0];
    size_t N = param.n;
    size_t IC = param.filter_meta.icpg;
    size_t OC = param.filter_meta.ocpg;
    size_t group = fm.group;
    using Func = std::function<void(const __fp16*, const __fp16*, __fp16*,
                                    size_t, size_t, size_t, size_t, size_t)>;
    Func conv_kern_function = nullptr;
#define SWITCH_KERN()                                                     \
    switch (FH) {                                                         \
        case 2:                                                           \
            conv_kern_function = fp16::conv_stride1::do_conv_2x2_stride1; \
            break;                                                        \
        case 3:                                                           \
            conv_kern_function = fp16::conv_stride1::do_conv_3x3_stride1; \
            break;                                                        \
        case 5:                                                           \
            conv_kern_function = fp16::conv_stride1::do_conv_5x5_stride1; \
            break;                                                        \
    }
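    //! usable() guarantees FH is 2, 3 or 5, so SWITCH_KERN() below always
    //! assigns conv_kern_function; other filter sizes never reach this algo.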
    SWITCH_KERN();
    WorkspaceBundle wbundle =
            MultithreadDirectConvCommon<dt_float16, __fp16>::get_bundle_stride(
                    param, m_large_group);
    SmallVector<NCBKern> ret_kerns;
    //! When group >= nr_threads, treat it as large_group: each thread
    //! processes one group for better performance.
    if (m_large_group) {
        //! Channel-wise conv and big groups
        auto exec_one_group = [wbundle, conv_kern_function](
                                      const NCBKernParam& kern_param,
                                      const NCBKernIndex& ncb_index) {
            auto fm = kern_param.filter_meta;
            size_t IC = fm.icpg;
            size_t OC = fm.ocpg;
            WorkspaceBundle bundle = wbundle;
            for (size_t ic = 0; ic < IC; ic++) {
                MultithreadDirectConvCommon<dt_float16, __fp16>::
                        copy_padding_kern_stride(bundle, kern_param, ncb_index,
                                                 {ncb_index.thread_id, 0, ic});
            }
            for (size_t oc = 0; oc < OC; oc++) {
                MultithreadDirectConvCommon<dt_float16, __fp16>::
                        do_conv_kern_stride(bundle, kern_param, ncb_index,
                                            conv_kern_function,
                                            {ncb_index.thread_id, 0, oc});
            }
        };
        ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
    } else {
        WorkspaceBundle bundle = wbundle;
        auto copy_padding = [bundle](const NCBKernParam& kern_param,
                                     const NCBKernIndex& ncb_index) {
            MultithreadDirectConvCommon<dt_float16, __fp16>::
                    copy_padding_kern_stride(bundle, kern_param, ncb_index,
                                             ncb_index.ndrange_id);
        };
        ret_kerns.push_back({copy_padding, {group, N, IC}});
        auto do_conv = [bundle, conv_kern_function](
                               const NCBKernParam& kern_param,
                               const NCBKernIndex& ncb_index) {
            MultithreadDirectConvCommon<dt_float16, __fp16>::
                    do_conv_kern_stride(bundle, kern_param, ncb_index,
                                        conv_kern_function,
                                        ncb_index.ndrange_id);
        };
        ret_kerns.push_back({do_conv, {group, N, OC}});
    }
    return ret_kerns;
}

size_t ConvBiasImpl::AlgoF16DirectStride1::get_workspace(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
    MIDOUT_BEGIN(megdnn_arm_common_conv_bias_fp16_kimpl, 1, 1) {
        auto bundle = MultithreadDirectConvCommon<
                dt_float16, __fp16>::get_bundle_stride(param, m_large_group);
        return bundle.total_size_in_bytes();
    }
    MIDOUT_END();
    return 0;
}

SmallVector<ConvBiasImpl::NCBKern>
ConvBiasImpl::AlgoF16DirectStride1::dispatch_kerns(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
    MIDOUT_BEGIN(megdnn_arm_common_conv_bias_fp16_kimpl, 1, 2) {
        return get_kimpls(param);
    }
    MIDOUT_END();
    return {};
}

#endif

// vim: syntax=cpp.doxygen
