
algos.cpp 6.5 kB

/**
 * \file dnn/src/cuda/matrix_mul/algos.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#include "./algos.h"
#include "src/cuda/utils.h"
#include "src/common/algo_base.h"

#include <cuda.h>
#if CUDA_VERSION >= 10010
#include <cublasLt.h>
#endif

using namespace megdnn;
using namespace cuda;

MatrixMulForwardImpl::AlgoPack::AlgoPack() {
    // Register every candidate algorithm; which ones are available depends on
    // the CUDA toolkit version this file is compiled against.
    all_algos.push_back(&cublas);
#if CUDA_VERSION >= 10000
    all_algos.push_back(&wmma_uint4x4x32);
#endif
#if CUDA_VERSION >= 10010
    all_algos.push_back(&cublas_lt);
#endif
#if !MEGDNN_DISABLE_FLOAT16
    all_algos.push_back(&bfloat16);
#endif
#if CUDA_VERSION >= 9020
    fill_cutlass_algos();
    for (auto&& algo : simt_float32) {
        all_algos.push_back(&algo);
    }
    for (auto&& algo : simt_float32_split_k) {
        all_algos.push_back(&algo);
    }
    for (auto&& algo : simt_float32_gemv_batched_strided) {
        all_algos.push_back(&algo);
    }
    for (auto&& algo : tensorop_float16) {
        all_algos.push_back(&algo);
    }
    for (auto&& algo : tensorop_float16_split_k) {
        all_algos.push_back(&algo);
    }
#endif
    all_algos.push_back(&naive);

    // Index every registered algorithm by its descriptor for O(1) lookup.
    for (auto&& algo : all_algos) {
        m_all_algos_map.emplace(algo->info().desc, algo);
    }
}

#if CUDA_VERSION >= 9020
void MatrixMulForwardImpl::AlgoPack::fill_cutlass_algos() {
    using AlgoParam = AlgoCutlassMatrixMulBase::AlgoParam;
    // SIMT float32 kernels. Each AlgoParam gives the CUTLASS tile shapes:
    // {threadblock_m, threadblock_n, threadblock_k, warp_m, warp_n, warp_k}.
    simt_float32.emplace_back(AlgoParam{64, 256, 8, 32, 64, 8});
    simt_float32.emplace_back(AlgoParam{256, 64, 8, 64, 32, 8});
    simt_float32.emplace_back(AlgoParam{32, 256, 8, 16, 64, 8});
    simt_float32.emplace_back(AlgoParam{256, 32, 8, 64, 16, 8});
    simt_float32.emplace_back(AlgoParam{128, 128, 8, 32, 64, 8});
    simt_float32.emplace_back(AlgoParam{128, 64, 8, 64, 32, 8});
    simt_float32.emplace_back(AlgoParam{64, 128, 8, 32, 64, 8});
    simt_float32.emplace_back(AlgoParam{128, 32, 8, 64, 32, 8});
    simt_float32.emplace_back(AlgoParam{32, 128, 8, 32, 64, 8});
    simt_float32.emplace_back(AlgoParam{64, 64, 8, 32, 64, 8});
    simt_float32.emplace_back(AlgoParam{32, 64, 8, 32, 64, 8});
    simt_float32.emplace_back(AlgoParam{64, 32, 8, 64, 32, 8});
    simt_float32.emplace_back(AlgoParam{32, 32, 8, 32, 32, 8});
    simt_float32.emplace_back(AlgoParam{8, 32, 8, 8, 32, 8});
    simt_float32.emplace_back(AlgoParam{16, 32, 8, 16, 32, 8});
    simt_float32.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8});
    simt_float32.emplace_back(AlgoParam{16, 128, 8, 16, 64, 8});
    // The same tile shapes again, registered as split-k variants.
    simt_float32_split_k.emplace_back(AlgoParam{64, 256, 8, 32, 64, 8});
    simt_float32_split_k.emplace_back(AlgoParam{256, 64, 8, 64, 32, 8});
    simt_float32_split_k.emplace_back(AlgoParam{32, 256, 8, 16, 64, 8});
    simt_float32_split_k.emplace_back(AlgoParam{256, 32, 8, 64, 16, 8});
    simt_float32_split_k.emplace_back(AlgoParam{128, 128, 8, 32, 64, 8});
    simt_float32_split_k.emplace_back(AlgoParam{128, 64, 8, 64, 32, 8});
    simt_float32_split_k.emplace_back(AlgoParam{64, 128, 8, 32, 64, 8});
    simt_float32_split_k.emplace_back(AlgoParam{128, 32, 8, 64, 32, 8});
    simt_float32_split_k.emplace_back(AlgoParam{32, 128, 8, 32, 64, 8});
    simt_float32_split_k.emplace_back(AlgoParam{64, 64, 8, 32, 64, 8});
    simt_float32_split_k.emplace_back(AlgoParam{32, 64, 8, 32, 64, 8});
    simt_float32_split_k.emplace_back(AlgoParam{64, 32, 8, 64, 32, 8});
    simt_float32_split_k.emplace_back(AlgoParam{32, 32, 8, 32, 32, 8});
    simt_float32_split_k.emplace_back(AlgoParam{8, 32, 8, 8, 32, 8});
    simt_float32_split_k.emplace_back(AlgoParam{16, 32, 8, 16, 32, 8});
    simt_float32_split_k.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8});
    simt_float32_split_k.emplace_back(AlgoParam{16, 128, 8, 16, 64, 8});
    // Batched strided GEMV kernels, parameterized by threadblock size.
    simt_float32_gemv_batched_strided.emplace_back(128);
    simt_float32_gemv_batched_strided.emplace_back(64);
    simt_float32_gemv_batched_strided.emplace_back(32);
    // Tensor-op float16 kernels: threadblock/warp tiles plus the
    // {instruction_m, instruction_n, instruction_k} mma shape.
#define FOREACH_CUTLASS_MATMUL_F16_SHAPES(cb) \
    cb(256, 128, 32, 64, 64, 32, 8, 8, 4);    \
    cb(128, 256, 32, 64, 64, 32, 8, 8, 4);    \
    cb(128, 128, 32, 64, 64, 32, 8, 8, 4);    \
    cb(256, 128, 32, 64, 64, 32, 16, 8, 8);   \
    cb(128, 256, 32, 64, 64, 32, 16, 8, 8);   \
    cb(128, 128, 32, 64, 64, 32, 16, 8, 8);
#define cb(...)                                            \
    tensorop_float16.emplace_back(AlgoParam{__VA_ARGS__}); \
    tensorop_float16_split_k.emplace_back(AlgoParam{__VA_ARGS__});
    FOREACH_CUTLASS_MATMUL_F16_SHAPES(cb)
#undef cb
#undef FOREACH_CUTLASS_MATMUL_F16_SHAPES
}
#endif

MatrixMulForwardImpl::AlgoPack MatrixMulForwardImpl::sm_algo_pack;

MEGDNN_DEF_GET_ALGO_FROM_DESC(MatrixMulForwardImpl)

MatrixMulForwardImpl::AlgoBase::SizeArgs::SizeArgs(MatrixMulForwardImpl* o,
                                                   const TensorLayout& A,
                                                   const TensorLayout& B,
                                                   const TensorLayout& C)
        : opr{o}, layout_a{A}, layout_b{B}, layout_c{C} {}

MatrixMulForwardImpl::AlgoBase::ExecArgs::ExecArgs(MatrixMulForwardImpl* opr,
                                                   _megdnn_tensor_in A,
                                                   _megdnn_tensor_in B,
                                                   _megdnn_tensor_out C,
                                                   _megdnn_workspace workspace)
        : SizeArgs(opr, A.layout, B.layout, C.layout),
          tensor_a{A},
          tensor_b{B},
          tensor_c{C},
          workspace{workspace} {}

std::string MatrixMulForwardImpl::AlgoBase::SizeArgs::to_string() const {
    auto&& param = opr->param();
    size_t m = layout_a.shape[0], n = layout_b.shape[1],
           k = layout_a.shape[param.transposeA ? 0 : 1];
    MEGDNN_MARK_USED_VAR(m);
    MEGDNN_MARK_USED_VAR(n);
    MEGDNN_MARK_USED_VAR(k);
    return ssprintf(
            "A={%zux%zu},B={%zux%zu},C={%zux%zu},Transpose A=%d,Transpose "
            "B=%d,ldA=%zu,ldB=%zu,ldC=%zu",
            m, k, k, n, m, n, param.transposeA, param.transposeB,
            layout_a.stride[0], layout_b.stride[0], layout_c.stride[0]);
}

// vim: syntax=cpp.doxygen
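
The AlgoPack constructor above is a straightforward algorithm-registry pattern: each candidate implementation is appended to all_algos, then indexed in m_all_algos_map by its descriptor so that an algorithm can later be recovered from a serialized desc (via MEGDNN_DEF_GET_ALGO_FROM_DESC). Below is a minimal self-contained sketch of that pattern; the Algo type, the member names, and the lookup helper are illustrative stand-ins, not MegEngine's actual API:

#include <string>
#include <unordered_map>
#include <vector>

// Illustrative stand-in for MegEngine's AlgoBase; in the real code the key
// comes from algo->info().desc rather than a plain string member.
struct Algo {
    std::string desc;  // unique descriptor identifying this algorithm
    virtual ~Algo() = default;
};

struct AlgoPack {
    std::vector<Algo*> all_algos;                     // candidates, in registration order
    std::unordered_map<std::string, Algo*> algo_map;  // desc -> algo index

    void push(Algo* a) {
        all_algos.push_back(a);
        algo_map.emplace(a->desc, a);  // mirrors m_all_algos_map.emplace(...)
    }

    Algo* lookup(const std::string& desc) const {
        auto it = algo_map.find(desc);
        return it == algo_map.end() ? nullptr : it->second;
    }
};

Keeping both containers presumably reflects the two ways the pack is consumed: heuristic and profiling passes iterate all_algos in registration order, while restoring a previously chosen algorithm needs the O(1) descriptor lookup.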

The MegEngine installation package bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU or GPU build to choose between. To run GPU programs, make sure the machine has a GPU installed along with a working driver. If you would like to try deep learning development on a cloud GPU computing platform, you are welcome to visit the MegStudio platform.
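
As a quick sanity check that a GPU and driver are actually visible before running GPU code, you can query the CUDA runtime directly. This is a minimal standalone check against the plain CUDA runtime API (compile with nvcc), independent of MegEngine:

#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int count = 0;
    // cudaGetDeviceCount fails (e.g. cudaErrorNoDevice or
    // cudaErrorInsufficientDriver) when no usable GPU/driver is present.
    cudaError_t err = cudaGetDeviceCount(&count);
    if (err != cudaSuccess) {
        std::printf("no usable CUDA device: %s\n", cudaGetErrorString(err));
        return 1;
    }
    std::printf("found %d CUDA device(s)\n", count);
    return 0;
}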