You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

algos.h 6.0 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186
  1. /**
  2. * \file dnn/src/cuda/matrix_mul/algos.h
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #pragma once
  12. #include "megdnn/oprs.h"
  13. #include "src/common/utils.h"
  14. #include "src/cuda/matrix_mul/opr_impl.h"
  15. #include <cuda.h>
  16. #include <memory>
  17. #if CUDA_VERSION >= 10010
  18. #include <cublasLt.h>
  19. #endif
  20. namespace megdnn {
  21. namespace cuda {
/*!
 * \brief base class for all CUDA matrix mul algorithms
 *
 * Each concrete algorithm reports whether it can handle a given problem
 * (is_available), how much scratch memory it needs (get_workspace_in_bytes),
 * and performs the computation (exec).
 */
class MatrixMulForwardImpl::AlgoBase : public Algorithm {
protected:
    // non-virtual protected dtor: instances are never deleted through AlgoBase*
    ~AlgoBase() = default;

public:
    AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::CUDA; }

    //! problem description (operator + layouts) used for availability and
    //! workspace queries; no tensor data is involved
    struct SizeArgs {
        MatrixMulForwardImpl* opr;
        TensorLayout layout_a, layout_b, layout_c;

        std::string to_string() const;
        SizeArgs(MatrixMulForwardImpl* opr, const TensorLayout& A,
                 const TensorLayout& B, const TensorLayout& C);

        //! true iff both inputs share one of the (Quantized)Int8 dtypes, the
        //! output is (Quantized)Int32 and the format is DEFAULT, so the
        //! problem can be dispatched as a plain int8x8x32 gemm
        bool can_be_treated_as_int8x8x32() const {
            return layout_a.dtype.enumv() == layout_b.dtype.enumv() &&
                   (layout_a.dtype.enumv() == DTypeEnum::Int8 ||
                    layout_a.dtype.enumv() == DTypeEnum::QuantizedS8) &&
                   (layout_c.dtype.enumv() == DTypeEnum::Int32 ||
                    layout_c.dtype.enumv() == DTypeEnum::QuantizedS32) &&
                   opr->param().format == param::MatrixMul::Format::DEFAULT;
        }
    };

    //! SizeArgs extended with the concrete tensors and workspace for exec()
    struct ExecArgs : public SizeArgs {
        TensorND tensor_a, tensor_b, tensor_c;
        Workspace workspace;

        ExecArgs(MatrixMulForwardImpl* opr, _megdnn_tensor_in A,
                 _megdnn_tensor_in B, _megdnn_tensor_out C,
                 _megdnn_workspace workspace);
    };

    virtual bool is_available(const SizeArgs& args) const = 0;
    virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0;
    virtual void exec(const ExecArgs& args) const = 0;

    //! available and the required workspace does not exceed \p limit bytes
    bool is_available_wk(const SizeArgs& args, size_t limit) {
        return is_available(args) && get_workspace_in_bytes(args) <= limit;
    }

    //! like is_available_wk(), additionally requiring is_reproducible()
    //! when \p reproducible is set
    bool is_available_reproducible(
            const SizeArgs& args, bool reproducible = true,
            size_t limit = std::numeric_limits<size_t>::max()) {
        return (!reproducible || is_reproducible()) &&
               is_available_wk(args, limit);
    }

    //! assert that \p workspace is large enough; returns *this for chaining
    AlgoBase& check_workspace(const SizeArgs& args,
                              const Workspace& workspace) {
        auto req = get_workspace_in_bytes(args);
        megdnn_assert(
                req <= workspace.size,
                "matrix mul fwd algo %s: required workspace %zu bytes, got %zu",
                name(), req, workspace.size);
        return *this;
    }
};
//! matrix mul backed by cuBLAS; needs no extra workspace
class MatrixMulForwardImpl::AlgoCuBlas final : public AlgoBase {
public:
    AlgoCuBlas() = default;
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& /* args */) const override {
        return 0_z;
    }
    const char* name() const override {
        return "CUBLAS";
    }
    void exec(const ExecArgs& args) const override;
    bool is_reproducible() const override {
        return true;
    }
};
#if CUDA_VERSION >= 10000
//! uint4 x uint4 -> int32 matrix mul via the WMMA (tensor core) API;
//! only compiled with CUDA >= 10.0
class MatrixMulForwardImpl::AlgoUInt4x4x32WMMA final : public AlgoBase {
public:
    AlgoUInt4x4x32WMMA() = default;
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    const char* name() const override {
        return "UINT4x4x32_WMMA";
    }
    void exec(const ExecArgs& args) const override;
    bool is_reproducible() const override {
        return true;
    }
};
#endif
#if CUDA_VERSION >= 10010
//! matrix mul backed by cuBLASLt; only compiled with CUDA >= 10.1
//! (cublasLt.h is included under the same guard above)
class MatrixMulForwardImpl::AlgoCuBlasLt final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    const char* name() const override {
        return "CUBLAS_LT";
    }
    void exec(const ExecArgs& args) const override;
    bool is_reproducible() const override {
        return true;
    }
};
#endif
//! reference (naive) matrix mul implementation; needs no extra workspace
class MatrixMulForwardImpl::AlgoNaive final : public AlgoBase {
public:
    AlgoNaive() = default;
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& /* args */) const override {
        return 0_z;
    }
    const char* name() const override { return "NAIVE"; }
    void exec(const ExecArgs& args) const override;
    bool is_reproducible() const override { return true; }
};
#if !MEGDNN_DISABLE_FLOAT16
//! wrapper algorithm that delegates to another AlgoBase instance;
//! presumably converts bfloat16 problems to float32 for the wrapped algo
//! (see float_args / get_workspace_bundle) -- confirm in algos.cpp
class MatrixMulForwardImpl::AlgoBFloat16 final : public AlgoBase {
public:
    AlgoBFloat16(MatrixMulForwardImpl::AlgoBase*);
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    const char* name() const override { return m_name.c_str(); }
    void exec(const ExecArgs& args) const override;
    bool is_reproducible() const override { return true; }

private:
    // underlying algorithm actually performing the computation
    MatrixMulForwardImpl::AlgoBase* m_algorithm = nullptr;
    // display name, built at construction (returned by name())
    std::string m_name;
    // scratch buffers layout used by exec()
    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
    // args with layouts rewritten for the wrapped algorithm
    SizeArgs float_args(const SizeArgs& args) const;
};
#endif
//! container holding one instance of every matrix mul algorithm that is
//! enabled in this build (members appear/disappear with the same
//! CUDA_VERSION / MEGDNN_DISABLE_FLOAT16 guards as their classes)
class MatrixMulForwardImpl::AlgoPack {
    // non-copyable: a single pack instance owns the algo objects
    AlgoPack(const AlgoPack&) = delete;
    AlgoPack& operator=(const AlgoPack&) = delete;

public:
    AlgoPack();

    AlgoCuBlas cublas;
    AlgoNaive naive;
#if CUDA_VERSION >= 10000
    AlgoUInt4x4x32WMMA wmma_uint4x4x32;
#endif
#if CUDA_VERSION >= 10010
    AlgoCuBlasLt cublas_lt;
#endif
#if !MEGDNN_DISABLE_FLOAT16
    std::unique_ptr<AlgoBFloat16> cublas_bfloat16;
#endif
    //! flat view over all algorithms above, for enumeration/dispatch
    std::vector<AlgoBase*> all_algos;
};
  164. } // namespace cuda
  165. } // namespace megdnn
  166. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台