convolution_operation.h

/***************************************************************************************************
 * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice,
 *       this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its
 *       contributors may be used to endorse or promote products derived from
 *       this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/**
 * \file dnn/src/cuda/cutlass/convolution_operation.h
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#pragma once

#include "cutlass/convolution/device/convolution.h"
#include "src/cuda/cutlass/library_internal.h"

///////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace library {

///////////////////////////////////////////////////////////////////////////////////////////////////
template <typename Operator_>
class ConvolutionOperationBase : public Operation {
public:
    using Operator = Operator_;
    using ElementSrc = typename Operator::ElementSrc;
    using LayoutSrc = typename Operator::LayoutSrc;
    using ElementFilter = typename Operator::ElementFilter;
    using LayoutFilter = typename Operator::LayoutFilter;
    using ElementDst = typename Operator::ElementDst;
    using LayoutDst = typename Operator::LayoutDst;
    using ElementBias = typename Operator::ElementBias;
    using LayoutBias = typename Operator::LayoutBias;
    using ElementAccumulator = typename Operator::ElementAccumulator;

    ConvolutionOperationBase(char const* name = "unknown_convolution") {
        m_description.name = name;
        m_description.provider = Provider::kCUTLASS;
        m_description.kind = OperationKind::kConvolution;
        m_description.conv_op = Operator::kConvolutionalOperator;

        m_description.tile_description.threadblock_shape = make_Coord(
                Operator::ThreadblockShape::kM, Operator::ThreadblockShape::kN,
                Operator::ThreadblockShape::kK);

        m_description.tile_description.threadblock_stages = Operator::kStages;

        m_description.tile_description.warp_count = make_Coord(
                Operator::ConvolutionKernel::WarpCount::kM,
                Operator::ConvolutionKernel::WarpCount::kN,
                Operator::ConvolutionKernel::WarpCount::kK);

        m_description.tile_description.math_instruction.instruction_shape = make_Coord(
                Operator::InstructionShape::kM, Operator::InstructionShape::kN,
                Operator::InstructionShape::kK);

        m_description.tile_description.math_instruction.element_accumulator =
                NumericTypeMap<ElementAccumulator>::kId;

        m_description.tile_description.math_instruction.opcode_class =
                OpcodeClassMap<typename Operator::OperatorClass>::kId;

        m_description.tile_description.math_instruction.math_operation =
                MathOperationMap<typename Operator::Operator>::kId;

        m_description.tile_description.minimum_compute_capability =
                ArchMap<typename Operator::ArchTag,
                        typename Operator::OperatorClass>::kMin;

        m_description.tile_description.maximum_compute_capability =
                ArchMap<typename Operator::ArchTag,
                        typename Operator::OperatorClass>::kMax;

        m_description.src =
                make_TensorDescription<ElementSrc, LayoutSrc>(Operator::kAlignmentSrc);
        m_description.filter = make_TensorDescription<ElementFilter, LayoutFilter>(
                Operator::kAlignmentFilter);
        m_description.dst =
                make_TensorDescription<ElementDst, LayoutDst>(Operator::kAlignmentDst);
        m_description.bias = make_TensorDescription<ElementBias, LayoutBias>(
                Operator::kAlignmentDst);

        m_description.convolution_type = Operator::kConvolutionType;
        m_description.arch_tag = ArchTagMap<typename Operator::ArchTag>::kId;

        m_description.epilogue_type = Operator::EpilogueOutputOp::kType;
        m_description.epilogue_count = Operator::EpilogueOutputOp::kCount;

        m_description.threadblock_swizzle =
                ThreadblockSwizzleMap<typename Operator::ThreadblockSwizzle>::kId;

        m_description.special_optimization = Operator::kSpecialOpt;
        m_description.gemm_mode = Operator::kGemmMode;
        m_description.without_shared_load = Operator::kWithoutSharedLoad;
    }

    virtual OperationDescription const& description() const { return m_description; }

protected:
    ConvolutionDescription m_description;
};
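
// Usage sketch (illustrative only, not part of the original header): instantiate
// the wrapper for a concrete device-level convolution and read its description
// back through the type-erased interface. The template parameters of
// device::Convolution are elided here, and the downcast assumes that
// ConvolutionDescription derives from OperationDescription.
//
//   using DeviceConv = cutlass::conv::device::Convolution</* elided */>;
//   ConvolutionOperationBase<DeviceConv> op("example_conv_fprop");
//   auto const& desc =
//           static_cast<ConvolutionDescription const&>(op.description());
//   // desc.tile_description, desc.src, desc.filter, desc.dst are now populated.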
///////////////////////////////////////////////////////////////////////////////////////////////////

namespace detail {

template <typename EpilogueOp, epilogue::EpilogueType type>
struct init_epilogue_param_;

template <typename EpilogueOp>
struct init_epilogue_param_<EpilogueOp, epilogue::EpilogueType::kLinearCombination> {
    using ElementCompute = typename EpilogueOp::ElementCompute;
    typename EpilogueOp::Params get(ConvolutionArguments const* conv_args) {
        return {*static_cast<ElementCompute const*>(conv_args->alpha),
                *static_cast<ElementCompute const*>(conv_args->beta)};
    }
};

template <typename EpilogueOp>
struct init_epilogue_param_<
        EpilogueOp, epilogue::EpilogueType::kBiasAddLinearCombination> {
    using ElementCompute = typename EpilogueOp::ElementCompute;
    typename EpilogueOp::Params get(ConvolutionArguments const* conv_args) {
        return {*static_cast<ElementCompute const*>(conv_args->alpha),
                *static_cast<ElementCompute const*>(conv_args->beta),
                *static_cast<ElementCompute const*>(conv_args->gamma),
                *static_cast<ElementCompute const*>(conv_args->delta)};
    }
};

template <typename EpilogueOp>
struct init_epilogue_param_<
        EpilogueOp, epilogue::EpilogueType::kBiasAddLinearCombinationClamp> {
    using ElementCompute = typename EpilogueOp::ElementCompute;
    typename EpilogueOp::Params get(ConvolutionArguments const* conv_args) {
        return {*static_cast<ElementCompute const*>(conv_args->alpha),
                *static_cast<ElementCompute const*>(conv_args->beta),
                *static_cast<ElementCompute const*>(conv_args->gamma),
                *static_cast<ElementCompute const*>(conv_args->delta)};
    }
};

template <typename EpilogueOp>
struct init_epilogue_param_<
        EpilogueOp, epilogue::EpilogueType::kBiasAddLinearCombinationRelu> {
    using ElementCompute = typename EpilogueOp::ElementCompute;
    typename EpilogueOp::Params get(ConvolutionArguments const* conv_args) {
        return {*static_cast<ElementCompute const*>(conv_args->alpha),
                *static_cast<ElementCompute const*>(conv_args->beta),
                *static_cast<ElementCompute const*>(conv_args->gamma),
                *static_cast<ElementCompute const*>(conv_args->threshold),
                *static_cast<ElementCompute const*>(conv_args->delta),
                *static_cast<ElementCompute const*>(conv_args->theta)};
    }
};

template <typename EpilogueOp>
struct init_epilogue_param_<
        EpilogueOp, epilogue::EpilogueType::kBiasAddLinearCombinationReluClamp> {
    using ElementCompute = typename EpilogueOp::ElementCompute;
    typename EpilogueOp::Params get(ConvolutionArguments const* conv_args) {
        return {*static_cast<ElementCompute const*>(conv_args->alpha),
                *static_cast<ElementCompute const*>(conv_args->beta),
                *static_cast<ElementCompute const*>(conv_args->gamma),
                *static_cast<ElementCompute const*>(conv_args->threshold),
                *static_cast<ElementCompute const*>(conv_args->delta),
                *static_cast<ElementCompute const*>(conv_args->theta)};
    }
};

template <typename EpilogueOp>
struct init_epilogue_param_<
        EpilogueOp, epilogue::EpilogueType::kBiasAddLinearCombinationHSwish> {
    using ElementCompute = typename EpilogueOp::ElementCompute;
    typename EpilogueOp::Params get(ConvolutionArguments const* conv_args) {
        return {*static_cast<ElementCompute const*>(conv_args->alpha),
                *static_cast<ElementCompute const*>(conv_args->beta),
                *static_cast<ElementCompute const*>(conv_args->gamma),
                *static_cast<ElementCompute const*>(conv_args->scale),
                *static_cast<ElementCompute const*>(conv_args->delta),
                *static_cast<ElementCompute const*>(conv_args->theta)};
    }
};

template <typename EpilogueOp>
struct init_epilogue_param_<
        EpilogueOp, epilogue::EpilogueType::kBiasAddLinearCombinationHSwishClamp> {
    using ElementCompute = typename EpilogueOp::ElementCompute;
    typename EpilogueOp::Params get(ConvolutionArguments const* conv_args) {
        return {*static_cast<ElementCompute const*>(conv_args->alpha),
                *static_cast<ElementCompute const*>(conv_args->beta),
                *static_cast<ElementCompute const*>(conv_args->gamma),
                *static_cast<ElementCompute const*>(conv_args->scale),
                *static_cast<ElementCompute const*>(conv_args->delta),
                *static_cast<ElementCompute const*>(conv_args->theta)};
    }
};

}  // namespace detail

template <typename EpilogueOp>
struct init_epilogue_param
        : public detail::init_epilogue_param_<EpilogueOp, EpilogueOp::kType> {};
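
// A minimal sketch of how this trait is consumed: get() pulls the scalar
// pointers the chosen epilogue needs out of ConvolutionArguments and builds the
// epilogue's Params. For a kBiasAddLinearCombination epilogue that means alpha,
// beta, gamma and delta. SomeConvOperator is a placeholder name, and float is
// assumed to match ElementCompute.
//
//   float alpha = 1.f, beta = 0.f, gamma = 1.f, delta = 0.f;
//   ConvolutionArguments conv_args;
//   conv_args.alpha = &alpha;
//   conv_args.beta = &beta;
//   conv_args.gamma = &gamma;
//   conv_args.delta = &delta;
//   using EpilogueOp = typename SomeConvOperator::EpilogueOutputOp;
//   typename EpilogueOp::Params params =
//           init_epilogue_param<EpilogueOp>().get(&conv_args);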
///////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Operator_>
class ConvolutionOperation : public ConvolutionOperationBase<Operator_> {
public:
    using Operator = Operator_;
    using ElementSrc = typename Operator::ElementSrc;
    using LayoutSrc = typename Operator::LayoutSrc;
    using ElementFilter = typename Operator::ElementFilter;
    using LayoutFilter = typename Operator::LayoutFilter;
    using ElementBias = typename Operator::ElementBias;
    using LayoutBias = typename Operator::LayoutBias;
    using ElementDst = typename Operator::ElementDst;
    using LayoutDst = typename Operator::LayoutDst;
    using ElementAccumulator = typename Operator::ElementAccumulator;
    using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute;
    using OperatorArguments = typename Operator::Arguments;

    ConvolutionOperation(char const* name = "unknown_gemm")
            : ConvolutionOperationBase<Operator_>(name) {}

    virtual Status run(
            void const* arguments_ptr, void* device_workspace = nullptr,
            cudaStream_t stream = nullptr) const {
        cutlass::conv::Operator conv_op = this->m_description.conv_op;

        ConvolutionArguments const* conv_args =
                reinterpret_cast<ConvolutionArguments const*>(arguments_ptr);
        const auto& ps = conv_args->problem_size;

        OperatorArguments args;
        args.problem_size = ps;
        args.ref_src = {
                static_cast<ElementSrc*>(const_cast<void*>(conv_args->src)),
                LayoutSrc::packed(implicit_gemm_tensor_a_extent(conv_op, ps))};
        args.ref_filter = {
                static_cast<ElementFilter*>(const_cast<void*>(conv_args->filter)),
                LayoutFilter::packed(implicit_gemm_tensor_b_extent(conv_op, ps))};
        args.ref_bias = {
                static_cast<ElementBias*>(const_cast<void*>(conv_args->bias)),
                LayoutBias::packed(implicit_gemm_tensor_bias_extent(conv_op, ps))};
        args.ref_z = {
                static_cast<ElementDst*>(const_cast<void*>(conv_args->z)),
                LayoutDst::packed(implicit_gemm_tensor_c_extent(conv_op, ps))};
        args.ref_dst = {
                static_cast<ElementDst*>(conv_args->dst),
                LayoutDst::packed(implicit_gemm_tensor_c_extent(conv_op, ps))};

        args.output_op = init_epilogue_param<typename Operator::EpilogueOutputOp>().get(
                conv_args);

        if (conv_args->extra_param) {
            args.extra_param = *reinterpret_cast<typename Operator::ExtraParam const*>(
                    conv_args->extra_param);
        }

        Operator op;
        Status status = op.initialize(args, device_workspace);
        if (status != Status::kSuccess) {
            return status;
        }
        return op.run(stream);
    }
};
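
// Illustrative host-side dispatch (a sketch; pointer names such as d_src and the
// stream are assumptions of this example): fill a ConvolutionArguments with
// device pointers and epilogue scalars, then run through the type-erased
// Operation interface, which forwards to Operator::initialize()/run() above.
//
//   ConvolutionArguments conv_args;
//   conv_args.problem_size = problem_size;  // same type as Operator::Arguments::problem_size
//   conv_args.src = d_src;
//   conv_args.filter = d_filter;
//   conv_args.bias = d_bias;
//   conv_args.z = d_z;
//   conv_args.dst = d_dst;
//   conv_args.alpha = &alpha;  // plus beta/gamma/... as the epilogue requires
//   Operation const* op = /* looked up from an operation table */;
//   Status status = op->run(&conv_args, /* device_workspace */ nullptr, stream);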
///////////////////////////////////////////////////////////////////////////////////////////////////

/// We add a new template class to handle the convolution backward filter
/// operation, because the device-level convolution operator for backward filter
/// differs from the other two (convolution forward and convolution backward
/// data). The description object is reused in this backward filter wrapper
/// because we do not want to introduce another unnecessary structure.
/// TODO: Maybe the device-level operators in cutlass for convolution forward,
/// backward data and backward filter should be combined.
template <typename Operator_>
class ConvolutionBackwardFilterOperationBase : public Operation {
public:
    using Operator = Operator_;
    using ElementSrc = typename Operator::ElementSrc;
    using LayoutSrc = typename Operator::LayoutSrc;
    using ElementDiff = typename Operator::ElementDiff;
    using LayoutDiff = typename Operator::LayoutDiff;
    using ElementGrad = typename Operator::ElementGrad;
    using LayoutGrad = typename Operator::LayoutGrad;
    using ElementAccumulator = typename Operator::ElementAccumulator;

    ConvolutionBackwardFilterOperationBase(char const* name = "unknown_convolution") {
        m_description.name = name;
        m_description.provider = Provider::kCUTLASS;
        m_description.kind = OperationKind::kConvolution;
        m_description.conv_op = Operator::kConvolutionalOperator;

        m_description.tile_description.threadblock_shape = make_Coord(
                Operator::ThreadblockShape::kM, Operator::ThreadblockShape::kN,
                Operator::ThreadblockShape::kK);

        m_description.tile_description.threadblock_stages = Operator::kStages;

        m_description.tile_description.warp_count = make_Coord(
                Operator::ConvolutionKernel::WarpCount::kM,
                Operator::ConvolutionKernel::WarpCount::kN,
                Operator::ConvolutionKernel::WarpCount::kK);

        m_description.tile_description.math_instruction.instruction_shape = make_Coord(
                Operator::InstructionShape::kM, Operator::InstructionShape::kN,
                Operator::InstructionShape::kK);

        m_description.tile_description.math_instruction.element_accumulator =
                NumericTypeMap<ElementAccumulator>::kId;

        m_description.tile_description.math_instruction.opcode_class =
                OpcodeClassMap<typename Operator::OperatorClass>::kId;

        m_description.tile_description.math_instruction.math_operation =
                MathOperationMap<typename Operator::Operator>::kId;

        m_description.tile_description.minimum_compute_capability =
                ArchMap<typename Operator::ArchTag,
                        typename Operator::OperatorClass>::kMin;

        m_description.tile_description.maximum_compute_capability =
                ArchMap<typename Operator::ArchTag,
                        typename Operator::OperatorClass>::kMax;

        /// src in the description -> src in the C++ template
        m_description.src =
                make_TensorDescription<ElementSrc, LayoutSrc>(Operator::kAlignmentSrc);
        /// filter in the description -> diff in the C++ template
        m_description.filter = make_TensorDescription<ElementDiff, LayoutDiff>(
                Operator::kAlignmentDiff);
        /// dst in the description -> grad in the C++ template
        m_description.dst = make_TensorDescription<ElementGrad, LayoutGrad>(
                Operator::kAlignmentGrad);
        /// Because the bias tensor is not used in the ConvolutionBackwardFilter
        /// operation, the following tensor description is a dummy argument.
        m_description.bias = make_TensorDescription<ElementGrad, LayoutGrad>(
                Operator::kAlignmentGrad);

        m_description.convolution_type = Operator::kConvolutionType;
        m_description.arch_tag = ArchTagMap<typename Operator::ArchTag>::kId;

        m_description.epilogue_type = Operator::EpilogueOutputOp::kType;
        m_description.epilogue_count = Operator::EpilogueOutputOp::kCount;

        m_description.threadblock_swizzle =
                ThreadblockSwizzleMap<typename Operator::ThreadblockSwizzle>::kId;

        m_description.special_optimization = Operator::kSpecialOpt;
        m_description.gemm_mode = Operator::kGemmMode;
        /// The ConvolutionBackwardFilter operation is only used for depthwise
        /// convolution, so the option without_shared_load is always true.
        m_description.without_shared_load = true;
    }

    virtual OperationDescription const& description() const { return m_description; }

protected:
    ConvolutionDescription m_description;
};
///////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Operator_>
class ConvolutionBackwardFilterOperation
        : public ConvolutionBackwardFilterOperationBase<Operator_> {
public:
    using Operator = Operator_;
    using ElementSrc = typename Operator::ElementSrc;
    using LayoutSrc = typename Operator::LayoutSrc;
    using ElementDiff = typename Operator::ElementDiff;
    using LayoutDiff = typename Operator::LayoutDiff;
    using ElementGrad = typename Operator::ElementGrad;
    using LayoutGrad = typename Operator::LayoutGrad;
    using ElementAccumulator = typename Operator::ElementAccumulator;
    using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute;
    using OperatorArguments = typename Operator::Arguments;

    ConvolutionBackwardFilterOperation(char const* name = "unknown_gemm")
            : ConvolutionBackwardFilterOperationBase<Operator_>(name) {}

    virtual Status run(
            void const* arguments_ptr, void* device_workspace = nullptr,
            cudaStream_t stream = nullptr) const {
        cutlass::conv::Operator conv_op = this->m_description.conv_op;

        ConvolutionArguments const* conv_args =
                reinterpret_cast<ConvolutionArguments const*>(arguments_ptr);
        const auto& ps = conv_args->problem_size;

        OperatorArguments args;
        args.problem_size = ps;
        /// src in convolution arguments -> ref_src
        args.ref_src = {
                static_cast<ElementSrc*>(const_cast<void*>(conv_args->src)),
                LayoutSrc::packed(implicit_gemm_tensor_b_extent(conv_op, ps))};
        /// filter in convolution arguments -> ref_diff
        args.ref_diff = {
                static_cast<ElementDiff*>(const_cast<void*>(conv_args->filter)),
                LayoutDiff::packed(implicit_gemm_tensor_a_extent(conv_op, ps))};
        /// dst in convolution arguments -> ref_grad
        args.ref_grad = {
                static_cast<ElementGrad*>(conv_args->dst),
                LayoutGrad::packed(implicit_gemm_tensor_c_extent(conv_op, ps))};

        args.output_op = init_epilogue_param<typename Operator::EpilogueOutputOp>().get(
                conv_args);

        Operator op;
        Status status = op.initialize(args, device_workspace);
        if (status != Status::kSuccess) {
            return status;
        }
        return op.run(stream);
    }
};
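
// In the backward-filter (wgrad) mapping above, the implicit-GEMM operand roles
// are swapped relative to forward: the diff tensor is sized with the A extent,
// src with the B extent, and grad (the filter gradient) with the C extent. A
// dispatch sketch mirroring the forward example (pointer names are assumed):
//
//   ConvolutionArguments conv_args;
//   conv_args.src = d_src;      // activations       -> args.ref_src
//   conv_args.filter = d_diff;  // output-side diff  -> args.ref_diff
//   conv_args.dst = d_wgrad;    // filter gradient   -> args.ref_grad
//   conv_args.alpha = &alpha;
//   conv_args.beta = &beta;
//   Status status = op->run(&conv_args, device_workspace, stream);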
///////////////////////////////////////////////////////////////////////////////////////////////////

}  // namespace library
}  // namespace cutlass

///////////////////////////////////////////////////////////////////////////////////////////////////