From 35e9cc9845790f766c325d9349c1fcffa787428b Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Mon, 6 Dec 2021 16:30:09 +0800 Subject: [PATCH] feat(dnn/cuda): add cudnn frontend api GitOrigin-RevId: 9b18a57893fd7059a29edc11fc04c14d8176e547 --- dnn/CMakeLists.txt | 7 +- dnn/include/megdnn/algorithm_cache.h | 19 +- dnn/src/CMakeLists.txt | 2 + dnn/src/cuda/conv_bias/algo.cpp | 30 + dnn/src/cuda/conv_bias/algo.h | 143 ++++- dnn/src/cuda/conv_bias/cudnn_conv.cpp | 110 +--- dnn/src/cuda/conv_bias/cudnn_conv_base.cpp | 87 +++ .../cuda/conv_bias/cudnn_conv_bias_activation.cpp | 127 +--- .../conv_bias/cudnn_conv_bias_activation_base.cpp | 210 +++++++ .../conv_bias/cudnn_conv_bias_activation_v8.cpp | 145 +++++ dnn/src/cuda/conv_bias/cudnn_conv_v8.cpp | 98 +++ dnn/src/cuda/conv_bias/helper.cpp | 54 +- dnn/src/cuda/conv_bias/helper.h | 9 + dnn/src/cuda/conv_bias/opr_impl.cpp | 11 + dnn/src/cuda/conv_bias/opr_impl.h | 7 + dnn/src/cuda/cudnn_wrapper_v8.cpp | 685 +++++++++++++++++++++ dnn/src/cuda/cudnn_wrapper_v8.h | 70 +++ dnn/src/cuda/handle.cpp | 15 + dnn/test/cuda/conv_v8.cpp | 304 +++++++++ third_party/prepare.sh | 1 + 20 files changed, 1905 insertions(+), 229 deletions(-) create mode 100644 dnn/src/cuda/conv_bias/cudnn_conv_base.cpp create mode 100644 dnn/src/cuda/conv_bias/cudnn_conv_bias_activation_base.cpp create mode 100644 dnn/src/cuda/conv_bias/cudnn_conv_bias_activation_v8.cpp create mode 100644 dnn/src/cuda/conv_bias/cudnn_conv_v8.cpp create mode 100644 dnn/src/cuda/cudnn_wrapper_v8.cpp create mode 100644 dnn/src/cuda/cudnn_wrapper_v8.h create mode 100644 dnn/test/cuda/conv_v8.cpp diff --git a/dnn/CMakeLists.txt b/dnn/CMakeLists.txt index aaccea26..d0f65241 100644 --- a/dnn/CMakeLists.txt +++ b/dnn/CMakeLists.txt @@ -54,9 +54,12 @@ if(MGE_WITH_CUDA) add_library(cutlass INTERFACE) target_include_directories( cutlass + INTERFACE $) + add_library(cudnn-frontend INTERFACE) + target_include_directories( + cudnn-frontend INTERFACE - $ - $) + $) endif() if(MGE_WITH_TEST) diff --git a/dnn/include/megdnn/algorithm_cache.h b/dnn/include/megdnn/algorithm_cache.h index 421df3b5..cb300843 100644 --- a/dnn/include/megdnn/algorithm_cache.h +++ b/dnn/include/megdnn/algorithm_cache.h @@ -22,7 +22,16 @@ public: bool operator==(const KeyStorage& k) const { return k1 == k.k1 && k2 == k.k2; } }; - struct Key { + struct Hash { + size_t operator()(const KeyStorage& k) const { + size_t h1 = k.k1; + size_t h2 = k.k2; + h1 ^= h2 + 0x9e3779b9 + (h1 << 6) + (h1 >> 2); + return h1; + } + }; + + class Key { Handle* m_handle; uint32_t m_opr_type; const TensorLayout* m_inp_layouts_ptr; @@ -62,14 +71,6 @@ public: MGE_WIN_DECLSPEC_FUC void clear(); private: - struct Hash { - size_t operator()(const KeyStorage& k) const { - size_t h1 = k.k1; - size_t h2 = k.k2; - h1 ^= h2 + 0x9e3779b9 + (h1 << 6) + (h1 >> 2); - return h1; - } - }; std::unordered_map m_heuristic_cache; #if __DEPLOY_ON_XP_SP2__ size_t m_mtx; diff --git a/dnn/src/CMakeLists.txt b/dnn/src/CMakeLists.txt index 80cfc43e..d37f377c 100644 --- a/dnn/src/CMakeLists.txt +++ b/dnn/src/CMakeLists.txt @@ -222,6 +222,8 @@ target_link_libraries(megdnn PUBLIC opr_param_defs) if(MGE_WITH_CUDA) target_link_libraries(megdnn PRIVATE $) target_include_directories(megdnn PRIVATE ${CUDNN_INCLUDE_DIR}) + + target_link_libraries(megdnn PRIVATE $) endif() if(MGE_WITH_ROCM) diff --git a/dnn/src/cuda/conv_bias/algo.cpp b/dnn/src/cuda/conv_bias/algo.cpp index f56d0699..df1485e7 100644 --- a/dnn/src/cuda/conv_bias/algo.cpp +++ b/dnn/src/cuda/conv_bias/algo.cpp @@ -14,6 +14,12 @@ 
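// ---- [editor's sketch] ------------------------------------------------------
// The Hash functor moved into the public section of algorithm_cache.h above
// combines the two halves of KeyStorage with the boost-style hash_combine
// step; 0x9e3779b9 is the 32-bit golden-ratio constant, and the shifts spread
// entropy across bit positions. Standalone illustration, not part of the patch:
#include <cstddef>

inline std::size_t hash_combine_sketch(std::size_t h1, std::size_t h2) {
    // identical mixing to AlgorithmCache::Hash::operator()
    h1 ^= h2 + 0x9e3779b9 + (h1 << 6) + (h1 >> 2);
    return h1;
}
// -----------------------------------------------------------------------------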
ConvBiasForwardImpl::AlgoPack::AlgoPack() { non_cudnn_algos.push_back(&matmul8x8x32); non_cudnn_algos.push_back(&batched_matmul); non_cudnn_algos.push_back(&int1_simple); + +#if CUDNN_VERSION > 8004 + all_algos.push_back(&cudnn_conv_v8); + all_algos.push_back(&cudnn_conv_bias_activation_v8); +#endif + fill_cudnn_algos(); for (auto&& algo : cudnn_conv_bias_activations) { all_algos.push_back(&algo); @@ -169,6 +175,30 @@ std::string ConvBiasForwardImpl::AlgoBase::SizeArgs::to_string() const { nonlinear_mode_str.c_str()); } +param::Convolution ConvBiasForwardImpl::AlgoBase::get_param_convolution( + const SizeArgs& args) const { + param::Convolution::Mode mode; + param::Convolution::Sparse sparse = args.filter_meta.group > 1 + ? param::Convolution::Sparse::GROUP + : param::Convolution::Sparse::DENSE; + if (args.filter_meta.should_flip) { + mode = param::Convolution::Mode::CONVOLUTION; + } else { + mode = param::Convolution::Mode::CROSS_CORRELATION; + } + return param::Convolution{ + mode, + args.filter_meta.padding[0], + args.filter_meta.padding[1], + args.filter_meta.stride[0], + args.filter_meta.stride[1], + args.filter_meta.dilation[1], + args.filter_meta.dilation[0], + sparse, + args.filter_meta.format, + args.opr->param().compute_mode}; +} + void ConvBiasForwardImpl::AlgoPack::fill_cudnn_algos() { for (auto&& algo : CudnnAlgoPack::conv_fwd_algos()) { cudnn_conv_bias_activations.push_back(algo.first); diff --git a/dnn/src/cuda/conv_bias/algo.h b/dnn/src/cuda/conv_bias/algo.h index 1af92c26..c9b237c3 100644 --- a/dnn/src/cuda/conv_bias/algo.h +++ b/dnn/src/cuda/conv_bias/algo.h @@ -76,6 +76,8 @@ public: CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32, CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16, CUDA_SIMPLE_INT1, + CUDA_CUDNN_CONV_V8, + CUDA_CUDNN_CONVBIAS_V8, }; using Mapper = std::unordered_map; @@ -157,12 +159,40 @@ public: } virtual bool is_cudnn() const { return false; } + + param::Convolution get_param_convolution(const SizeArgs& args) const; +}; + +class ConvBiasForwardImpl::AlgoCUDNNConvBiasActivationBase : public AlgoBase { +public: + AlgoCUDNNConvBiasActivationBase() = default; + virtual ~AlgoCUDNNConvBiasActivationBase() = default; + + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + + bool is_cudnn() const override { return true; } + + size_t get_preprocess_workspace_in_bytes(const SizeArgs& args) const override; + SmallVector deduce_preprocessed_filter_layout( + const SizeArgs& args) const override; + void exec_preprocess(const ExecArgs& args) const override; + +protected: + virtual size_t cudnn_get_workspace_in_bytes(const SizeArgs& args) const = 0; + virtual void cudnn_execute( + const ExecArgs& args, const Workspace& workspace, float alpha, + float beta) const = 0; + +protected: + std::string m_name; }; -class ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation final : public AlgoBase { +class ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation final + : public AlgoCUDNNConvBiasActivationBase { public: AlgoCUDNNConvBiasActivation(cudnnConvolutionFwdAlgo_t cudnn_enum) - : m_cudnn_enum(cudnn_enum) { + : AlgoCUDNNConvBiasActivationBase(), m_cudnn_enum(cudnn_enum) { megdnn_assert( CudnnAlgoPack::conv_fwd_algos().find(cudnn_enum) != CudnnAlgoPack::conv_fwd_algos().end()); @@ -171,9 +201,6 @@ public: "CUDNN:ConvBiasActivation:" + m_attr.name, {}); } - size_t get_workspace_in_bytes(const SizeArgs& args) const override; - void exec(const ExecArgs& args) const override; - param::Convolution get_param_convolution(const SizeArgs& 
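// ---- [editor's sketch] ------------------------------------------------------
// The new base classes introduced above (AlgoCUDNNConvBase,
// AlgoCUDNNConvBiasActivationBase) follow the template-method pattern: the
// base owns everything version-independent (workspace bundling, dtype
// conversion, z/bias handling, activation epilogue) while the v7/v8 subclasses
// only supply the cudnn_get_workspace_in_bytes / cudnn_execute hooks.
// Condensed shape of the pattern with hypothetical names, not the real
// megdnn classes:
#include <cstdio>

class CudnnAlgoSketch {
public:
    void exec() const {
        // shared driver: prepare buffers, then defer to the backend hook,
        // then run the shared epilogue (z-add, bias, nonlinearity)
        cudnn_execute();
    }
    virtual ~CudnnAlgoSketch() = default;

protected:
    virtual void cudnn_execute() const = 0;
};

class LegacyV7Sketch final : public CudnnAlgoSketch {
    void cudnn_execute() const override { std::puts("cudnnConvolutionForward"); }
};

class FrontendV8Sketch final : public CudnnAlgoSketch {
    void cudnn_execute() const override { std::puts("execute cached ExecutionPlan"); }
};
// -----------------------------------------------------------------------------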
args) const; bool is_available(const SizeArgs&) const override; const char* name() const override { return m_name.c_str(); } @@ -191,8 +218,6 @@ public: cudnnConvolutionFwdAlgo_t cudnn_enum() { return m_cudnn_enum; } - bool is_cudnn() const override { return true; } - MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN_CONVBIAS) std::string param() const override { @@ -202,11 +227,46 @@ public: } private: - std::string m_name; + size_t cudnn_get_workspace_in_bytes(const SizeArgs& args) const override; + void cudnn_execute( + const ExecArgs& args, const Workspace& workspace, float alpha, + float beta) const override; + +private: cudnnConvolutionFwdAlgo_t m_cudnn_enum; CudnnAlgoPack::Attr m_attr; }; +#if CUDNN_VERSION > 8004 +class ConvBiasForwardImpl::AlgoCUDNNConvBiasActivationV8 final + : public AlgoCUDNNConvBiasActivationBase { +public: + AlgoCUDNNConvBiasActivationV8() : AlgoCUDNNConvBiasActivationBase() { + m_name = ConvBiasForward::algo_name( + "CUDNN:ConvBiasActivationV8", {}); + } + ~AlgoCUDNNConvBiasActivationV8() = default; + + bool is_available(const SizeArgs& args) const override; + + AlgoAttribute attribute() const override { + return AlgoAttribute::REPRODUCIBLE | AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; + } + + const char* name() const override { return m_name.c_str(); } + + MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN_CONVBIAS_V8) + + std::string param() const override { return ""; } + +private: + size_t cudnn_get_workspace_in_bytes(const SizeArgs& args) const override; + void cudnn_execute( + const ExecArgs& args, const Workspace& workspace, float alpha, + float beta) const override; +}; +#endif + class ConvBiasForwardImpl::AlgoChanwise final : public AlgoBase { public: bool is_available(const SizeArgs& args) const override; @@ -284,9 +344,34 @@ private: mutable std::string m_name; }; -class ConvBiasForwardImpl::AlgoCUDNNConv final : public AlgoBase { +class ConvBiasForwardImpl::AlgoCUDNNConvBase : public AlgoBase { +public: + AlgoCUDNNConvBase() = default; + virtual ~AlgoCUDNNConvBase() = default; + + size_t get_workspace_in_bytes(const SizeArgs& args) const override { + return get_workspace_bundle(nullptr, args).total_size_in_bytes(); + } + void exec(const ExecArgs& args) const override; + + bool is_cudnn() const override { return true; } + +protected: + virtual size_t cudnn_get_workspace_in_bytes(const SizeArgs& args) const = 0; + virtual void cudnn_execute( + const ExecArgs& args, const Workspace& workspace) const = 0; + +private: + WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const; + +protected: + std::string m_name; +}; + +class ConvBiasForwardImpl::AlgoCUDNNConv final : public AlgoCUDNNConvBase { public: - AlgoCUDNNConv(cudnnConvolutionFwdAlgo_t cudnn_enum) : m_cudnn_enum(cudnn_enum) { + AlgoCUDNNConv(cudnnConvolutionFwdAlgo_t cudnn_enum) + : AlgoCUDNNConvBase(), m_cudnn_enum(cudnn_enum) { megdnn_assert( CudnnAlgoPack::conv_fwd_algos().find(cudnn_enum) != CudnnAlgoPack::conv_fwd_algos().end()); @@ -296,8 +381,6 @@ public: } bool is_available(const SizeArgs& args) const override; - size_t get_workspace_in_bytes(const SizeArgs& args) const override; - void exec(const ExecArgs& args) const override; AlgoAttribute attribute() const override { auto ret = static_cast(0); @@ -314,8 +397,6 @@ public: cudnnConvolutionFwdAlgo_t cudnn_enum() const { return m_cudnn_enum; } - bool is_cudnn() const override { return true; } - MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN_CONV) std::string param() const override { @@ -325,12 +406,38 @@ public: } private: - std::string m_name; + size_t 
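// ---- [editor's note] --------------------------------------------------------
// Both V8 algorithms above report REPRODUCIBLE because plan selection in
// cudnn_wrapper_v8.cpp runs with StaticData::deterministic = true, which
// filters out engine configs tagged CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC
// (configs tagged CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS are always
// dropped). ACCURACY_DEPEND_ON_BATCH is presumably set because the
// heuristically chosen kernel, and hence its reduction order, can change
// with the batch size.
// -----------------------------------------------------------------------------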
cudnn_get_workspace_in_bytes(const SizeArgs& args) const override; + void cudnn_execute(const ExecArgs& args, const Workspace& workspace) const override; + +private: cudnnConvolutionFwdAlgo_t m_cudnn_enum; CudnnAlgoPack::Attr m_attr; +}; - WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const; +#if CUDNN_VERSION > 8004 +class ConvBiasForwardImpl::AlgoCUDNNConvV8 final : public AlgoCUDNNConvBase { +public: + AlgoCUDNNConvV8() : AlgoCUDNNConvBase() { + m_name = ConvBiasForward::algo_name("CUDNN:ConvolutionV8", {}); + } + + bool is_available(const SizeArgs& args) const override; + + AlgoAttribute attribute() const override { + return AlgoAttribute::REPRODUCIBLE | AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; + } + + const char* name() const override { return m_name.c_str(); } + + MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN_CONV_V8) + + std::string param() const override { return ""; } + +private: + size_t cudnn_get_workspace_in_bytes(const SizeArgs& args) const override; + void cudnn_execute(const ExecArgs& args, const Workspace& workspace) const override; }; +#endif //! compute small matmul in the kernel class ConvBiasForwardImpl::AlgoInplaceMatmul final : public AlgoBase { @@ -1140,6 +1247,10 @@ public: AlgoGroupConvGeneral group; AlgoBFloat16 bfloat16; AlgoSimpleInt1 int1_simple; +#if CUDNN_VERSION > 8004 + AlgoCUDNNConvV8 cudnn_conv_v8; + AlgoCUDNNConvBiasActivationV8 cudnn_conv_bias_activation_v8; +#endif AlgoBase* cudnn_conv_bias_act_from_enum(cudnnConvolutionFwdAlgo_t algo); diff --git a/dnn/src/cuda/conv_bias/cudnn_conv.cpp b/dnn/src/cuda/conv_bias/cudnn_conv.cpp index 459c48bd..19109e74 100644 --- a/dnn/src/cuda/conv_bias/cudnn_conv.cpp +++ b/dnn/src/cuda/conv_bias/cudnn_conv.cpp @@ -56,99 +56,33 @@ bool ConvBiasForwardImpl::AlgoCUDNNConv::is_available(const SizeArgs& args) cons return status == CUDNN_STATUS_SUCCESS; } -WorkspaceBundle ConvBiasForwardImpl::AlgoCUDNNConv::get_workspace_bundle( - void* ptr, const SizeArgs& args) const { - auto dst_layout = *args.dst_layout; - SmallVector sizes; - if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) { - dst_layout.dtype = DType(); - args.opr->check_or_deduce_dtype_fwd( - args.src_layout->dtype, args.filter_layout->dtype, dst_layout.dtype); - sizes.push_back(dst_layout.span().dist_byte()); - } - - if (args.z_layout->ndim > 0 && - args.z_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { - auto z_layout = *args.z_layout; - z_layout.dtype = DType(); - args.opr->check_or_deduce_dtype_fwd( - args.src_layout->dtype, args.filter_layout->dtype, z_layout.dtype); - sizes.push_back(z_layout.span().dist_byte()); - } - - SizeArgs conv_args = args; - conv_args.dst_layout = &dst_layout; - +size_t ConvBiasForwardImpl::AlgoCUDNNConv::cudnn_get_workspace_in_bytes( + const SizeArgs& args) const { CUDNNForwardDescs D; - conv_args.init_conv_desc(D); + args.init_conv_desc(D); size_t conv_workspace_size; - auto status = cudnnGetConvolutionForwardWorkspaceSize( - conv_args.handle->cudnn_handle(), D.src_desc.desc, D.filter_desc.desc, - D.conv_desc.conv_desc, D.dst_desc.desc, m_cudnn_enum, &conv_workspace_size); - megdnn_assert( - status == CUDNN_STATUS_SUCCESS, - "conv fwd get workspace failed: %s; info: %s", cudnnGetErrorString(status), - args.to_string().c_str()); - sizes.insert(sizes.begin(), conv_workspace_size); - return {ptr, std::move(sizes)}; -} - -size_t ConvBiasForwardImpl::AlgoCUDNNConv::get_workspace_in_bytes( - const SizeArgs& args) const { - return get_workspace_bundle(nullptr, args).total_size_in_bytes(); + 
cudnn_check(cudnnGetConvolutionForwardWorkspaceSize( + args.handle->cudnn_handle(), D.src_desc.desc, D.filter_desc.desc, + D.conv_desc.conv_desc, D.dst_desc.desc, m_cudnn_enum, + &conv_workspace_size)); + return conv_workspace_size; } -void ConvBiasForwardImpl::AlgoCUDNNConv::exec(const ExecArgs& args) const { - auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args); - TensorND conv_dst_tensor = *args.dst_tensor; - if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { - conv_dst_tensor = TensorND{bundle.get(1), args.dst_tensor->layout}; - conv_dst_tensor.layout.dtype = DType(); - args.opr->check_or_deduce_dtype_fwd( - args.src_layout->dtype, args.filter_layout->dtype, - conv_dst_tensor.layout.dtype); - } - - ExecArgs conv_args = args; - conv_args.dst_tensor = &conv_dst_tensor; - conv_args.dst_layout = &conv_dst_tensor.layout; - - { - CUDNNForwardDescs D; - conv_args.init_conv_desc(D); - auto conv_workspace = bundle.get_workspace(0); - float alpha = 1.0f, beta = 0.0f; - auto status = cudnnConvolutionForward( - conv_args.handle->cudnn_handle(), &alpha, D.src_desc.desc, - conv_args.src_tensor->raw_ptr(), D.filter_desc.desc, - conv_args.filter_tensor->raw_ptr(), D.conv_desc.conv_desc, m_cudnn_enum, - conv_workspace.raw_ptr, conv_workspace.size, &beta, D.dst_desc.desc, - conv_args.dst_tensor->raw_ptr()); - megdnn_assert( - status == CUDNN_STATUS_SUCCESS, "conv fwd failed: %s; info: %s", - cudnnGetErrorString(status), conv_args.to_string().c_str()); - } - - if (args.z_layout->ndim > 0) { - auto z_tensor = *args.z_tensor; - if (args.z_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { - z_tensor = TensorND{bundle.get(2), args.z_tensor->layout}; - z_tensor.layout.dtype = DType(); - args.opr->check_or_deduce_dtype_fwd( - args.src_layout->dtype, args.filter_layout->dtype, - z_tensor.layout.dtype); - auto typecvt = args.handle->create_operator(); - typecvt->exec(*args.z_tensor, z_tensor); - } - auto add = args.handle->create_operator(); - add->param().mode = Elemwise::Param::Mode::ADD; - add->exec({conv_dst_tensor, z_tensor}, conv_dst_tensor); - } - - handle_bias_and_nonlinear( - args.handle, args.nonlinear_mode, &conv_dst_tensor, args.dst_tensor, - args.bias_tensor); +void ConvBiasForwardImpl::AlgoCUDNNConv::cudnn_execute( + const ExecArgs& args, const Workspace& workspace) const { + CUDNNForwardDescs D; + args.init_conv_desc(D); + float alpha = 1.0f, beta = 0.0f; + auto status = cudnnConvolutionForward( + args.handle->cudnn_handle(), &alpha, D.src_desc.desc, + args.src_tensor->raw_ptr(), D.filter_desc.desc, + args.filter_tensor->raw_ptr(), D.conv_desc.conv_desc, m_cudnn_enum, + workspace.raw_ptr, workspace.size, &beta, D.dst_desc.desc, + args.dst_tensor->raw_ptr()); + megdnn_assert( + status == CUDNN_STATUS_SUCCESS, "conv fwd failed: %s; info: %s", + cudnnGetErrorString(status), args.to_string().c_str()); } // vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/cudnn_conv_base.cpp b/dnn/src/cuda/conv_bias/cudnn_conv_base.cpp new file mode 100644 index 00000000..e1129347 --- /dev/null +++ b/dnn/src/cuda/conv_bias/cudnn_conv_base.cpp @@ -0,0 +1,87 @@ +/** + * \file dnn/src/cuda/conv_bias/cudnn_conv_base.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/common/conv_bias.h" +#include "src/cuda/conv_bias/algo.h" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace conv_bias; + +WorkspaceBundle ConvBiasForwardImpl::AlgoCUDNNConvBase::get_workspace_bundle( + void* ptr, const SizeArgs& args) const { + auto dst_layout = *args.dst_layout; + SmallVector sizes; + if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) { + dst_layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd( + args.src_layout->dtype, args.filter_layout->dtype, dst_layout.dtype); + sizes.push_back(dst_layout.span().dist_byte()); + } + + if (args.z_layout->ndim > 0 && + args.z_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { + auto z_layout = *args.z_layout; + z_layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd( + args.src_layout->dtype, args.filter_layout->dtype, z_layout.dtype); + sizes.push_back(z_layout.span().dist_byte()); + } + + SizeArgs conv_args = args; + conv_args.dst_layout = &dst_layout; + + size_t conv_workspace_size = cudnn_get_workspace_in_bytes(conv_args); + + sizes.insert(sizes.begin(), conv_workspace_size); + return {ptr, std::move(sizes)}; +} + +void ConvBiasForwardImpl::AlgoCUDNNConvBase::exec(const ExecArgs& args) const { + auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args); + TensorND conv_dst_tensor = *args.dst_tensor; + if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { + conv_dst_tensor = TensorND{bundle.get(1), args.dst_tensor->layout}; + conv_dst_tensor.layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd( + args.src_layout->dtype, args.filter_layout->dtype, + conv_dst_tensor.layout.dtype); + } + + ExecArgs conv_args = args; + conv_args.dst_tensor = &conv_dst_tensor; + conv_args.dst_layout = &conv_dst_tensor.layout; + + cudnn_execute(conv_args, bundle.get_workspace(0)); + + if (args.z_layout->ndim > 0) { + auto z_tensor = *args.z_tensor; + if (args.z_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { + z_tensor = TensorND{bundle.get(2), args.z_tensor->layout}; + z_tensor.layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd( + args.src_layout->dtype, args.filter_layout->dtype, + z_tensor.layout.dtype); + auto typecvt = args.handle->create_operator(); + typecvt->exec(*args.z_tensor, z_tensor); + } + auto add = args.handle->create_operator(); + add->param().mode = Elemwise::Param::Mode::ADD; + add->exec({conv_dst_tensor, z_tensor}, conv_dst_tensor); + } + + handle_bias_and_nonlinear( + args.handle, args.nonlinear_mode, &conv_dst_tensor, args.dst_tensor, + args.bias_tensor); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp b/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp index cc4bb1f5..cc4bc191 100644 --- a/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp +++ b/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp @@ -124,10 +124,10 @@ bool ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::is_available( // forbits sigmoid for quantized if (args.src_layout->dtype.category() == DTypeCategory::QUANTIZED) return false; - MEGDNN_FALLTHRU // XXX: why? - case param::ConvBias::NonlineMode::IDENTITY - : if (args.src_layout->dtype.category() == - DTypeCategory::QUANTIZED) break; + MEGDNN_FALLTHRU; // XXX: why? 
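// ---- [editor's sketch] ------------------------------------------------------
// Slot layout produced by AlgoCUDNNConvBase::get_workspace_bundle above:
// slot 0 is always the cudnn scratch buffer; slots 1 and 2 exist only when
// dst (and z) need a dtype-converted staging copy. Plain-C++ mirror of that
// size computation (hypothetical helper, not in the patch):
#include <cstddef>
#include <vector>

std::vector<std::size_t> bundle_sizes_sketch(
        std::size_t cudnn_ws, std::size_t dst_bytes, std::size_t z_bytes,
        bool convert_dst, bool convert_z) {
    std::vector<std::size_t> sizes;
    if (convert_dst)
        sizes.push_back(dst_bytes);         // bundle.get(1): typed dst staging
    if (convert_z)
        sizes.push_back(z_bytes);           // bundle.get(2): typed z staging
    sizes.insert(sizes.begin(), cudnn_ws);  // bundle.get_workspace(0): cudnn scratch
    return sizes;
}
// -----------------------------------------------------------------------------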
+ case param::ConvBias::NonlineMode::IDENTITY: + if (args.src_layout->dtype.category() == DTypeCategory::QUANTIZED) + break; if (m_cudnn_enum != CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) { // cudnn require algo to // CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM @@ -149,7 +149,7 @@ bool ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::is_available( return status == CUDNN_STATUS_SUCCESS; } -size_t ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::get_workspace_in_bytes( +size_t ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::cudnn_get_workspace_in_bytes( const SizeArgs& args) const { CUDNNForwardDescs D; @@ -162,85 +162,18 @@ size_t ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::get_workspace_in_bytes( status == CUDNN_STATUS_SUCCESS, "conv fwd get workspace failed: %s; info: %s", cudnnGetErrorString(status), args.to_string().c_str()); - if (args.bias_layout && args.bias_layout->dtype != dtype::Float32() && - args.src_layout->dtype.category() != DTypeCategory::FLOAT) { - // cudnn require bias to be float when executing CONFIG_INT - // convert bias to float if bias is not float at first - workspace_size += sizeof(float) * args.bias_layout->span().dist_elem(); - } return workspace_size; } -void ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::exec( - const ExecArgs& args) const { +void ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::cudnn_execute( + const ExecArgs& args, const Workspace& workspace, float alpha, + float beta) const { #if CUDNN_MAJOR < 7 megdnn_throw("ConvBias require cudnn 7.0 or higher"); #else megdnn_assert(cudnnGetVersion() >= 7401); CUDNNForwardDescs D; args.init_conv_bias_desc(D); - float alpha = 1.0f, beta = 0.0f; - if (args.z_layout->ndim > 0) - beta = 1.0f; - - auto get_scale = [](const DType& dtype) -> float { - megdnn_assert(dtype.category() == DTypeCategory::QUANTIZED); - switch (dtype.enumv()) { -#define cb(_dt) \ - case DTypeTrait<_dt>::enumv: \ - return dtype.param<_dt>().scale; - MEGDNN_FOREACH_QUANTIZED_DTYPE(cb) -#undef cb - default: - megdnn_assert_internal(0); - } - }; - - auto src_dtype = args.src_layout->dtype, filter_dtype = args.filter_layout->dtype, - dst_dtype = args.dst_layout->dtype; - megdnn_assert( - (src_dtype.category() == dst_dtype.category()) || - (src_dtype.enumv() == DTypeEnum::QuantizedS8 && - dst_dtype.enumv() == DTypeEnum::Float32)); - megdnn_assert(src_dtype.category() == filter_dtype.category()); - - if (args.src_layout->dtype.category() == DTypeCategory::QUANTIZED) { - auto expected_bias_scale = get_scale(args.src_layout->dtype) * - get_scale(args.filter_layout->dtype); - alpha = expected_bias_scale; - if (args.dst_layout->dtype.category() == DTypeCategory::QUANTIZED) - alpha /= get_scale(args.dst_layout->dtype); - if (args.z_layout->ndim > 0 && - args.z_layout->dtype.category() == DTypeCategory::QUANTIZED) { - beta = get_scale(args.z_layout->dtype) / get_scale(args.dst_layout->dtype); - } - if (args.bias_layout->dtype.category() == DTypeCategory::QUANTIZED) { - megdnn_assert( - fabs(expected_bias_scale - get_scale(args.bias_layout->dtype)) < - 1e-4); - } - } - - auto workspace_ptr = args.workspace.raw_ptr; - auto workspace_size = args.workspace.size; - auto bias_ptr = args.bias_tensor->raw_ptr(); - if (args.bias_layout && args.bias_layout->dtype != dtype::Float32() && - args.src_layout->dtype.category() != DTypeCategory::FLOAT) { - auto cvt = args.handle->create_operator(); - auto float_bias_layout = *args.bias_layout; - auto converted_bias_layout = *args.bias_layout; - converted_bias_layout.dtype = 
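// ---- [editor's worked example] ------------------------------------------------
// Context for the bias-conversion block being removed here (it now lives in
// AlgoCUDNNConvBiasActivationBase::exec): cudnn computes
//     y = act(alpha * conv(x, w) + beta * z + bias)
// and requires a float bias for the int8 configs. Reinterpreting the int32
// bias as QuantizedS32(alpha) before TypeCvt dequantizes it to
// bias_int * alpha = bias_int * (s_x * s_w) / s_y, i.e. the bias lands in y's
// scale exactly when s_bias == s_x * s_w (which is asserted). Arithmetic only:
float dequantized_bias_sketch(int bias_int, float s_x, float s_w, float s_y) {
    float alpha = s_x * s_w / s_y;  // same alpha later passed to cudnn
    return bias_int * alpha;        // e.g. bias_int = 40, 0.1 * 0.2 / 0.4 -> 2.0f
}
// --------------------------------------------------------------------------------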
dtype::QuantizedS32(alpha); - float_bias_layout.dtype = dtype::Float32(); - auto bias_size_in_bytes = float_bias_layout.span().dist_byte(); - megdnn_assert(args.workspace.size >= bias_size_in_bytes); - cvt->exec( - {args.bias_tensor->raw_ptr(), converted_bias_layout}, - TensorND{workspace_ptr, float_bias_layout}); - - bias_ptr = workspace_ptr; - workspace_ptr += bias_size_in_bytes; - workspace_size -= bias_size_in_bytes; - } cudnnStatus_t status; if (args.z_layout->ndim == 0) { @@ -248,55 +181,23 @@ void ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::exec( args.handle->cudnn_handle(), &alpha, D.src_desc.desc, args.src_tensor->raw_ptr(), D.filter_desc.desc, args.filter_tensor->raw_ptr(), D.conv_desc.conv_desc, m_cudnn_enum, - workspace_ptr, workspace_size, &beta, D.dst_desc.desc, - args.dst_tensor->raw_ptr(), D.bias_desc.desc, bias_ptr, - D.conv_desc.act_desc, D.dst_desc.desc, args.dst_tensor->raw_ptr()); + workspace.raw_ptr, workspace.size, &beta, D.dst_desc.desc, + args.dst_tensor->raw_ptr(), D.bias_desc.desc, + args.bias_tensor->raw_ptr(), D.conv_desc.act_desc, D.dst_desc.desc, + args.dst_tensor->raw_ptr()); } else { status = cudnnConvolutionBiasActivationForward( args.handle->cudnn_handle(), &alpha, D.src_desc.desc, args.src_tensor->raw_ptr(), D.filter_desc.desc, args.filter_tensor->raw_ptr(), D.conv_desc.conv_desc, m_cudnn_enum, - workspace_ptr, workspace_size, &beta, D.z_desc.desc, - args.z_tensor->raw_ptr(), D.bias_desc.desc, bias_ptr, + workspace.raw_ptr, workspace.size, &beta, D.z_desc.desc, + args.z_tensor->raw_ptr(), D.bias_desc.desc, args.bias_tensor->raw_ptr(), D.conv_desc.act_desc, D.dst_desc.desc, args.dst_tensor->raw_ptr()); } megdnn_assert( status == CUDNN_STATUS_SUCCESS, "conv fwd failed: %s; info: %s, algo %s", cudnnGetErrorString(status), args.to_string().c_str(), name()); - // Noline - switch (args.nonlinear_mode) { - case param::ConvBias::NonlineMode::RELU: - break; - case param::ConvBias::NonlineMode::SIGMOID: { - megdnn_assert( - args.dst_layout->dtype.category() != DTypeCategory::QUANTIZED); - auto&& elem_opr = args.handle->create_operator(); - elem_opr->param().mode = Elemwise::Param::Mode::SIGMOID; - elem_opr->exec({*(args.dst_tensor)}, *(args.dst_tensor)); - break; - } - case param::ConvBias::NonlineMode::IDENTITY: - break; - case param::ConvBias::NonlineMode::H_SWISH: { - megdnn_assert( - args.dst_layout->dtype.category() == DTypeCategory::QUANTIZED || - (args.dst_layout->dtype.category() == DTypeCategory::FLOAT && - args.opr->param().format == param::ConvBias::Format::NCHW4_NCHW)); - if (args.dst_layout->dtype.category() == DTypeCategory::QUANTIZED) { - auto&& elem_opr = args.handle->create_operator(); - elem_opr->param().mode = ElemwiseMultiType::Param::Mode::QH_SWISH; - elem_opr->exec({*(args.dst_tensor)}, *(args.dst_tensor)); - } else { - auto&& elem_opr = args.handle->create_operator(); - elem_opr->param().mode = ElemwiseForward::Param::Mode::H_SWISH; - elem_opr->exec({*(args.dst_tensor)}, *(args.dst_tensor)); - } - break; - } - default: - megdnn_throw("unsupported NonlineMode"); - } #endif } diff --git a/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation_base.cpp b/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation_base.cpp new file mode 100644 index 00000000..e3de6360 --- /dev/null +++ b/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation_base.cpp @@ -0,0 +1,210 @@ +/** + * \file dnn/src/cuda/conv_bias/cudnn_conv_bias_activation_base.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii 
Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "megdnn/oprs/general.h" + +#include "./algo.h" + +#include "src/common/conv_bias.h" +#include "src/cuda/conv_bias/helper.h" +#include "src/cuda/cudnn_wrapper.h" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace conv_bias; + +size_t ConvBiasForwardImpl::AlgoCUDNNConvBiasActivationBase::get_workspace_in_bytes( + const SizeArgs& args) const { + auto workspace_size = cudnn_get_workspace_in_bytes(args); + + auto&& param = args.opr->param(); + if (args.preprocessed_filter == nullptr) { + if (args.bias_layout && args.bias_layout->dtype != dtype::Float32() && + args.src_layout->dtype.category() != DTypeCategory::FLOAT) { + // cudnn require bias to be float when executing CONFIG_INT + // convert bias to float if bias is not float at first + workspace_size += sizeof(float) * args.bias_layout->span().dist_elem(); + } + if (param.format == param::ConvBias::Format::NCHW32) { + workspace_size += args.filter_layout->span().dist_byte() + + args.bias_layout->span().dist_byte(); + } + } + return workspace_size; +} + +void ConvBiasForwardImpl::AlgoCUDNNConvBiasActivationBase::exec( + const ExecArgs& args) const { + float alpha, beta; + std::tie(alpha, beta) = cudnn_get_conv_bias_act_scale_param( + args.src_tensor->layout, args.dst_tensor->layout, + args.filter_tensor->layout, args.bias_tensor->layout, + args.z_tensor->layout); + + auto workspace_ptr = args.workspace.raw_ptr; + auto workspace_size = args.workspace.size; + auto bias_ptr = args.bias_tensor->raw_ptr(); + TensorND filter_tensor; + TensorND bias_tensor; + + auto&& param = args.opr->param(); + if (args.preprocessed_filter != nullptr) { + bias_tensor = TensorND{ + args.bias_tensor->layout, + args.preprocessed_filter->tensors[0].raw_ptr()}; + if (param.format == Param::Format::NCHW32) { + megdnn_assert(args.preprocessed_filter->tensors.size() == 2); + filter_tensor = TensorND{ + args.filter_tensor->layout, + args.preprocessed_filter->tensors[1].raw_ptr()}; + } else { + filter_tensor = *args.filter_tensor; + } + } else { + if (args.bias_layout && args.bias_layout->dtype != dtype::Float32() && + args.src_layout->dtype.category() != DTypeCategory::FLOAT) { + auto cvt = args.handle->create_operator(); + auto float_bias_layout = *args.bias_layout; + auto converted_bias_layout = *args.bias_layout; + converted_bias_layout.dtype = dtype::QuantizedS32(alpha); + float_bias_layout.dtype = dtype::Float32(); + auto bias_size_in_bytes = float_bias_layout.span().dist_byte(); + megdnn_assert(args.workspace.size >= bias_size_in_bytes); + cvt->exec( + {args.bias_tensor->raw_ptr(), converted_bias_layout}, + TensorND{workspace_ptr, float_bias_layout}); + + bias_ptr = workspace_ptr; + workspace_ptr += bias_size_in_bytes; + workspace_size -= bias_size_in_bytes; + } + if (param.format == Param::Format::NCHW32) { + size_t reorder_workspace_size = + args.filter_tensor->layout.span().dist_byte() + + args.bias_tensor->layout.span().dist_byte(); + auto reorder_filter_ptr = workspace_ptr; + auto reorder_bias_ptr = + workspace_ptr + args.filter_tensor->layout.span().dist_byte(); + cudnn_reorder_filer_and_bias_nchw32( + cudnn_handle(args.opr->handle()), args.filter_tensor->raw_ptr(), + args.filter_meta, bias_ptr, reorder_filter_ptr, reorder_bias_ptr); + filter_tensor = 
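// ---- [editor's note] ------------------------------------------------------------
// The NCHW32 branch here pre-shuffles weights and bias for cudnn's int8x32
// IMMA kernels via the cudnn_reorder_filer_and_bias_nchw32 helper added in
// helper.cpp later in this patch, which wraps the cuDNN entry point:
//
//     cudnn_check(cudnnReorderFilterAndBias(
//             handle, filter_desc, CUDNN_DEFAULT_REORDER,
//             filter_ptr, reordered_filter_ptr,
//             /*reorderBias=*/1, bias_ptr, reordered_bias_ptr));
//
// In the exec path the reordered copies live in the workspace; in the
// preprocess path they are written into preprocessed_filter->tensors, so the
// shuffle cost is paid once per weight update rather than once per inference.
// -----------------------------------------------------------------------------------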
TensorND(args.filter_tensor->layout, reorder_filter_ptr); + bias_ptr = reorder_bias_ptr; + workspace_ptr += reorder_workspace_size; + workspace_size -= reorder_workspace_size; + } else { + filter_tensor = *args.filter_tensor; + } + } + + bias_tensor = TensorND{args.bias_tensor->layout, bias_ptr}; + ExecArgs exec_args{ + const_cast<ConvBiasForwardImpl*>(args.opr), + *args.src_tensor, + filter_tensor, + bias_tensor, + *args.z_tensor, + *args.dst_tensor, + args.workspace}; + Workspace cudnn_workspace{workspace_ptr, workspace_size}; + cudnn_execute(exec_args, cudnn_workspace, alpha, beta); + + // Nonlinearity: apply the activations cudnn does not fuse + switch (args.nonlinear_mode) { + case param::ConvBias::NonlineMode::RELU: + break; + case param::ConvBias::NonlineMode::SIGMOID: { + megdnn_assert( + args.dst_layout->dtype.category() != DTypeCategory::QUANTIZED); + auto&& elem_opr = args.handle->create_operator<ElemwiseForward>(); + elem_opr->param().mode = Elemwise::Param::Mode::SIGMOID; + elem_opr->exec({*(args.dst_tensor)}, *(args.dst_tensor)); + break; + } + case param::ConvBias::NonlineMode::IDENTITY: + break; + case param::ConvBias::NonlineMode::H_SWISH: { + megdnn_assert( + args.dst_layout->dtype.category() == DTypeCategory::QUANTIZED || + (args.dst_layout->dtype.category() == DTypeCategory::FLOAT && + args.opr->param().format == param::ConvBias::Format::NCHW4_NCHW)); + if (args.dst_layout->dtype.category() == DTypeCategory::QUANTIZED) { + auto&& elem_opr = args.handle->create_operator<ElemwiseMultiType>(); + elem_opr->param().mode = ElemwiseMultiType::Param::Mode::QH_SWISH; + elem_opr->exec({*(args.dst_tensor)}, *(args.dst_tensor)); + } else { + auto&& elem_opr = args.handle->create_operator<ElemwiseForward>(); + elem_opr->param().mode = ElemwiseForward::Param::Mode::H_SWISH; + elem_opr->exec({*(args.dst_tensor)}, *(args.dst_tensor)); + } + break; + } + default: + megdnn_throw("unsupported NonlineMode"); + } +} + +size_t ConvBiasForwardImpl::AlgoCUDNNConvBiasActivationBase:: + get_preprocess_workspace_in_bytes(const SizeArgs& args) const { + auto&& param = args.opr->param(); + if (param.format == Param::Format::NCHW32) { + return args.bias_layout->span().dist_byte(); + } + return 0_z; +} + +SmallVector<TensorLayout> ConvBiasForwardImpl::AlgoCUDNNConvBiasActivationBase:: + deduce_preprocessed_filter_layout(const SizeArgs& args) const { + auto&& param = args.opr->param(); + if (param.format == Param::Format::NCHW32) { + return {args.bias_layout->collapse_contiguous(), + args.filter_layout->collapse_contiguous()}; + } else { + return {args.bias_layout->collapse_contiguous()}; + } +} + +void ConvBiasForwardImpl::AlgoCUDNNConvBiasActivationBase::exec_preprocess( + const ExecArgs& args) const { + float alpha, beta; + std::tie(alpha, beta) = cudnn_get_conv_bias_act_scale_param( + args.src_tensor->layout, args.dst_tensor->layout, + args.filter_tensor->layout, args.bias_tensor->layout, + args.z_tensor->layout); + MEGDNN_MARK_USED_VAR(beta); + + auto workspace_ptr = args.workspace.raw_ptr; + auto workspace_size = args.workspace.size; + auto bias_ptr = workspace_size > 0 ?
workspace_ptr + : args.preprocessed_filter->tensors[0].raw_ptr(); + if (args.bias_layout && args.bias_layout->dtype != dtype::Float32() && + args.src_layout->dtype.category() != DTypeCategory::FLOAT) { + auto cvt = args.handle->create_operator(); + auto float_bias_layout = *args.bias_layout; + auto converted_bias_layout = *args.bias_layout; + converted_bias_layout.dtype = dtype::QuantizedS32(alpha); + float_bias_layout.dtype = dtype::Float32(); + + cvt->exec( + {args.bias_tensor->raw_ptr(), converted_bias_layout}, + TensorND{bias_ptr, float_bias_layout}); + } + if (args.opr->param().format == Param::Format::NCHW32) { + auto reorder_filter_ptr = args.preprocessed_filter->tensors[1].raw_ptr(); + auto reorder_bias_ptr = args.preprocessed_filter->tensors[0].raw_ptr(); + cudnn_reorder_filer_and_bias_nchw32( + cudnn_handle(args.opr->handle()), args.filter_tensor->raw_ptr(), + args.filter_meta, bias_ptr, reorder_filter_ptr, reorder_bias_ptr); + } +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation_v8.cpp b/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation_v8.cpp new file mode 100644 index 00000000..b4ba892a --- /dev/null +++ b/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation_v8.cpp @@ -0,0 +1,145 @@ +/** + * \file dnn/src/cuda/conv_bias/cudnn_conv_bias_activation_v8.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "megdnn/oprs/general.h" + +#include "./algo.h" + +#include "src/common/conv_bias.h" +#include "src/cuda/cudnn_wrapper_v8.h" +#include "src/cuda/utils.h" + +#if CUDNN_VERSION >= 8004 +using namespace megdnn; +using namespace cuda; +using namespace conv_bias; + +namespace { +TensorLayout canonical_bias_layout( + const TensorLayout& bias_layout, const param::ConvBias::Format format) { + int64_t vector_count, vector_dimension; + std::tie(vector_count, vector_dimension) = get_vector_count_and_dimension(format); + size_t channel = bias_layout[vector_dimension] * vector_count; + if (bias_layout.dtype.category() != DTypeCategory::FLOAT) { + return TensorLayout{{1, channel, 1, 1}, dtype::Float32()}; + } + return TensorLayout{{1, channel, 1, 1}, bias_layout.dtype}; +} +} // namespace + +bool ConvBiasForwardImpl::AlgoCUDNNConvBiasActivationV8::is_available( + const SizeArgs& args) const { + auto&& param = args.opr->param(); + if (param.format == param::ConvBias::Format::NCHW4_NCHW32 || + param.format == param::ConvBias::Format::NCHW32_NCHW4 || + param.format == param::ConvBias::Format::NCHW4_NCHW || + param.format == param::ConvBias::Format::NCHW8 || + param.format == param::ConvBias::Format::NCHW64 || + param.format == param::ConvBias::Format::CHWN4) + return false; + if (param.format != Param::Format::NCHW && param.format != Param::Format::NHWC) { + if (!args.src_layout->is_contiguous() || !args.dst_layout->is_contiguous()) { + return false; + } + } + if ((args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS4 || + args.src_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) && + args.filter_layout->dtype.enumv() == DTypeEnum::QuantizedS4) + return false; + if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS4 || + args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) + return false; + if 
(args.src_layout->dtype == args.filter_layout->dtype && + args.src_layout->dtype == dtype::BFloat16()) { + return false; + } + + if (args.bias_layout->ndim == 0 || + !check_bias_share_in_channel(*(args.bias_layout), param.format)) { + return false; + } + + // FIXME: cudnn cannot handle the case when the initial value of dst tensor + // contains nan and beta is zero, because the result of 0.f * nan is still + // nan + if (args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS8 && + args.dst_layout->dtype.enumv() == DTypeEnum::Float32 && + param.format == param::ConvBias::Format::NCHW) { + return false; + } + + if (param.format == param::ConvBias::Format::NCHW32) { + // sm version: NCHW32 imma kernels need compute capability 7.5 or higher + auto&& device_prop = current_device_prop(); + if (device_prop.major < 7 || (device_prop.major == 7 && device_prop.minor < 5)) + return false; + } + + switch (args.nonlinear_mode) { + case param::ConvBias::NonlineMode::RELU: + case param::ConvBias::NonlineMode::IDENTITY: + break; + case param::ConvBias::NonlineMode::SIGMOID: + // forbid sigmoid for quantized + if (args.src_layout->dtype.category() == DTypeCategory::QUANTIZED) + return false; + break; + case param::ConvBias::NonlineMode::H_SWISH: + if (args.src_layout->dtype.category() == DTypeCategory::QUANTIZED) + break; + return false; + default: + megdnn_throw("unsupported NonlineMode"); + } + + auto bias_layout = + canonical_bias_layout(*args.bias_layout, args.opr->param().format); + auto plan = get_heuristic_plan_from_opr( + static_cast<const ConvBiasForward*>(args.opr), *args.src_layout, + *args.dst_layout, *args.filter_layout, bias_layout, *args.z_layout, + args.filter_meta); + return plan != nullptr; +} + +size_t ConvBiasForwardImpl::AlgoCUDNNConvBiasActivationV8::cudnn_get_workspace_in_bytes( + const SizeArgs& args) const { + auto bias_layout = + canonical_bias_layout(*args.bias_layout, args.opr->param().format); + auto plan = get_heuristic_plan_from_opr( + static_cast<const ConvBiasForward*>(args.opr), *args.src_layout, + *args.dst_layout, *args.filter_layout, bias_layout, *args.z_layout, + args.filter_meta); + megdnn_assert( + plan != nullptr, "algo(%s) cannot find execution from heuristics", name()); + return plan->getWorkspaceSize(); +} + +void ConvBiasForwardImpl::AlgoCUDNNConvBiasActivationV8::cudnn_execute( + const ExecArgs& args, const Workspace& workspace, float alpha, + float beta) const { + auto&& bias_layout = + canonical_bias_layout(args.bias_tensor->layout, args.opr->param().format); + auto plan = get_heuristic_plan_from_opr( + static_cast<const ConvBiasForward*>(args.opr), args.src_tensor->layout, + args.dst_tensor->layout, args.filter_tensor->layout, bias_layout, + args.z_tensor->layout, args.filter_meta); + megdnn_assert( + plan != nullptr, "algo(%s) cannot find execution from heuristics", name()); + auto&& handle = cudnn_handle(args.handle); + TensorND bias_tensor{args.bias_tensor->raw_ptr(), bias_layout}; + run_conv_bias_act_with_plan( + handle, *plan, *args.src_tensor, *args.dst_tensor, *args.filter_tensor, + bias_tensor, *args.z_tensor, workspace); +} + +#endif + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/cudnn_conv_v8.cpp b/dnn/src/cuda/conv_bias/cudnn_conv_v8.cpp new file mode 100644 index 00000000..df41c7d8 --- /dev/null +++ b/dnn/src/cuda/conv_bias/cudnn_conv_v8.cpp @@ -0,0 +1,98 @@ +/** + * \file dnn/src/cuda/conv_bias/cudnn_conv_v8.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/common/conv_bias.h" +#include "src/cuda/conv_bias/algo.h" +#include "src/cuda/cudnn_wrapper_v8.h" +#include "src/cuda/utils.h" + +#if CUDNN_VERSION >= 8004 +using namespace megdnn; +using namespace cuda; +using namespace conv_bias; + +bool ConvBiasForwardImpl::AlgoCUDNNConvV8::is_available(const SizeArgs& args) const { + if (args.filter_meta.format != Param::Format::NCHW && + args.filter_meta.format != Param::Format::NHWC) { + if (!args.src_layout->is_contiguous() || !args.dst_layout->is_contiguous()) { + return false; + } + } + + if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS4 || + args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) { + return false; + } + + // FIXME: cudnn cannot handle the case when the initial value of dst tensor + // contains nan and beta is zero, because the result of 0.f * nan is still + // nan + if (args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS8 && + args.dst_layout->dtype.enumv() == DTypeEnum::Float32 && + args.opr->param().format == param::ConvBias::Format::NCHW) { + return false; + } + + auto dst_layout = *args.dst_layout; + if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) { + dst_layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd( + args.src_layout->dtype, args.filter_layout->dtype, dst_layout.dtype); + } + SizeArgs conv_args = args; + conv_args.dst_layout = &dst_layout; + + if (!is_cudnn_supported(conv_args)) + return false; + + auto conv_opr = args.handle->create_operator(); + conv_opr->param() = get_param_convolution(args); + ConvolutionForward::CanonizedFilterMeta fm; + fm.copy_from(args.filter_meta); + auto plan = get_heuristic_plan_from_opr( + conv_opr.get(), *conv_args.src_layout, *conv_args.dst_layout, + *conv_args.filter_layout, {}, {}, fm); + return plan != nullptr; +} + +size_t ConvBiasForwardImpl::AlgoCUDNNConvV8::cudnn_get_workspace_in_bytes( + const SizeArgs& args) const { + auto conv_opr = args.handle->create_operator(); + conv_opr->param() = get_param_convolution(args); + ConvolutionForward::CanonizedFilterMeta fm; + fm.copy_from(args.filter_meta); + auto plan = get_heuristic_plan_from_opr( + conv_opr.get(), *args.src_layout, *args.dst_layout, *args.filter_layout, {}, + {}, fm); + megdnn_assert( + plan != nullptr, "algo(%s) cannot find execution from heuristics", name()); + return plan->getWorkspaceSize(); +} + +void ConvBiasForwardImpl::AlgoCUDNNConvV8::cudnn_execute( + const ExecArgs& args, const Workspace& workspace) const { + auto conv_opr = args.handle->create_operator(); + conv_opr->param() = get_param_convolution(args); + ConvolutionForward::CanonizedFilterMeta fm; + fm.copy_from(args.filter_meta); + auto plan = get_heuristic_plan_from_opr( + conv_opr.get(), args.src_tensor->layout, args.dst_tensor->layout, + args.filter_tensor->layout, {}, {}, fm); + megdnn_assert( + plan != nullptr, "algo(%s) cannot find execution from heuristics", name()); + auto&& handle = cudnn_handle(args.handle); + run_single_conv_with_plan( + handle, *plan, *args.src_tensor, *args.dst_tensor, *args.filter_tensor, + workspace); +} +#endif + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/helper.cpp b/dnn/src/cuda/conv_bias/helper.cpp index 6ecf0d2b..1a9d529e 100644 --- a/dnn/src/cuda/conv_bias/helper.cpp +++ b/dnn/src/cuda/conv_bias/helper.cpp @@ 
-197,8 +197,60 @@ void flip_filter( ref_ptr.reset(workspace.raw_ptr); } -} // namespace conv_bias +std::pair cudnn_get_conv_bias_act_scale_param( + const TensorLayout& x, const TensorLayout& y, const TensorLayout& w, + const TensorLayout& b, const TensorLayout& z) { + float alpha = 1.f, beta = 0.f; + if (z.ndim > 0) + beta = 1.f; + + auto get_scale = [](const DType& dtype) -> float { + megdnn_assert(dtype.category() == DTypeCategory::QUANTIZED); + switch (dtype.enumv()) { +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: \ + return dtype.param<_dt>().scale; + MEGDNN_FOREACH_QUANTIZED_DTYPE(cb) +#undef cb + default: + megdnn_assert_internal(0); + } + }; + auto x_dtype = x.dtype, y_dtype = y.dtype, w_dtype = w.dtype; + megdnn_assert( + (x_dtype.category() == y_dtype.category()) || + (x_dtype.enumv() == DTypeEnum::QuantizedS8 && + y_dtype.enumv() == DTypeEnum::Float32)); + megdnn_assert(x_dtype.category() == w_dtype.category()); + + if (x_dtype.category() == DTypeCategory::QUANTIZED) { + auto expected_bias_scale = get_scale(x_dtype) * get_scale(w_dtype); + alpha = expected_bias_scale; + if (y_dtype.category() == DTypeCategory::QUANTIZED) + alpha /= get_scale(y_dtype); + if (z.ndim > 0 && z.dtype.category() == DTypeCategory::QUANTIZED) { + beta = get_scale(z.dtype) / get_scale(y_dtype); + } + if (b.dtype.category() == DTypeCategory::QUANTIZED) { + megdnn_assert(fabs(expected_bias_scale - get_scale(b.dtype)) < 1e-4); + } + } + return {alpha, beta}; +} + +void cudnn_reorder_filer_and_bias_nchw32( + const cudnnHandle_t& handle, const void* filter_ptr, + const CanonizedFilterMeta& fm, const void* bias_ptr, void* reordered_filter_ptr, + void* reordered_bias_ptr) { + FilterDesc filter_desc; + filter_desc.set(fm); + int reorder_bias = bias_ptr != nullptr; + cudnn_check(cudnnReorderFilterAndBias( + handle, filter_desc.desc, CUDNN_DEFAULT_REORDER, filter_ptr, + reordered_filter_ptr, reorder_bias, bias_ptr, reordered_bias_ptr)); +} +} // namespace conv_bias } // namespace cuda } // namespace megdnn diff --git a/dnn/src/cuda/conv_bias/helper.h b/dnn/src/cuda/conv_bias/helper.h index f28008f4..5a4b18b2 100644 --- a/dnn/src/cuda/conv_bias/helper.h +++ b/dnn/src/cuda/conv_bias/helper.h @@ -113,6 +113,15 @@ struct CUDNNForwardDescs { } }; +std::pair cudnn_get_conv_bias_act_scale_param( + const TensorLayout& x, const TensorLayout& y, const TensorLayout& w, + const TensorLayout& b, const TensorLayout& z); + +void cudnn_reorder_filer_and_bias_nchw32( + const cudnnHandle_t& handle, const void* filter_ptr, + const CanonizedFilterMeta& fm, const void* bias_ptr, void* reordered_filter_ptr, + void* reordered_bias_ptr); + } // namespace conv_bias } // namespace cuda } // namespace megdnn diff --git a/dnn/src/cuda/conv_bias/opr_impl.cpp b/dnn/src/cuda/conv_bias/opr_impl.cpp index 2c88af5f..8f67ffeb 100644 --- a/dnn/src/cuda/conv_bias/opr_impl.cpp +++ b/dnn/src/cuda/conv_bias/opr_impl.cpp @@ -47,6 +47,17 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic( const AlgoAttribute& positive_attr, const AlgoAttribute& negative_attr) { using namespace conv_bias; AlgoBase::SizeArgs args{this, src, filter, bias, z, dst}; +#if CUDNN_VERSION >= 8004 + if (sm_algo_pack.cudnn_conv_v8.is_available_attribute( + args, positive_attr, negative_attr, workspace_limit_in_bytes)) { + return &sm_algo_pack.cudnn_conv_v8; + } + if (sm_algo_pack.cudnn_conv_bias_activation_v8.is_available_attribute( + args, positive_attr, negative_attr, workspace_limit_in_bytes)) { + return &sm_algo_pack.cudnn_conv_bias_activation_v8; + } 
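// ---- [editor's worked example] ----------------------------------------------------
// cudnn_get_conv_bias_act_scale_param above, for a fully quantized conv with
// s_x = 0.5, s_w = 0.25, s_z = 0.4, s_y = 0.2, returns
//     alpha = s_x * s_w / s_y = 0.625   (rescales the conv accumulator into y)
//     beta  = s_z / s_y       = 2.0     (rescales z into y; 0 when z is absent)
// and asserts that the bias scale equals s_x * s_w = 0.125. Arithmetic only:
#include <utility>

std::pair<float, float> scale_param_sketch(float s_x, float s_w, float s_z, float s_y) {
    float alpha = (s_x * s_w) / s_y;
    float beta = s_z / s_y;  // only when a quantized z input exists
    return {alpha, beta};
}
// -------------------------------------------------------------------------------------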
+#endif + auto dst_layout = *args.dst_layout; if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) { dst_layout.dtype = DType(); diff --git a/dnn/src/cuda/conv_bias/opr_impl.h b/dnn/src/cuda/conv_bias/opr_impl.h index 3ec528ad..2bb5eaeb 100644 --- a/dnn/src/cuda/conv_bias/opr_impl.h +++ b/dnn/src/cuda/conv_bias/opr_impl.h @@ -1,6 +1,7 @@ #pragma once #include "../elemwise/opr_impl.h" #include "megdnn/oprs.h" +#include "src/cuda/cudnn_with_check.h" namespace megdnn { namespace cuda { @@ -65,6 +66,12 @@ public: // The following algorithms are suitable for channel wise convolution class AlgoFloat32NCHWFMAImplicitBatchedGemm; class AlgoFloat16NCHWHMMAImplicitBatchedGemm; + class AlgoCUDNNConvBase; + class AlgoCUDNNConvBiasActivationBase; +#if CUDNN_VERSION > 8004 + class AlgoCUDNNConvV8; + class AlgoCUDNNConvBiasActivationV8; +#endif class AlgoPack; diff --git a/dnn/src/cuda/cudnn_wrapper_v8.cpp b/dnn/src/cuda/cudnn_wrapper_v8.cpp new file mode 100644 index 00000000..744eb8f7 --- /dev/null +++ b/dnn/src/cuda/cudnn_wrapper_v8.cpp @@ -0,0 +1,685 @@ +/** + * \file dnn/src/cuda/cudnn_wrapper_v8.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/cuda/cudnn_wrapper_v8.h" +#include "src/cuda/cudnn_wrapper.h" + +#include "src/common/utils.h" +#include "src/cuda/utils.h" + +#include "src/cuda/conv_bias/helper.h" + +#include "cudnn_frontend_EngineConfigGenerator.h" + +#include "megdnn/heuristic_cache.h" + +using namespace megdnn; +using namespace cuda; + +// helper functions for underlying descriptors +namespace { +cudnnDataType_t get_cudnn_data_type(DType type) { + switch (type.enumv()) { + case DTypeEnum::Float32: + return CUDNN_DATA_FLOAT; + case DTypeEnum::Float16: + return CUDNN_DATA_HALF; + case DTypeEnum::Int32: + case DTypeEnum::QuantizedS32: + return CUDNN_DATA_INT32; + case DTypeEnum::QuantizedS8: + case DTypeEnum::Int8: + return CUDNN_DATA_INT8; + default: + megdnn_throw("dtype must be float16/float32/int8/qint8/int32/qint32"); + } +} + +cudnnDataType_t get_compute_type( + DType type, param::Convolution::ComputeMode comp_mode) { + if (type.enumv() == DTypeEnum::Float32) { + return CUDNN_DATA_FLOAT; + } else if (type.enumv() == DTypeEnum::Float16) { + return get_compute_type_fp16(comp_mode); + } else if ( + type.category() == DTypeCategory::INT || + type.category() == DTypeCategory::QUANTIZED) { + return CUDNN_DATA_INT32; + } else { + megdnn_throw("unsupported compute type for convolution"); + } +} + +using Format = param::Convolution::Format; +using IntArrayRef = SmallVector; +std::pair get_shape_and_stride( + const TensorLayout& layout, const Format format, int64_t nr_group) { + // DENSE: n, c, h, w + // n, k, p, q; ndim = 4 + // GROUP: n, g, c, h, w + // n, g, k, p, q; ndim = 5 + static constexpr size_t CUDNN_NDIM = 4; + size_t cudnn_ndim = CUDNN_NDIM; + if (nr_group > 1) + cudnn_ndim += 1; + IntArrayRef shape(cudnn_ndim); + IntArrayRef stride(cudnn_ndim); + + if (format == Format::NCHW4 || format == Format::NCHW32) + megdnn_assert_eq_size_t(layout.ndim, 5_z); + else + megdnn_assert_eq_size_t(layout.ndim, 4_z); + + size_t c_pos, spatial_pos; + if (format == Format::NCHW || format == Format::NCHW4 || format == Format::NCHW32) { + c_pos = 1; + 
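// ---- [editor's worked example] ----------------------------------------------------
// get_shape_and_stride above folds the group dimension into the activation
// shape handed to cudnn-frontend. For NCHW with N=32, C=64, H=W=56 and
// nr_group=2 it yields
//     shape  = {32, 2, 32, 56, 56}            // n, g, c/g, h, w
//     stride = {200704, 100352, 3136, 56, 1}  // fully contiguous
// Generic contiguous-stride computation matching the c_pos == 1 branch:
#include <cstdint>
#include <vector>

std::vector<std::int64_t> contiguous_strides_sketch(
        const std::vector<std::int64_t>& shape) {
    std::vector<std::int64_t> stride(shape.size(), 1);
    for (int i = static_cast<int>(shape.size()) - 2; i >= 0; --i)
        stride[i] = stride[i + 1] * shape[i + 1];
    return stride;
}
// -------------------------------------------------------------------------------------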
spatial_pos = 2; + } else { + megdnn_assert(format == Format::NHWC); + c_pos = 3; + spatial_pos = 1; + } + int64_t vector_count, vector_dimension; + std::tie(vector_count, vector_dimension) = get_vector_count_and_dimension(format); + + size_t out_c_pos = nr_group == 1 ? 1 : 2; + size_t out_spatial_pos = nr_group == 1 ? 2 : 3; + // For NCHW4 and NCHW32 we still compute standard strides here to input to cuDNN + // functions. We will manually scale by resizeFactor in the cpu ref. + shape[0] = layout[0]; + if (nr_group > 1) + shape[1] = nr_group; + shape[out_c_pos] = layout[c_pos] / nr_group; + shape[out_spatial_pos] = layout[spatial_pos]; + shape[out_spatial_pos + 1] = layout[spatial_pos + 1]; + if (c_pos == 1) { + stride[cudnn_ndim - 1] = 1; + for (int i = cudnn_ndim - 2; i >= 0; --i) { + stride[i] = stride[i + 1] * shape[i + 1]; + } + } else { + megdnn_assert(c_pos == 3); // Here we assume that the format is NHWC + stride[out_c_pos] = 1; + if (nr_group > 1) + stride[1] = shape[out_c_pos] * stride[out_c_pos]; + stride[out_spatial_pos + 1] = stride[1] * shape[1]; + stride[out_spatial_pos] = + stride[out_spatial_pos + 1] * shape[out_spatial_pos + 1]; + stride[0] = stride[out_spatial_pos] * shape[out_spatial_pos]; + } + return {shape, stride}; +} + +/* --------------- make cudnn-frontend tensor descriptor --------------- */ +auto make_tensor_descriptor( + int64_t id, uint8_t alignment, const TensorLayout& layout, const Format format, + int64_t nr_group, bool is_virtual = false) { + int64_t vector_count, vector_dimension; + std::tie(vector_count, vector_dimension) = get_vector_count_and_dimension(format); + IntArrayRef shape, stride; + std::tie(shape, stride) = get_shape_and_stride(layout, format, nr_group); + return cudnn_frontend::TensorBuilder() + .setDim(shape.size(), shape.data()) + .setStrides(stride.size(), stride.data()) + .setId(id) + .setAlignment(alignment) + .setDataType(get_cudnn_data_type(layout.dtype)) + .setVirtual(is_virtual) + .setVectorCountAndDimension(vector_count, vector_dimension) + .build(); +} + +/* --------------- make cudnn-frontend filter descriptor --------------- */ +template +cudnn_frontend::Tensor make_filter_descriptor(uint8_t alignment, const FilterMeta& fm) { + // DENSE: k, c, r, s; ndim = 4 + // GROUP: g, k, c, r, s; ndim = 5 + // generate shape and stride + static constexpr size_t CUDNN_NDIM = 4; + size_t cudnn_ndim = CUDNN_NDIM; + if (fm.group > 1) + cudnn_ndim += 1; + IntArrayRef shape(cudnn_ndim), stride(cudnn_ndim); + auto format = fm.format; + int64_t vector_count, vector_dimension; + std::tie(vector_count, vector_dimension) = get_vector_count_and_dimension(format); + + int64_t group = fm.group; + size_t out_ch_pos = group == 1 ? 0 : 1; + size_t in_ch_pos = group == 1 ? 1 : 2; + size_t filter_start = group == 1 ? 
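// ---- [editor's sketch] --------------------------------------------------------------
// Concrete cudnn-frontend tensor construction of the kind make_tensor_descriptor
// above performs, for a dense NCHW float activation with UID 'x' and 16-byte
// alignment. The builder calls are the ones used in this file; the surrounding
// setup is hypothetical:
#include <cstdint>
#include <cudnn_frontend.h>

cudnn_frontend::Tensor make_demo_x_tensor() {
    const int64_t dim[4] = {32, 64, 56, 56};
    const int64_t stride[4] = {64 * 56 * 56, 56 * 56, 56, 1};  // contiguous NCHW
    return cudnn_frontend::TensorBuilder()
            .setDim(4, dim)
            .setStrides(4, stride)
            .setId('x')  // must match the UID bound at execution time
            .setAlignment(16)
            .setDataType(CUDNN_DATA_FLOAT)
            .build();
}
// ---------------------------------------------------------------------------------------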
2 : 3; + if (group > 1) + shape[0] = group; + shape[out_ch_pos] = fm.ocpg; + shape[in_ch_pos] = fm.icpg / vector_count; + shape[filter_start] = fm.spatial[0]; + shape[filter_start + 1] = fm.spatial[1]; + if (format == Format::NCHW || format == Format::NCHW4 || format == Format::NCHW32) { + stride[cudnn_ndim - 1] = 1; + for (int i = cudnn_ndim - 2; i >= 0; --i) { + stride[i] = stride[i + 1] * shape[i + 1]; + } + } else { + megdnn_assert( + format == Format::NHWC); // Here we assume that the format is NHWC + stride[in_ch_pos] = 1; + stride[filter_start + 1] = stride[in_ch_pos] * shape[in_ch_pos]; + stride[filter_start] = stride[filter_start + 1] * shape[filter_start + 1]; + stride[out_ch_pos] = stride[filter_start] * shape[filter_start]; + if (group > 1) + stride[0] = stride[out_ch_pos] * shape[out_ch_pos]; + } + return cudnn_frontend::TensorBuilder() + .setDim(shape.size(), shape.data()) + .setStrides(stride.size(), stride.data()) + .setId('w') // weight descriptor + .setAlignment(alignment) + .setDataType(get_cudnn_data_type(fm.dtype)) + .setVectorCountAndDimension(vector_count, vector_dimension) + .build(); +} + +/* --------------- make cudnn-frontend conv descriptor --------------- */ +template +cudnn_frontend::ConvDesc_v8 make_conv_descriptor( + cudnnDataType_t data_type, const Param& param) { + IntArrayRef padding = {param.pad_h, param.pad_w}; + IntArrayRef stride = {param.stride_h, param.stride_w}; + IntArrayRef dilation = {param.dilate_h, param.dilate_w}; + uint64_t conv_dim = stride.size(); + cudnnConvolutionMode_t mode; + switch (param.mode) { + case Param::Mode::CROSS_CORRELATION: + mode = CUDNN_CROSS_CORRELATION; + break; + case Param::Mode::CONVOLUTION: + mode = CUDNN_CONVOLUTION; + break; + default: + megdnn_throw("conv mode must be conv or xcorr."); + } + return cudnn_frontend::ConvDescBuilder() + .setDataType(data_type) + .setMathMode(mode) + .setNDims(conv_dim) + .setStrides(conv_dim, stride.data()) + .setPrePadding(conv_dim, padding.data()) + .setPostPadding(conv_dim, padding.data()) + .setDilation(conv_dim, dilation.data()) + .build(); +} + +/* --------------- make cudnn-frontend activation descriptor --------------- */ +auto make_activation_descriptor( + DType data_type, const param::ConvBias::NonlineMode nonline_mode) { + cudnnPointwiseMode_t mode; + using NonlineMode = param::ConvBias::NonlineMode; + switch (nonline_mode) { + case NonlineMode::RELU: + mode = CUDNN_POINTWISE_RELU_FWD; + break; + case NonlineMode::SIGMOID: + mode = CUDNN_POINTWISE_SIGMOID_FWD; + break; + default: + megdnn_throw("unsupported non linear mode"); + } + return cudnn_frontend::PointWiseDescBuilder() + .setMode(mode) + .setMathPrecision(get_cudnn_data_type(data_type)) + .build(); +} + +// high-level api for convolution execution +struct StaticData { + using Key = megdnn::HeuristicCache::Key; + using KeyStorage = megdnn::HeuristicCache::KeyStorage; + using KeyHash = megdnn::HeuristicCache::Hash; + using Result = cudnn_frontend::ExecutionPlan; + using CudnnFrontendExecutionPlanCache = + std::unordered_map; + CudnnFrontendExecutionPlanCache cache; +#if __DEPLOY_ON_XP_SP2__ + size_t cache_mutex; +#else + std::mutex cache_mutex; +#endif + cudnnBackendHeurMode_t heur_mode = CUDNN_HEUR_MODE_INSTANT; + bool deterministic = true; +}; + +StaticData& static_data() { + static StaticData inst; + return inst; +} + +template +struct CudnnBackendOpTypeTrait; + +template <> +struct CudnnBackendOpTypeTrait { + static constexpr cudnnBackendDescriptorType_t OPERATION = + 
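// ---- [editor's sketch] --------------------------------------------------------------
// static_data() above gives the frontend path a process-wide ExecutionPlan
// cache keyed by the same KeyStorage/Hash machinery as HeuristicCache, so
// descriptor building and heuristics run once per unique
// (handle, opr type, layouts, param) combination. Generic lookup-or-build
// shape of that flow (hypothetical helper, simplified locking):
#include <mutex>
#include <utility>

template <typename Map, typename Key, typename BuildFn>
typename Map::mapped_type& lookup_or_build_sketch(
        Map& cache, std::mutex& mtx, const Key& key, BuildFn build) {
    {
        std::lock_guard<std::mutex> lock(mtx);
        auto it = cache.find(key);
        if (it != cache.end())
            return it->second;  // cache hit: skip heuristics entirely
    }
    auto value = build();  // heavy: op graph + heuristics + config filtering
    std::lock_guard<std::mutex> lock(mtx);
    return cache.emplace(key, std::move(value)).first->second;
}
// ---------------------------------------------------------------------------------------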
+
+// high-level api for convolution execution
+struct StaticData {
+    using Key = megdnn::HeuristicCache::Key;
+    using KeyStorage = megdnn::HeuristicCache::KeyStorage;
+    using KeyHash = megdnn::HeuristicCache::Hash;
+    using Result = cudnn_frontend::ExecutionPlan;
+    using CudnnFrontendExecutionPlanCache =
+            std::unordered_map<KeyStorage, Result, KeyHash>;
+    CudnnFrontendExecutionPlanCache cache;
+#if __DEPLOY_ON_XP_SP2__
+    size_t cache_mutex;
+#else
+    std::mutex cache_mutex;
+#endif
+    cudnnBackendHeurMode_t heur_mode = CUDNN_HEUR_MODE_INSTANT;
+    bool deterministic = true;
+};
+
+StaticData& static_data() {
+    static StaticData inst;
+    return inst;
+}
+
+template <typename Opr>
+struct CudnnBackendOpTypeTrait;
+
+template <>
+struct CudnnBackendOpTypeTrait<ConvolutionForward> {
+    static constexpr cudnnBackendDescriptorType_t OPERATION =
+            CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR;
+};
+
+template <>
+struct CudnnBackendOpTypeTrait<ConvolutionBackwardData> {
+    static constexpr cudnnBackendDescriptorType_t OPERATION =
+            CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR;
+};
+
+template <>
+struct CudnnBackendOpTypeTrait<ConvolutionBackwardFilter> {
+    static constexpr cudnnBackendDescriptorType_t OPERATION =
+            CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR;
+};
+
+auto build_opgraph(
+        const cudnnHandle_t& handle, const cudnnBackendDescriptorType_t operation,
+        const cudnn_frontend::Tensor& x, const cudnn_frontend::Tensor& y,
+        const cudnn_frontend::Tensor& w, const cudnn_frontend::ConvDesc_v8& conv_desc) {
+    auto op = cudnn_frontend::OperationBuilder(operation)
+                      .setxDesc(x)
+                      .setyDesc(y)
+                      .setwDesc(w)
+                      .setcDesc(conv_desc)
+                      .build();
+    std::array<cudnn_frontend::Operation const*, 1> ops = {&op};
+    auto op_graph = cudnn_frontend::OperationGraphBuilder()
+                            .setHandle(handle)
+                            .setOperationGraph(1, ops.data())
+                            .build();
+    return op_graph;
+}
+
+auto build_opgraph_fused(
+        const cudnnHandle_t& handle, const cudnn_frontend::Tensor& x,
+        const cudnn_frontend::Tensor& y, const cudnn_frontend::Tensor& w,
+        const cudnn_frontend::Tensor& b, const cudnn_frontend::Tensor& z,
+        const cudnn_frontend::Tensor& after_add,
+        const cudnn_frontend::Tensor& after_bias,
+        const cudnn_frontend::Tensor& after_conv,
+        const cudnn_frontend::ConvDesc_v8& conv_desc,
+        const cudnn_frontend::PointWiseDesc_v8& act_desc, float alpha, float beta) {
+    const auto precision = CUDNN_DATA_FLOAT;
+
+    // add z
+    auto add_desc1 = cudnn_frontend::PointWiseDescBuilder()
+                             .setMode(CUDNN_POINTWISE_ADD)
+                             .setMathPrecision(precision)
+                             .build();
+    // add bias
+    auto add_desc2 = cudnn_frontend::PointWiseDescBuilder()
+                             .setMode(CUDNN_POINTWISE_ADD)
+                             .setMathPrecision(precision)
+                             .build();
+
+    // create conv node
+    auto conv_op = cudnn_frontend::OperationBuilder(
+                           CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR)
+                           .setxDesc(x)
+                           .setyDesc(after_conv)
+                           .setwDesc(w)
+                           .setcDesc(conv_desc)
+                           .build();
+
+    // create add z node
+    auto add_op1 = cudnn_frontend::OperationBuilder(
+                           CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
+                           .setxDesc(conv_op.getOutputTensor())
+                           .setbDesc(z)
+                           .setyDesc(after_add)
+                           .setpwDesc(add_desc1)
+                           .setAlpha(alpha)
+                           .setAlpha2(beta)
+                           .build();
+
+    // create add bias node
+    auto add_op2 = cudnn_frontend::OperationBuilder(
+                           CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
+                           .setxDesc(add_op1.getOutputTensor())
+                           .setbDesc(b)
+                           .setyDesc(after_bias)
+                           .setpwDesc(add_desc2)
+                           .build();
+
+    // create act node
+    auto act_op = cudnn_frontend::OperationBuilder(
+                          CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
+                          .setxDesc(add_op2.getOutputTensor())
+                          .setyDesc(y)
+                          .setpwDesc(act_desc)
+                          .build();
+
+    std::array<cudnn_frontend::Operation const*, 4> ops = {
+            &conv_op, &add_op1, &add_op2, &act_op};
+
+    auto op_graph = cudnn_frontend::OperationGraphBuilder()
+                            .setHandle(handle)
+                            .setOperationGraph(ops.size(), ops.data())
+                            .build();
+    return op_graph;
+}
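Only x, w, z, b and y are backed by real device memory in this graph; after_conv, after_add and after_bias are the virtual tensors created earlier, so cuDNN is free to fuse the intermediates into a single kernel. The dataflow encoded by the builders above, with alpha/beta entering at the z-add node:

    // x, w -----------conv-----------------> [after_conv]
    // [after_conv], z --add(alpha, beta)---> [after_add]    // alpha*conv + beta*z
    // [after_add], b ---add----------------> [after_bias]
    // [after_bias] -----act----------------> y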
+
+auto build_opgraph_fused_nonactivation(
+        const cudnnHandle_t& handle, const cudnn_frontend::Tensor& x,
+        const cudnn_frontend::Tensor& y, const cudnn_frontend::Tensor& w,
+        const cudnn_frontend::Tensor& b, const cudnn_frontend::Tensor& z,
+        const cudnn_frontend::Tensor& after_add,
+        const cudnn_frontend::Tensor& after_conv,
+        const cudnn_frontend::ConvDesc_v8& conv_desc, float alpha, float beta) {
+    const auto precision = CUDNN_DATA_FLOAT;
+
+    // add z
+    auto add_desc1 = cudnn_frontend::PointWiseDescBuilder()
+                             .setMode(CUDNN_POINTWISE_ADD)
+                             .setMathPrecision(precision)
+                             .build();
+    // add bias
+    auto add_desc2 = cudnn_frontend::PointWiseDescBuilder()
+                             .setMode(CUDNN_POINTWISE_ADD)
+                             .setMathPrecision(precision)
+                             .build();
+
+    // create conv node
+    auto conv_op = cudnn_frontend::OperationBuilder(
+                           CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR)
+                           .setxDesc(x)
+                           .setyDesc(after_conv)
+                           .setwDesc(w)
+                           .setcDesc(conv_desc)
+                           .build();
+
+    // create add z node
+    auto add_op1 = cudnn_frontend::OperationBuilder(
+                           CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
+                           .setxDesc(conv_op.getOutputTensor())
+                           .setbDesc(z)
+                           .setyDesc(after_add)
+                           .setpwDesc(add_desc1)
+                           .setAlpha(alpha)
+                           .setAlpha2(beta)
+                           .build();
+
+    // create add bias node (writes directly to y, since there is no activation)
+    auto add_op2 = cudnn_frontend::OperationBuilder(
+                           CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
+                           .setxDesc(add_op1.getOutputTensor())
+                           .setbDesc(b)
+                           .setyDesc(y)
+                           .setpwDesc(add_desc2)
+                           .build();
+
+    std::array<cudnn_frontend::Operation const*, 3> ops = {
+            &conv_op, &add_op1, &add_op2};
+
+    auto op_graph = cudnn_frontend::OperationGraphBuilder()
+                            .setHandle(handle)
+                            .setOperationGraph(ops.size(), ops.data())
+                            .build();
+    return op_graph;
+}
+
+void filter_engine_configs(
+        cudnn_frontend::EngineConfigList& from, cudnn_frontend::EngineConfigList& to,
+        bool deterministic) {
+    auto filter = [&deterministic](cudnnBackendDescriptor_t c) {
+        if (deterministic) {
+            if (cudnn_frontend::hasNumericalNote<
+                        CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC>(c)) {
+                return true;
+            }
+        }
+        if (cudnn_frontend::hasNumericalNote<
+                    CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS>(c)) {
+            return true;
+        }
+        return false;
+    };
+    cudnn_frontend::filter(from, to, filter);
+}
+}  // namespace
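filter_engine_configs is a thin wrapper over cudnn_frontend::filter: the predicate returns true for configs that must be rejected, here nondeterministic engines (when determinism is requested) and engines that down-convert inputs. A caller wanting a stricter policy could layer another pass in the same style; a sketch (hypothetical helper, not part of the patch) that also rejects Winograd engines:

    // Drop engine configs that advertise the Winograd numerical note.
    void drop_winograd(
            cudnn_frontend::EngineConfigList& from,
            cudnn_frontend::EngineConfigList& to) {
        auto pred = [](cudnnBackendDescriptor_t c) {
            // true means "reject", matching the convention used above
            return cudnn_frontend::hasNumericalNote<CUDNN_NUMERICAL_NOTE_WINOGRAD>(c);
        };
        cudnn_frontend::filter(from, to, pred);
    }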
+
+/* --------- get heuristic plan from megdnn opr -------- */
+template <typename Opr>
+cudnn_frontend::ExecutionPlan* megdnn::cuda::get_heuristic_plan_from_opr(
+        const Opr* opr, const TensorLayout& x, const TensorLayout& y,
+        const TensorLayout& w, const TensorLayout& b, const TensorLayout& z,
+        const typename Opr::CanonizedFilterMeta& fm) {
+    auto&& param = opr->param();
+    TensorLayoutArray layouts{x, y, w};
+    auto key = StaticData::Key{opr->handle(), opr->get_opr_type(),
+                               layouts.data(), layouts.size(),
+                               &param,         sizeof(param)}
+                       .build_key_storage();
+    auto& cache = static_data().cache;
+    {
+        MEGDNN_LOCK_GUARD(static_data().cache_mutex);
+        auto iter = cache.find(key);
+        if (iter != cache.end()) {
+            return &iter->second;
+        }
+    }
+
+    size_t aligned = 16;
+    uint8_t alignment = std::min(opr->handle()->alignment_requirement(), aligned);
+    auto&& handle = cudnn_handle(opr->handle());
+    auto&& x_desc = make_tensor_descriptor('x', alignment, x, fm.format, fm.group);
+    auto&& y_desc = make_tensor_descriptor('y', alignment, y, fm.format, fm.group);
+    auto&& w_desc = make_filter_descriptor(alignment, fm);
+    auto compute_type = get_compute_type(x.dtype, param.compute_mode);
+    auto&& conv_desc = make_conv_descriptor(compute_type, param);
+    constexpr auto operation = CudnnBackendOpTypeTrait<Opr>::OPERATION;
+    auto op_graph = build_opgraph(handle, operation, x_desc, y_desc, w_desc, conv_desc);
+    auto deterministic = static_data().deterministic;
+    auto heur_mode = static_data().heur_mode;
+    auto heurgen_method = [&deterministic,
+                           &heur_mode](cudnn_frontend::OperationGraph& op_graph)
+            -> cudnn_frontend::EngineConfigList {
+        auto heuristics = cudnn_frontend::EngineHeuristicsBuilder()
+                                  .setOperationGraph(op_graph)
+                                  .setHeurMode(heur_mode)
+                                  .build();
+        auto& engine_configs =
+                heuristics.getEngineConfig(heuristics.getEngineConfigCount());
+        cudnn_frontend::EngineConfigList filtered_configs;
+        filter_engine_configs(engine_configs, filtered_configs, deterministic);
+        return filtered_configs;
+    };
+
+    auto fallback_method = [&deterministic, &heur_mode,
+                            &operation](cudnn_frontend::OperationGraph& op_graph)
+            -> cudnn_frontend::EngineConfigList {
+        auto fallback = cudnn_frontend::EngineFallbackListBuilder()
+                                .setOperationGraph(op_graph)
+                                .setOperation(operation)
+                                .build();
+        auto& fallback_list = fallback.getFallbackList();
+        cudnn_frontend::EngineConfigList filtered_configs;
+        filter_engine_configs(fallback_list, filtered_configs, deterministic);
+        return filtered_configs;
+    };
+
+    std::array<cudnn_frontend::GeneratorSource const, 2> sources = {
+            heurgen_method, fallback_method};
+
+    cudnn_frontend::EngineConfigGenerator generator(sources.size(), sources.data());
+    auto configs = generator.generate_engine_config(op_graph);
+
+    for (auto& config : configs) {
+        try {
+            auto plan = cudnn_frontend::ExecutionPlanBuilder()
+                                .setHandle(handle)
+                                .setEngineConfig(config)
+                                .build();
+            auto workspace_size = plan.getWorkspaceSize();
+            MEGDNN_MARK_USED_VAR(workspace_size);
+            MEGDNN_LOCK_GUARD(static_data().cache_mutex);
+            auto insert = cache.insert(std::make_pair(key, std::move(plan)));
+            return &insert.first->second;
+        } catch (cudnn_frontend::cudnnException& e) {
+            continue;
+        }
+    }
+    return nullptr;
+}
+
+#define INST(_Opr)                                                                    \
+    template cudnn_frontend::ExecutionPlan* megdnn::cuda::get_heuristic_plan_from_opr( \
+            const _Opr* opr, const TensorLayout& x, const TensorLayout& y,            \
+            const TensorLayout& w, const TensorLayout& b, const TensorLayout& z,      \
+            const typename _Opr::CanonizedFilterMeta& fm);
+
+INST(ConvolutionForward);
+INST(ConvolutionBackwardData);
+INST(ConvolutionBackwardFilter);
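The intended call pattern for these plans, as used by the new v8 algo files in this patch, is: fetch (or lazily build) a cached plan, size the workspace from it, then execute. A condensed sketch of the caller side; the local names (opr, src, dst, filter, fm, workspace) are illustrative:

    // Inside a hypothetical is_available/exec pair:
    auto* plan = megdnn::cuda::get_heuristic_plan_from_opr<ConvolutionForward>(
            opr, src.layout, dst.layout, filter.layout, {}, {}, fm);
    megdnn_assert(plan, "no usable cudnn v8 execution plan");
    // plan->getWorkspaceSize() drives get_workspace_in_bytes(); at exec time:
    megdnn::cuda::run_single_conv_with_plan(
            cudnn_handle(opr->handle()), *plan, src, dst, filter, workspace);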
+
+/* --------- get heuristic plan from conv_bias opr -------- */
+template <>
+cudnn_frontend::ExecutionPlan* megdnn::cuda::get_heuristic_plan_from_opr(
+        const ConvBiasForward* opr, const TensorLayout& x, const TensorLayout& y,
+        const TensorLayout& w, const TensorLayout& b, const TensorLayout& z,
+        const typename ConvBiasForward::CanonizedFilterMeta& fm) {
+    auto&& param = opr->param();
+    TensorLayoutArray layouts{x, y, w, b, z};
+    auto key = StaticData::Key{opr->handle(), opr->get_opr_type(),
+                               layouts.data(), layouts.size(),
+                               &param,         sizeof(param)}
+                       .build_key_storage();
+    auto& cache = static_data().cache;
+    {
+        MEGDNN_LOCK_GUARD(static_data().cache_mutex);
+        auto iter = cache.find(key);
+        if (iter != cache.end()) {
+            return &iter->second;
+        }
+    }
+
+    size_t aligned = 16;
+    uint8_t alignment = std::min(opr->handle()->alignment_requirement(), aligned);
+    auto&& handle = cudnn_handle(opr->handle());
+    auto&& x_desc = make_tensor_descriptor('x', alignment, x, fm.format, fm.group);
+    auto&& y_desc = make_tensor_descriptor('y', alignment, y, fm.format, fm.group);
+    auto&& w_desc = make_filter_descriptor(alignment, fm);
+    auto&& z_desc = make_tensor_descriptor('z', alignment, y, fm.format, fm.group);
+    auto&& b_desc = make_tensor_descriptor('b', alignment, b, Format::NCHW, fm.group);
+    auto&& after_conv =
+            make_tensor_descriptor('C', alignment, y, fm.format, fm.group, true);
+    auto&& after_add =
+            make_tensor_descriptor('A', alignment, y, fm.format, fm.group, true);
+    auto&& after_bias =
+            make_tensor_descriptor('B', alignment, y, fm.format, fm.group, true);
+    auto compute_type = get_compute_type(x.dtype, param.compute_mode);
+    auto&& conv_desc = make_conv_descriptor(compute_type, param);
+    float alpha, beta;
+    std::tie(alpha, beta) =
+            conv_bias::cudnn_get_conv_bias_act_scale_param(x, y, w, b, z);
+    // OperationGraph has neither a public copy constructor nor a default
+    // constructor, so we build it inside a lambda to sidestep the compile error.
+    auto get_op_graph = [&]() {
+        if (param.nonlineMode == param::ConvBias::NonlineMode::IDENTITY) {
+            return build_opgraph_fused_nonactivation(
+                    handle, x_desc, y_desc, w_desc, b_desc, z_desc, after_add,
+                    after_conv, conv_desc, alpha, beta);
+        } else {
+            auto&& act_desc =
+                    make_activation_descriptor(dtype::Float32(), param.nonlineMode);
+            return build_opgraph_fused(
+                    handle, x_desc, y_desc, w_desc, b_desc, z_desc, after_add,
+                    after_bias, after_conv, conv_desc, act_desc, alpha, beta);
+        }
+    };
+    auto op_graph = get_op_graph();
+    auto deterministic = static_data().deterministic;
+    auto heur_mode = static_data().heur_mode;
+    auto heurgen_method = [&deterministic,
+                           &heur_mode](cudnn_frontend::OperationGraph& op_graph)
+            -> cudnn_frontend::EngineConfigList {
+        auto heuristics = cudnn_frontend::EngineHeuristicsBuilder()
+                                  .setOperationGraph(op_graph)
+                                  .setHeurMode(heur_mode)
+                                  .build();
+        auto& engine_configs =
+                heuristics.getEngineConfig(heuristics.getEngineConfigCount());
+        cudnn_frontend::EngineConfigList filtered_configs;
+        filter_engine_configs(engine_configs, filtered_configs, deterministic);
+        return filtered_configs;
+    };
+
+    std::array<cudnn_frontend::GeneratorSource const, 1> sources = {heurgen_method};
+
+    cudnn_frontend::EngineConfigGenerator generator(sources.size(), sources.data());
+    auto configs = generator.generate_engine_config(op_graph);
+
+    for (auto& config : configs) {
+        try {
+            auto plan = cudnn_frontend::ExecutionPlanBuilder()
+                                .setHandle(handle)
+                                .setEngineConfig(config)
+                                .build();
+            auto workspace_size = plan.getWorkspaceSize();
+            MEGDNN_MARK_USED_VAR(workspace_size);
+            MEGDNN_LOCK_GUARD(static_data().cache_mutex);
+            auto insert = cache.insert(std::make_pair(key, std::move(plan)));
+            return &insert.first->second;
+        } catch (cudnn_frontend::cudnnException& e) {
+            continue;
+        }
+    }
+    return nullptr;
+}
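For reference, alpha and beta come from cudnn_get_conv_bias_act_scale_param, which folds the quantization scales of the tensors into the graph, so the fused plan computes, elementwise:

    // y = act(alpha * conv(x, w) + beta * z + b)    // act omitted for IDENTITY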
+
+/* ------ impl for running a single conv ----- */
+void megdnn::cuda::run_single_conv_with_plan(
+        const cudnnHandle_t& handle, const cudnn_frontend::ExecutionPlan& plan,
+        const TensorND& x, const TensorND& y, const TensorND& w,
+        const Workspace& workspace) {
+    size_t workspace_size = plan.getWorkspaceSize();
+    megdnn_assert(
+            workspace.size >= workspace_size,
+            "workspace does not meet the requirement of execution "
+            "plan (got: %zu, expected: %zu)",
+            workspace.size, workspace_size);
+    void* data_ptrs[] = {x.raw_ptr(), y.raw_ptr(), w.raw_ptr()};
+    int64_t uids[] = {'x', 'y', 'w'};
+    auto variant_pack = cudnn_frontend::VariantPackBuilder()
+                                .setWorkspacePointer(workspace.raw_ptr)
+                                .setDataPointers(3, data_ptrs)
+                                .setUids(3, uids)
+                                .build();
+    cudnn_check(cudnnBackendExecute(
+            handle, plan.get_raw_desc(), variant_pack.get_raw_desc()));
+}
+
+/* ------ impl for running a fused conv bias activation ----- */
+void megdnn::cuda::run_conv_bias_act_with_plan(
+        const cudnnHandle_t& handle, const cudnn_frontend::ExecutionPlan& plan,
+        const TensorND& x, const TensorND& y, const TensorND& w, const TensorND& b,
+        const TensorND& z, const Workspace& workspace) {
+    size_t workspace_size = plan.getWorkspaceSize();
+    megdnn_assert(
+            workspace.size >= workspace_size,
+            "workspace does not meet the requirement of execution "
+            "plan (got: %zu, expected: %zu)",
+            workspace.size, workspace_size);
+    void* z_ptr = z.layout.ndim == 0 ? nullptr : z.raw_ptr();
+    void* data_ptrs[] = {x.raw_ptr(), y.raw_ptr(), w.raw_ptr(), z_ptr, b.raw_ptr()};
+    int64_t uids[] = {'x', 'y', 'w', 'z', 'b'};
+    auto variant_pack = cudnn_frontend::VariantPackBuilder()
+                                .setWorkspacePointer(workspace.raw_ptr)
+                                .setDataPointers(5, data_ptrs)
+                                .setUids(5, uids)
+                                .build();
+    cudnn_check(cudnnBackendExecute(
+            handle, plan.get_raw_desc(), variant_pack.get_raw_desc()));
+}
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/cuda/cudnn_wrapper_v8.h b/dnn/src/cuda/cudnn_wrapper_v8.h
new file mode 100644
index 00000000..f601ec07
--- /dev/null
+++ b/dnn/src/cuda/cudnn_wrapper_v8.h
@@ -0,0 +1,70 @@
+/**
+ * \file dnn/src/cuda/cudnn_wrapper_v8.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#pragma once
+
+#include "megdnn/basic_types.h"
+#include "megdnn/oprs/nn.h"
+#include "src/common/utils.h"
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#pragma GCC diagnostic ignored "-Wunused-function"
+#pragma GCC diagnostic ignored "-Wreorder"
+#include "cudnn_frontend.h"
+#pragma GCC diagnostic pop
+
+namespace megdnn {
+namespace cuda {
+static inline std::pair<int64_t, int64_t> get_vector_count_and_dimension(
+        const param::Convolution::Format format) {
+    using Format = param::Convolution::Format;
+    int64_t vector_count = 1;
+    int64_t vector_dimension = 1;
+    switch (format) {
+        case Format::NCHW:
+            break;
+        case Format::NHWC:
+            vector_dimension = 3;
+            break;
+        case Format::NCHW4:
+            vector_count = 4;
+            break;
+        case Format::NCHW32:
+            vector_count = 32;
+            break;
+        default:
+            megdnn_assert(
+                    false, "unsupported format (got:%u) for cudnn",
+                    static_cast<uint32_t>(format));
+    }
+    return {vector_count, vector_dimension};
+}
+
+template <typename Opr>
+cudnn_frontend::ExecutionPlan* get_heuristic_plan_from_opr(
+        const Opr* opr, const TensorLayout& x, const TensorLayout& y,
+        const TensorLayout& w, const TensorLayout& b, const TensorLayout& z,
+        const typename Opr::CanonizedFilterMeta& fm);
+
+void run_single_conv_with_plan(
+        const cudnnHandle_t& handle, const cudnn_frontend::ExecutionPlan& plan,
+        const TensorND& x, const TensorND& y, const TensorND& w,
+        const Workspace& workspace);
+
+void run_conv_bias_act_with_plan(
+        const cudnnHandle_t& handle, const cudnn_frontend::ExecutionPlan& plan,
+        const TensorND& x, const TensorND& y, const TensorND& w, const TensorND& b,
+        const TensorND& z, const Workspace& workspace);
+
+}  // namespace cuda
+}  // namespace megdnn
+
+// vim: syntax=cpp.doxygen
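The uids in the variant packs are simply the char literals chosen as tensor ids when the descriptors were built, and data_ptrs must line up with uids element-for-element. Virtual tensors never receive a pointer:

    // uid -> pointer bound at execution time:
    //   'x' -> src, 'y' -> dst, 'w' -> filter,
    //   'z' -> residual (may be nullptr when z is absent), 'b' -> bias;
    //   virtual tensors 'C'/'A'/'B' are internal and get no binding.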
diff --git a/dnn/src/cuda/handle.cpp b/dnn/src/cuda/handle.cpp
index 5cf5f71b..f0e314ef 100644
--- a/dnn/src/cuda/handle.cpp
+++ b/dnn/src/cuda/handle.cpp
@@ -58,6 +58,11 @@ HandleImpl::HandleImpl(megcoreComputingHandle_t comp_handle)
 For example `export CUDA_CACHE_MAXSIZE=2147483647` and `export CUDA_CACHE_PATH=/data/.cuda_cache`)");
     }
 #endif
+    size_t free, tot;
+    cudaMemGetInfo(&free, &tot);
+    printf("before cudnn create, free: %.2f MB, tot: %.2f MB, allocated: %.2f MB\n",
+           free / 1024.0 / 1024.0, tot / 1024.0 / 1024.0,
+           (tot - free) / 1024.0 / 1024.0);
     cudnn_check(cudnnCreate(&m_cudnn_handle));
     cublas_check(cublasCreate(&m_cublas_handle));
 #if CUDA_VERSION >= 10010
@@ -69,6 +74,11 @@ HandleImpl::HandleImpl(megcoreComputingHandle_t comp_handle)
     cudnn_check(cudnnSetStream(m_cudnn_handle, stream()));
     cublas_check(cublasSetStream(m_cublas_handle, stream()));
 
+#if CUDNN_VERSION >= 8004
+// cudnn_check(cudnnOpsInferVersionCheck());
+// cudnn_check(cudnnCnnInferVersionCheck());
+#endif
+
     // Note that all cublas scalars (alpha, beta) and scalar results such as dot
     // output resides at device side.
     cublas_check(cublasSetPointerMode(m_cublas_handle, CUBLAS_POINTER_MODE_DEVICE));
@@ -82,6 +92,11 @@ HandleImpl::HandleImpl(megcoreComputingHandle_t comp_handle)
                               cudaMemcpyHostToDevice, stream()));
     cuda_check(cudaStreamSynchronize(stream()));
 
+    cudaMemGetInfo(&free, &tot);
+    printf("after cudnn create, free: %.2f MB, tot: %.2f MB, allocated: %.2f MB\n",
+           free / 1024.0 / 1024.0, tot / 1024.0 / 1024.0,
+           (tot - free) / 1024.0 / 1024.0);
+
     // check tk1
     m_is_tegra_k1 = (strcmp(m_device_prop->name, "GK20A") == 0);
     m_cusolver_handle = nullptr;
diff --git a/dnn/test/cuda/conv_v8.cpp b/dnn/test/cuda/conv_v8.cpp
new file mode 100644
index 00000000..407ec546
--- /dev/null
+++ b/dnn/test/cuda/conv_v8.cpp
@@ -0,0 +1,304 @@
+/**
+ * \file dnn/test/cuda/conv_v8.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#include "megdnn/dtype.h"
+#include "test/cuda/fixture.h"
+
+#include "megdnn/opr_param_defs.h"
+#include "megdnn/oprs.h"
+#include "src/cuda/handle.h"
+#include "test/common/benchmarker.h"
+#include "test/common/checker.h"
+#include "test/common/conv_bias.h"
+#include "test/common/rng.h"
+#include "test/common/tensor.h"
+#include "test/common/workspace_wrapper.h"
+#include "test/cuda/utils.h"
+
+using namespace megdnn;
+using namespace test;
+using namespace conv_bias;
+
+#if CUDNN_VERSION >= 8004
+TEST_F(CUDA, CONV_V8_FLOAT) {
+    Checker<ConvBiasForward> checker(handle_cuda());
+    checker.set_before_exec_callback(
+            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(ExecutionPolicyAlgoName{
+                    ConvBiasForward::algo_name<ConvBiasForward::DefaultParam>(
+                            "CUDNN:ConvolutionV8", {})
+                            .c_str()}));
+
+    UniformFloatRNG rng(0.f, 1.f);
+    checker.set_rng(0, &rng)
+            .set_rng(1, &rng)
+            .set_rng(2, &rng)
+            .set_rng(3, &rng)
+            .set_dtype(0, dtype::Float32())
+            .set_dtype(1, dtype::Float32())
+            .set_dtype(2, dtype::Float32())
+            .set_dtype(3, dtype::Float32());
+    param::ConvBias param;
+    param.pad_h = param.pad_w = 1;
+    param.stride_h = param.stride_w = 1;
+    param.format = param::ConvBias::Format::NCHW;
+    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
+    checker.set_param(param).execs(
+            {{1, 64, 7, 7}, {64, 64, 3, 3}, {1, 64, 1, 1}, {}, {}});
+    checker.set_param(param).execs(
+            {{1, 64, 7, 7}, {64, 64, 3, 3}, {1, 64, 1, 1}, {1, 64, 7, 7}, {}});
+
+    // group
+    param.sparse = param::ConvBias::Sparse::GROUP;
+    checker.set_param(param).execs(
+            {{1, 64, 7, 7}, {8, 8, 8, 3, 3}, {1, 64, 1, 1}, {}, {}});
+    checker.set_param(param).execs(
+            {{1, 64, 7, 7}, {8, 8, 8, 3, 3}, {1, 64, 1, 1}, {1, 64, 7, 7}, {}});
+
+    // NHWC
+    param.format = param::ConvBias::Format::NHWC;
+    checker.set_param(param).execs(
+            {{1, 7, 7, 64}, {8, 8, 3, 3, 8}, {1, 1, 1, 64}, {}, {}});
+    checker.set_param(param).execs(
+            {{1, 7, 7, 64}, {8, 8, 3, 3, 8}, {1, 1, 1, 64}, {1, 7, 7, 64}, {}});
+}
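All shapes in these tests keep the spatial extent unchanged: with a 3x3 kernel, padding 1 and stride 1, (7 + 2*1 - 3)/1 + 1 = 7, which is why the residual z inputs can reuse the source spatial shape. A one-line helper expressing the same arithmetic (illustrative only, not part of the patch):

    static inline int conv_out_size(int i, int pad, int kernel, int stride) {
        return (i + 2 * pad - kernel) / stride + 1;  // conv_out_size(7, 1, 3, 1) == 7
    }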
+
+TEST_F(CUDA, CONV_V8_HALF) {
+    Checker<ConvBiasForward> checker(handle_cuda());
+    checker.set_before_exec_callback(
+            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(ExecutionPolicyAlgoName{
+                    ConvBiasForward::algo_name<ConvBiasForward::DefaultParam>(
+                            "CUDNN:ConvolutionV8", {})
+                            .c_str()}));
+
+    UniformFloatRNG rng(0.f, 1.f);
+    checker.set_rng(0, &rng)
+            .set_rng(1, &rng)
+            .set_rng(2, &rng)
+            .set_rng(3, &rng)
+            .set_dtype(0, dtype::Float16())
+            .set_dtype(1, dtype::Float16())
+            .set_dtype(2, dtype::Float16())
+            .set_dtype(3, dtype::Float16())
+            .set_dtype(4, dtype::Float16())
+            .set_epsilon(5e-2);
+    param::ConvBias param;
+    param.pad_h = param.pad_w = 1;
+    param.stride_h = param.stride_w = 1;
+    param.format = param::ConvBias::Format::NCHW;
+    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
+    param.compute_mode = param::ConvBias::ComputeMode::FLOAT32;
+    checker.set_param(param).execs(
+            {{1, 64, 7, 7}, {64, 64, 3, 3}, {1, 64, 1, 1}, {}, {}});
+    checker.set_param(param).execs(
+            {{1, 64, 7, 7}, {64, 64, 3, 3}, {1, 64, 1, 1}, {1, 64, 7, 7}, {}});
+
+    // group
+    param.sparse = param::ConvBias::Sparse::GROUP;
+    checker.set_param(param).execs(
+            {{1, 64, 7, 7}, {8, 8, 8, 3, 3}, {1, 64, 1, 1}, {}, {}});
+    checker.set_param(param).execs(
+            {{1, 64, 7, 7}, {8, 8, 8, 3, 3}, {1, 64, 1, 1}, {1, 64, 7, 7}, {}});
+
+    // NHWC
+    param.format = param::ConvBias::Format::NHWC;
+    checker.set_param(param).execs(
+            {{1, 7, 7, 64}, {8, 8, 3, 3, 8}, {1, 1, 1, 64}, {}, {}});
+    checker.set_param(param).execs(
+            {{1, 7, 7, 64}, {8, 8, 3, 3, 8}, {1, 1, 1, 64}, {1, 7, 7, 64}, {}});
+}
+
+TEST_F(CUDA, CONV_BIAS_V8_FLOAT) {
+    Checker<ConvBiasForward> checker(handle_cuda());
+    checker.set_before_exec_callback(
+            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(ExecutionPolicyAlgoName{
+                    ConvBiasForward::algo_name<ConvBiasForward::DefaultParam>(
+                            "CUDNN:ConvBiasActivationV8", {})
+                            .c_str()}));
+
+    UniformFloatRNG rng(0.f, 1.f);
+    UniformFloatRNG crng(0.f, 0.f);
+    checker.set_rng(0, &rng)
+            .set_rng(1, &rng)
+            .set_rng(2, &rng)
+            .set_rng(3, &rng)
+            .set_dtype(0, dtype::Float32())
+            .set_dtype(1, dtype::Float32())
+            .set_dtype(2, dtype::Float32())
+            .set_dtype(3, dtype::Float32());
+    param::ConvBias param;
+    param.pad_h = param.pad_w = 1;
+    param.stride_h = param.stride_w = 1;
+    param.format = param::ConvBias::Format::NCHW;
+    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
+    checker.set_param(param).execs(
+            {{1, 64, 7, 7}, {64, 64, 3, 3}, {1, 64, 1, 1}, {}, {}});
+    checker.set_param(param).execs(
+            {{1, 64, 7, 7}, {64, 64, 3, 3}, {1, 64, 1, 1}, {1, 64, 7, 7}, {}});
+
+    // group
+    param.sparse = param::ConvBias::Sparse::GROUP;
+    checker.set_param(param).execs(
+            {{1, 64, 7, 7}, {8, 8, 8, 3, 3}, {1, 64, 1, 1}, {}, {}});
+    checker.set_param(param).execs(
+            {{1, 64, 7, 7}, {8, 8, 8, 3, 3}, {1, 64, 1, 1}, {1, 64, 7, 7}, {}});
+
+    // NHWC
+    param.format = param::ConvBias::Format::NHWC;
+    checker.set_param(param).execs(
+            {{1, 7, 7, 64}, {8, 8, 3, 3, 8}, {1, 1, 1, 64}, {}, {}});
+    checker.set_param(param).execs(
+            {{1, 7, 7, 64}, {8, 8, 3, 3, 8}, {1, 1, 1, 64}, {1, 7, 7, 64}, {}});
+}
+
+TEST_F(CUDA, CONV_BIAS_V8_HALF) {
+    Checker<ConvBiasForward> checker(handle_cuda());
+    checker.set_before_exec_callback(
+            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(ExecutionPolicyAlgoName{
+                    ConvBiasForward::algo_name<ConvBiasForward::DefaultParam>(
+                            "CUDNN:ConvBiasActivationV8", {})
+                            .c_str()}));
+
+    UniformFloatRNG rng(0.f, 1.f);
+    checker.set_rng(0, &rng)
+            .set_rng(1, &rng)
+            .set_rng(2, &rng)
+            .set_rng(3, &rng)
+            .set_dtype(0, dtype::Float16())
+            .set_dtype(1, dtype::Float16())
+            .set_dtype(2, dtype::Float16())
+            .set_dtype(3, dtype::Float16())
+            .set_dtype(4, dtype::Float16())
+            .set_epsilon(5e-2);
+    param::ConvBias param;
+    param.pad_h = param.pad_w = 1;
+    param.stride_h = param.stride_w = 1;
+    param.format = param::ConvBias::Format::NCHW;
+    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
+    param.compute_mode = param::ConvBias::ComputeMode::FLOAT32;
+    checker.set_param(param).execs(
+            {{1, 64, 7, 7}, {64, 64, 3, 3}, {1, 64, 1, 1}, {}, {}});
+    checker.set_param(param).execs(
+            {{1, 64, 7, 7}, {64, 64, 3, 3}, {1, 64, 1, 1}, {1, 64, 7, 7}, {}});
+
+    // group
+    param.sparse = param::ConvBias::Sparse::GROUP;
+    checker.set_param(param).execs(
+            {{1, 64, 7, 7}, {8, 8, 8, 3, 3}, {1, 64, 1, 1}, {}, {}});
+    checker.set_param(param).execs(
+            {{1, 64, 7, 7}, {8, 8, 8, 3, 3}, {1, 64, 1, 1}, {1, 64, 7, 7}, {}});
+
+    // NHWC
+    param.format = param::ConvBias::Format::NHWC;
+    checker.set_param(param).execs(
+            {{1, 7, 7, 64}, {8, 8, 3, 3, 8}, {1, 1, 1, 64}, {}, {}});
+    checker.set_param(param).execs(
+            {{1, 7, 7, 64}, {8, 8, 3, 3, 8}, {1, 1, 1, 64}, {1, 7, 7, 64}, {}});
+}
+
+TEST_F(CUDA, CONV_BIAS_V8_DP4A) {
+    Checker<ConvBiasForward> checker(handle_cuda());
+    checker.set_before_exec_callback(
+            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(ExecutionPolicyAlgoName{
+                    ConvBiasForward::algo_name<ConvBiasForward::DefaultParam>(
+                            "CUDNN:ConvBiasActivationV8", {})
+                            .c_str()}));
+
+    UniformIntRNG rng{-3, 3};
+    UniformIntRNG bias_rng{-50, 50};
+    checker.set_rng(0, &rng)
+            .set_rng(1, &rng)
+            .set_rng(2, &bias_rng)
+            .set_rng(3, &rng)
+            .set_dtype(0, dtype::QuantizedS8{1.2f})
+            .set_dtype(1, dtype::QuantizedS8{1.3f})
+            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
+            .set_dtype(3, dtype::QuantizedS8{1.1f})
+            .set_dtype(4, dtype::QuantizedS8{1.0f})
+            .set_epsilon(1 + 1e-3);
+    param::ConvBias param;
+    param.pad_h = param.pad_w = 1;
+    param.stride_h = param.stride_w = 1;
+    param.format = param::ConvBias::Format::NCHW4;
+    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
+    checker.set_param(param).execs(
+            {{1, 16, 7, 7, 4}, {64, 16, 3, 3, 4}, {1, 16, 1, 1, 4}, {}, {}});
+    checker.set_param(param).execs(
+            {{1, 16, 7, 7, 4},
+             {64, 16, 3, 3, 4},
+             {1, 16, 1, 1, 4},
+             {1, 16, 7, 7, 4},
+             {}});
+
+    param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
+    checker.set_param(param).execs(
+            {{1, 16, 7, 7, 4}, {64, 16, 3, 3, 4}, {1, 16, 1, 1, 4}, {}, {}});
+    checker.set_param(param).execs(
+            {{1, 16, 7, 7, 4},
+             {64, 16, 3, 3, 4},
+             {1, 16, 1, 1, 4},
+             {1, 16, 7, 7, 4},
+             {}});
+
+    param.format = param::ConvBias::Format::NHWC;
+    checker.set_param(param).execs(
+            {{1, 7, 7, 64}, {64, 3, 3, 64}, {1, 1, 1, 64}, {}, {}});
+    checker.set_param(param).execs(
+            {{1, 7, 7, 64}, {64, 3, 3, 64}, {1, 1, 1, 64}, {1, 7, 7, 64}, {}});
+    param.sparse = param::ConvBias::Sparse::GROUP;
+    checker.set_param(param).execs(
+            {{1, 7, 7, 64}, {8, 8, 3, 3, 8}, {1, 1, 1, 64}, {}, {}});
+    checker.set_param(param).execs(
+            {{1, 7, 7, 64}, {8, 8, 3, 3, 8}, {1, 1, 1, 64}, {1, 7, 7, 64}, {}});
+}
+
+TEST_F(CUDA, CONV_BIAS_V8_IMMA) {
+    Checker<ConvBiasForward> checker(handle_cuda());
+    checker.set_before_exec_callback(
+            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(ExecutionPolicyAlgoName{
+                    ConvBiasForward::algo_name<ConvBiasForward::DefaultParam>(
+                            "CUDNN:ConvBiasActivationV8", {})
+                            .c_str()}));
+
+    UniformIntRNG rng{-3, 3};
+    UniformIntRNG bias_rng{-50, 50};
+    checker.set_rng(0, &rng)
+            .set_rng(1, &rng)
+            .set_rng(2, &bias_rng)
+            .set_rng(3, &rng)
+            .set_dtype(0, dtype::QuantizedS8{1.2f})
+            .set_dtype(1, dtype::QuantizedS8{1.3f})
+            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
+            .set_dtype(3, dtype::QuantizedS8{1.1f})
+            .set_dtype(4, dtype::QuantizedS8{1.0f})
+            .set_epsilon(1 + 1e-3);
+    param::ConvBias param;
+    param.pad_h = param.pad_w = 1;
+    param.stride_h = param.stride_w = 1;
+    param.format = param::ConvBias::Format::NCHW32;
+    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
+    checker.set_param(param).execs(
+            {{1, 2, 7, 7, 32}, {64, 2, 3, 3, 32}, {1, 2, 1, 1, 32}, {}, {}});
+    checker.set_param(param).execs(
+            {{1, 2, 7, 7, 32},
+             {64, 2, 3, 3, 32},
+             {1, 2, 1, 1, 32},
+             {1, 2, 7, 7, 32},
+             {}});
+
+    // 1x1 filter on a larger feature map, without padding
+    param.stride_h = param.stride_w = 1;
+    param.pad_h = param.pad_w = 0;
+
+    checker.set_param(param).execs(
+            {{2, 8, 12, 12, 32}, {512, 8, 1, 1, 32}, {1, 16, 1, 1, 32}, {}, {}});
+}
+
+#endif
+// vim: syntax=cpp.doxygen
diff --git a/third_party/prepare.sh b/third_party/prepare.sh
index 5d8365bf..a2f50987 100755
--- a/third_party/prepare.sh
+++ b/third_party/prepare.sh
@@ -94,6 +94,7 @@ function git_submodule_update() {
     git submodule sync
     git submodule update -f --init midout
     git submodule update -f --init flatbuffers
+    git submodule update -f --init cudnn-frontend
     git submodule update -f --init Json
     git submodule update -f --init gflags
     git submodule update -f --init cpuinfo