diff --git a/dnn/scripts/Makefile b/dnn/scripts/Makefile index bd219e48..88076331 100644 --- a/dnn/scripts/Makefile +++ b/dnn/scripts/Makefile @@ -37,14 +37,13 @@ all: ${PARAM_DEFS} ${ELEMWISE_IMPL} ${CUDA_CONV_IMPL} $(CUDA_MATMUL_IMPL) ../src/cuda/elemwise_multi_type/kimpl: gen_elemwise_multi_type_kern_impls.py ./$^ --type cuda $@ -../src/cuda/conv_bias/int8/kimpl: gen_cuda_conv_bias_kern_impls.py gen_cutlass_conv_bias_kern_impls.py cutlass_generator/generator.py +../src/cuda/conv_bias/int8/kimpl: gen_cuda_conv_bias_kern_impls.py cutlass_generator ./gen_cuda_conv_bias_kern_impls.py --type dp4a $@ - ./gen_cutlass_conv_bias_kern_impls.py --type dp4a $@ python3 ./cutlass_generator/generator.py --operations all --type simt $@ -../src/cuda/conv_bias/int8_imma/kimpl: gen_cuda_conv_bias_kern_impls.py gen_cutlass_conv_bias_kern_impls.py +../src/cuda/conv_bias/int8_imma/kimpl: gen_cuda_conv_bias_kern_impls.py cutlass_generator ./gen_cuda_conv_bias_kern_impls.py --type imma $@ - ./gen_cutlass_conv_bias_kern_impls.py --type imma $@ + python3 ./cutlass_generator/generator.py --operations conv2d --type tensorop8816 $@ ../src/cuda/batch_conv_bias/int8/kimpl: gen_cuda_batch_conv_bias_kern_impls.py ./$^ --type dp4a $@ diff --git a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu index 7ad77f3e..67de7770 100644 --- a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu +++ b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu @@ -807,9 +807,9 @@ void megdnn::cuda::cutlass_wrapper:: const int32_t* d_bias, const uint8_t* d_z, uint8_t* d_dst, int* workspace, const convolution::ConvParam& param, uint32_t nonlinear_mode, float alpha, float beta, float gamma, - float delta, float theta, float scale, uint8_t src_zero_point, - const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, - cudaStream_t stream) { + float delta, float theta, float /* scale */, + uint8_t src_zero_point, const GemmCoord& threadblock_shape, + const GemmCoord& warp_shape, cudaStream_t stream) { #define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_, \ threadblock_k_, warp_m_, warp_n_, \ warp_k_) \ @@ -878,15 +878,6 @@ void megdnn::cuda::cutlass_wrapper:: 0, delta, theta}; DISPATCH_KERNEL; } - case NonlineMode::H_SWISH: { - using EpilogueOp = cutlass::epilogue::thread:: - BiasAddLinearCombinationHSwishClamp< - ElementOutput, 16, ElementAccumulator, ElementBias, - ElementCompute>; - typename EpilogueOp::Params epilogue{alpha, beta, gamma, - scale, delta, theta}; - DISPATCH_KERNEL; - } default: megdnn_assert(false, "unsupported nonlinear mode for conv bias operator"); @@ -960,8 +951,7 @@ void megdnn::cuda::cutlass_wrapper:: ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ cutlass::conv::threadblock:: \ ConvolutionFpropNCxHWxThreadblockSwizzle, \ - stages_, 4, aligned_, true, \ - cutlass::arch::OpMultiplyAddSaturate>; \ + stages_, 4, aligned_, true, cutlass::arch::OpMultiplyAdd>; \ typename Convolution::ConvolutionParameter conv_param( \ param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ diff --git a/dnn/src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl b/dnn/src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl deleted file mode 100644 index 53da89de..00000000 --- a/dnn/src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl +++ /dev/null @@ -1,65 +0,0 @@ -/** - * \file - * dnn/src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or - * implied. - */ -#include "cutlass/convolution/device/convolution.h" -#include "src/cuda/conv_bias/cutlass_convolution_wrapper.cuh" - -using namespace megdnn; -using namespace cuda; -using namespace cutlass_wrapper; - -template -void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param) { - typename Convolution::TensorRefSrc tensor_src{ - const_cast(d_src), - Convolution::LayoutSrc::packed( - {conv_param.N, conv_param.H, conv_param.W, conv_param.C})}; - typename Convolution::TensorRefFilter tensor_filter{ - const_cast(d_filter), - Convolution::LayoutFilter::packed( - {conv_param.K, conv_param.R, conv_param.S, conv_param.C})}; - typename Convolution::TensorRefBias tensor_bias{ - const_cast(d_bias), - Convolution::LayoutBias::packed({1, 1, 1, conv_param.K})}; - typename Convolution::TensorRefDst tensor_z{ - const_cast(d_z), - Convolution::LayoutDst::packed( - {conv_param.N, conv_param.P, conv_param.Q, conv_param.K})}; - typename Convolution::TensorRefDst tensor_dst{ - d_dst, - Convolution::LayoutDst::packed( - {conv_param.N, conv_param.P, conv_param.Q, conv_param.K})}; - typename Convolution::Arguments arguments{conv_param, - tensor_src.non_const_ref(), - tensor_filter.non_const_ref(), - tensor_bias.non_const_ref(), - tensor_z.non_const_ref(), - tensor_dst.non_const_ref(), - epilogue, - {}, - {}, - extra_param}; - Convolution conv_op; - cutlass_check(conv_op.initialize(arguments, workspace)); - cutlass_check(conv_op(stream)); - after_kernel_launch(); -} - -// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64_128x128x128_64x64x128_hswish.cu b/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64_128x128x128_64x64x128_hswish.cu deleted file mode 100644 index 48e4a9d1..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64_128x128x128_64x64x128_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int4_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; -using LayoutDst = cutlass::layout::TensorNCxHWx<64>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 128>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::int4b_t, 16, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - cutlass::int4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::int4b_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 32, 32, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64_128x128x128_64x64x128_id.cu b/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64_128x128x128_64x64x128_id.cu deleted file mode 100644 index 9c816489..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64_128x128x128_64x64x128_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int4_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; -using LayoutDst = cutlass::layout::TensorNCxHWx<64>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 128>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::int4b_t, 16, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - cutlass::int4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::int4b_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 32, 32, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64_128x128x128_64x64x128_relu.cu b/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64_128x128x128_64x64x128_relu.cu deleted file mode 100644 index aeb5f6fa..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64_128x128x128_64x64x128_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int4_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; -using LayoutDst = cutlass::layout::TensorNCxHWx<64>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 128>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::int4b_t, 16, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - cutlass::int4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::int4b_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 32, 32, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64_256x128x128_64x64x128_hswish.cu b/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64_256x128x128_64x64x128_hswish.cu deleted file mode 100644 index 5d7806f5..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64_256x128x128_64x64x128_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int4_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; -using LayoutDst = cutlass::layout::TensorNCxHWx<64>; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 128>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::int4b_t, 16, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - cutlass::int4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::int4b_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 32, 32, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64_256x128x128_64x64x128_id.cu b/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64_256x128x128_64x64x128_id.cu deleted file mode 100644 index 4dd3266e..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64_256x128x128_64x64x128_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int4_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; -using LayoutDst = cutlass::layout::TensorNCxHWx<64>; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 128>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::int4b_t, 16, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - cutlass::int4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::int4b_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 32, 32, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64_256x128x128_64x64x128_relu.cu b/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64_256x128x128_64x64x128_relu.cu deleted file mode 100644 index 06bfa049..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64_256x128x128_64x64x128_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int4_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; -using LayoutDst = cutlass::layout::TensorNCxHWx<64>; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 128>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::int4b_t, 16, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - cutlass::int4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::int4b_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 32, 32, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64_128x128x128_64x64x128_hswish.cu b/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64_128x128x128_64x64x128_hswish.cu deleted file mode 100644 index c50997ef..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64_128x128x128_64x64x128_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int4_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; -using LayoutDst = cutlass::layout::TensorNCxHWx<64>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 128>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::uint4b_t, 16, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - cutlass::uint4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::uint4b_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 32, 32, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64_128x128x128_64x64x128_id.cu b/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64_128x128x128_64x64x128_id.cu deleted file mode 100644 index 53b7468b..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64_128x128x128_64x64x128_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int4_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; -using LayoutDst = cutlass::layout::TensorNCxHWx<64>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 128>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::uint4b_t, 16, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - cutlass::uint4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::uint4b_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 32, 32, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64_128x128x128_64x64x128_relu.cu b/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64_128x128x128_64x64x128_relu.cu deleted file mode 100644 index 84bcdacf..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64_128x128x128_64x64x128_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int4_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; -using LayoutDst = cutlass::layout::TensorNCxHWx<64>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 128>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::uint4b_t, 16, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - cutlass::uint4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::uint4b_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 32, 32, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64_256x128x128_64x64x128_hswish.cu b/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64_256x128x128_64x64x128_hswish.cu deleted file mode 100644 index 52d2af3f..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64_256x128x128_64x64x128_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int4_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; -using LayoutDst = cutlass::layout::TensorNCxHWx<64>; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 128>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::uint4b_t, 16, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - cutlass::uint4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::uint4b_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 32, 32, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64_256x128x128_64x64x128_id.cu b/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64_256x128x128_64x64x128_id.cu deleted file mode 100644 index e60c5c2b..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64_256x128x128_64x64x128_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int4_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; -using LayoutDst = cutlass::layout::TensorNCxHWx<64>; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 128>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::uint4b_t, 16, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - cutlass::uint4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::uint4b_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 32, 32, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64_256x128x128_64x64x128_relu.cu b/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64_256x128x128_64x64x128_relu.cu deleted file mode 100644 index b8fb14c6..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64_256x128x128_64x64x128_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int4_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; -using LayoutDst = cutlass::layout::TensorNCxHWx<64>; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 128>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::uint4b_t, 16, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - cutlass::uint4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::uint4b_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 32, 32, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x128x128_64x64x128_2_nc64hw64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x128x128_64x64x128_2_nc64hw64.cu new file mode 100644 index 00000000..5ed7c5ee --- /dev/null +++ b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x128x128_64x64x128_2_nc64hw64.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x128x128_64x64x128_2_nc64hw64" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + cutlass::int4b_t, + cutlass::layout::TensorNCxHWx<64>, + cutlass::int4b_t, + cutlass::layout::TensorCxRSKx<64>, + cutlass::int4b_t, + cutlass::layout::TensorNCxHWx<64>, + int32_t, + cutlass::layout::TensorNCxHWx<64>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + cutlass::int4b_t, + 16, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 32, + 32, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64.cu new file mode 100644 index 00000000..2052984d --- /dev/null +++ b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + cutlass::int4b_t, + cutlass::layout::TensorNCxHWx<64>, + cutlass::int4b_t, + cutlass::layout::TensorCxRSKx<64>, + cutlass::int4b_t, + cutlass::layout::TensorNCxHWx<64>, + int32_t, + cutlass::layout::TensorNCxHWx<64>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + cutlass::int4b_t, + 16, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 32, + 32, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x128x128_64x64x128_2_nc64hw64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x128x128_64x64x128_2_nc64hw64.cu new file mode 100644 index 00000000..26acda83 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x128x128_64x64x128_2_nc64hw64.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x128x128_64x64x128_2_nc64hw64" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + cutlass::int4b_t, + cutlass::layout::TensorNCxHWx<64>, + cutlass::int4b_t, + cutlass::layout::TensorCxRSKx<64>, + cutlass::int4b_t, + cutlass::layout::TensorNCxHWx<64>, + int32_t, + cutlass::layout::TensorNCxHWx<64>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + cutlass::int4b_t, + 16, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 32, + 32, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_256x128x128_64x64x128_2_nc64hw64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_256x128x128_64x64x128_2_nc64hw64.cu new file mode 100644 index 00000000..7bbe727c --- /dev/null +++ b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_256x128x128_64x64x128_2_nc64hw64.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_256x128x128_64x64x128_2_nc64hw64" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + cutlass::int4b_t, + cutlass::layout::TensorNCxHWx<64>, + cutlass::int4b_t, + cutlass::layout::TensorCxRSKx<64>, + cutlass::int4b_t, + cutlass::layout::TensorNCxHWx<64>, + int32_t, + cutlass::layout::TensorNCxHWx<64>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + cutlass::int4b_t, + 16, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 32, + 32, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x128x128_64x64x128_2_nc64hw64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x128x128_64x64x128_2_nc64hw64.cu new file mode 100644 index 00000000..2367b398 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x128x128_64x64x128_2_nc64hw64.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x128x128_64x64x128_2_nc64hw64" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + cutlass::int4b_t, + cutlass::layout::TensorNCxHWx<64>, + cutlass::int4b_t, + cutlass::layout::TensorCxRSKx<64>, + cutlass::int4b_t, + cutlass::layout::TensorNCxHWx<64>, + int32_t, + cutlass::layout::TensorNCxHWx<64>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + cutlass::int4b_t, + 16, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 32, + 32, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_256x128x128_64x64x128_2_nc64hw64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_256x128x128_64x64x128_2_nc64hw64.cu new file mode 100644 index 00000000..3874374b --- /dev/null +++ b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_256x128x128_64x64x128_2_nc64hw64.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_256x128x128_64x64x128_2_nc64hw64" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + cutlass::int4b_t, + cutlass::layout::TensorNCxHWx<64>, + cutlass::int4b_t, + cutlass::layout::TensorCxRSKx<64>, + cutlass::int4b_t, + cutlass::layout::TensorNCxHWx<64>, + int32_t, + cutlass::layout::TensorNCxHWx<64>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + cutlass::int4b_t, + 16, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 32, + 32, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64.cu new file mode 100644 index 00000000..755c024e --- /dev/null +++ b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + cutlass::uint4b_t, + cutlass::layout::TensorNCxHWx<64>, + cutlass::int4b_t, + cutlass::layout::TensorCxRSKx<64>, + cutlass::uint4b_t, + cutlass::layout::TensorNCxHWx<64>, + int32_t, + cutlass::layout::TensorNCxHWx<64>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + cutlass::uint4b_t, + 16, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 32, + 32, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_256x128x128_64x64x128_2_nc64hw64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_256x128x128_64x64x128_2_nc64hw64.cu new file mode 100644 index 00000000..277b3308 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_256x128x128_64x64x128_2_nc64hw64.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_256x128x128_64x64x128_2_nc64hw64" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + cutlass::uint4b_t, + cutlass::layout::TensorNCxHWx<64>, + cutlass::int4b_t, + cutlass::layout::TensorCxRSKx<64>, + cutlass::uint4b_t, + cutlass::layout::TensorNCxHWx<64>, + int32_t, + cutlass::layout::TensorNCxHWx<64>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + cutlass::uint4b_t, + 16, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 32, + 32, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64.cu new file mode 100644 index 00000000..5773bc47 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + cutlass::uint4b_t, + cutlass::layout::TensorNCxHWx<64>, + cutlass::int4b_t, + cutlass::layout::TensorCxRSKx<64>, + cutlass::uint4b_t, + cutlass::layout::TensorNCxHWx<64>, + int32_t, + cutlass::layout::TensorNCxHWx<64>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + cutlass::uint4b_t, + 16, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 32, + 32, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_256x128x128_64x64x128_2_nc64hw64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_256x128x128_64x64x128_2_nc64hw64.cu new file mode 100644 index 00000000..d9f26575 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_256x128x128_64x64x128_2_nc64hw64.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_256x128x128_64x64x128_2_nc64hw64" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + cutlass::uint4b_t, + cutlass::layout::TensorNCxHWx<64>, + cutlass::int4b_t, + cutlass::layout::TensorCxRSKx<64>, + cutlass::uint4b_t, + cutlass::layout::TensorNCxHWx<64>, + int32_t, + cutlass::layout::TensorNCxHWx<64>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + cutlass::uint4b_t, + 16, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 32, + 32, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl b/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl deleted file mode 120000 index 74e039d9..00000000 --- a/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl +++ /dev/null @@ -1 +0,0 @@ -../implicit_gemm_conv_bias_cutlass_wrapper.cuinl \ No newline at end of file diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_hswish.cu deleted file mode 100644 index 0e75dbb0..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_id.cu deleted file mode 100644 index 1f0964a6..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_relu.cu deleted file mode 100644 index 8c863797..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_hswish.cu deleted file mode 100644 index 884b223d..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_id.cu deleted file mode 100644 index 9417807b..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_relu.cu deleted file mode 100644 index 4c9c078f..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_hswish.cu deleted file mode 100644 index 44f03fc7..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_id.cu deleted file mode 100644 index cc6bc57d..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_relu.cu deleted file mode 100644 index 3d8d2b25..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_hswish.cu deleted file mode 100644 index 84aecf21..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; -using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 1, 4, 8, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu deleted file mode 100644 index d728429c..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; -using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 1, 4, 8, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_relu.cu deleted file mode 100644 index 0c8787cf..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; -using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 1, 4, 8, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_hswish.cu deleted file mode 100644 index 9bc617b8..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 4, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_id.cu deleted file mode 100644 index fdf8e40d..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 4, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_relu.cu deleted file mode 100644 index 4318b3e6..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 4, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_hswish.cu deleted file mode 100644 index b8226d0a..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_id.cu deleted file mode 100644 index 6f163b84..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_relu.cu deleted file mode 100644 index 3c68f0b8..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_hswish.cu deleted file mode 100644 index f1716786..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_id.cu deleted file mode 100644 index bff1157f..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_relu.cu deleted file mode 100644 index 5ebb19e6..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_hswish.cu deleted file mode 100644 index a500504a..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_id.cu deleted file mode 100644 index 77ee9d81..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_relu.cu deleted file mode 100644 index 017aa325..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_hswish.cu deleted file mode 100644 index 2dc54353..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; -using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 1, 4, 8, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_id.cu deleted file mode 100644 index cfd9b9e4..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; -using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 1, 4, 8, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_relu.cu deleted file mode 100644 index c488e18e..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; -using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 1, 4, 8, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_hswish.cu deleted file mode 100644 index b669dda4..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 4, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_id.cu deleted file mode 100644 index 831bea3b..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 4, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_relu.cu deleted file mode 100644 index d6973c59..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 4, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_hswish.cu deleted file mode 100644 index 2285e888..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_id.cu deleted file mode 100644 index 61c67deb..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_relu.cu deleted file mode 100644 index 25a94bc2..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_hswish.cu deleted file mode 100644 index 5b98df43..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_id.cu deleted file mode 100644 index 3621b7a8..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_relu.cu deleted file mode 100644 index 720777eb..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_hswish.cu deleted file mode 100644 index ff6434d4..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_id.cu deleted file mode 100644 index 07c160fc..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_relu.cu deleted file mode 100644 index 498a7283..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_hswish.cu deleted file mode 100644 index 63e92801..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_id.cu deleted file mode 100644 index b0b3b77a..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_relu.cu deleted file mode 100644 index f9c71c46..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_hswish.cu deleted file mode 100644 index 792a7eca..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_id.cu deleted file mode 100644 index 8ae05467..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_relu.cu deleted file mode 100644 index e476dff9..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_hswish.cu deleted file mode 100644 index f35955c3..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_id.cu deleted file mode 100644 index 99e7e1e1..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_relu.cu deleted file mode 100644 index 30dd3c80..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_hswish.cu deleted file mode 100644 index 5ca19b2b..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu deleted file mode 100644 index 85a5439c..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_relu.cu deleted file mode 100644 index 62df010e..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_hswish.cu deleted file mode 100644 index b12d59ab..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_id.cu deleted file mode 100644 index d64826a4..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_relu.cu deleted file mode 100644 index 9065b5ba..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_hswish.cu deleted file mode 100644 index f603e437..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_id.cu deleted file mode 100644 index 242482b6..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_relu.cu deleted file mode 100644 index e7ff951a..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_hswish.cu deleted file mode 100644 index e2a47021..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu deleted file mode 100644 index ad33ba83..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_relu.cu deleted file mode 100644 index 428058f3..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_hswish.cu deleted file mode 100644 index 3f4143f4..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_id.cu deleted file mode 100644 index 34a8fde7..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_relu.cu deleted file mode 100644 index d8b728d9..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_hswish.cu deleted file mode 100644 index 3f98d1aa..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_id.cu deleted file mode 100644 index 81be8849..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_relu.cu deleted file mode 100644 index dc1db719..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_hswish.cu deleted file mode 100644 index 2ac836cc..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_id.cu deleted file mode 100644 index 68eed12c..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_relu.cu deleted file mode 100644 index ffb51a6e..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_hswish.cu deleted file mode 100644 index d22b597b..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_id.cu deleted file mode 100644 index 470218e3..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_relu.cu deleted file mode 100644 index a3ce8d7f..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_hswish.cu deleted file mode 100644 index 22772a46..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_id.cu deleted file mode 100644 index e8906c50..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_relu.cu deleted file mode 100644 index 86f615e6..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_hswish.cu deleted file mode 100644 index 8c114eff..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_id.cu deleted file mode 100644 index 2224882d..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_relu.cu deleted file mode 100644 index f2dab49c..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_hswish.cu deleted file mode 100644 index 4f48340b..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_id.cu deleted file mode 100644 index 1a8f6edf..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_relu.cu deleted file mode 100644 index b455d1ab..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_hswish.cu deleted file mode 100644 index 0fe44eb4..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_id.cu deleted file mode 100644 index 5a465659..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_relu.cu deleted file mode 100644 index 9f61f9eb..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_hswish.cu deleted file mode 100644 index 7318ddb7..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_id.cu deleted file mode 100644 index 88136113..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_relu.cu deleted file mode 100644 index fc25ebb9..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_hswish.cu deleted file mode 100644 index dd295018..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_id.cu deleted file mode 100644 index e4c60065..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_relu.cu deleted file mode 100644 index bd7a96f2..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_hswish.cu deleted file mode 100644 index e6beadd8..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_id.cu deleted file mode 100644 index 492bf450..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_relu.cu deleted file mode 100644 index cfd7a0aa..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_hswish.cu deleted file mode 100644 index 39e5d9c9..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_id.cu deleted file mode 100644 index f01634b6..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_relu.cu deleted file mode 100644 index 8f5f0c2d..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_hswish.cu deleted file mode 100644 index f45de4c4..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_id.cu deleted file mode 100644 index 24cfced8..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_relu.cu deleted file mode 100644 index e4b0d5cc..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_hswish.cu deleted file mode 100644 index 94766f7e..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_id.cu deleted file mode 100644 index ba550f8a..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_relu.cu deleted file mode 100644 index f2b1621f..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_hswish.cu deleted file mode 100644 index a25e67b8..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_id.cu deleted file mode 100644 index 95126f54..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_relu.cu deleted file mode 100644 index 5e1b4ae7..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_hswish.cu deleted file mode 100644 index 7d7527c1..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_id.cu deleted file mode 100644 index d99f581d..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_relu.cu deleted file mode 100644 index 402f9289..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_hswish.cu deleted file mode 100644 index 1afc7d37..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_id.cu deleted file mode 100644 index 0e04d074..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_relu.cu deleted file mode 100644 index de4c6c96..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_hswish.cu deleted file mode 100644 index e5152ee1..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_id.cu deleted file mode 100644 index 785089c5..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_relu.cu deleted file mode 100644 index f37ee01a..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_hswish.cu deleted file mode 100644 index 1bf6bdc4..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_id.cu deleted file mode 100644 index 969c6abd..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_relu.cu deleted file mode 100644 index 81e721ed..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_hswish.cu deleted file mode 100644 index dadbe768..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_id.cu deleted file mode 100644 index 41708fe8..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_relu.cu deleted file mode 100644 index f40ec82f..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_hswish.cu deleted file mode 100644 index 8e59bc1f..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_id.cu deleted file mode 100644 index 81bcc852..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_relu.cu deleted file mode 100644 index 46b7f0bd..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_hswish.cu deleted file mode 100644 index dfd3fca3..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_id.cu deleted file mode 100644 index d16971b9..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_relu.cu deleted file mode 100644 index 0c4b2e33..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_hswish.cu deleted file mode 100644 index acf35eb5..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_id.cu deleted file mode 100644 index 5de1891f..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_relu.cu deleted file mode 100644 index d5feb6b6..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_hswish.cu deleted file mode 100644 index 32419ff0..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; -using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 1, 4, 8, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_id.cu deleted file mode 100644 index 89f0c797..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; -using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 1, 4, 8, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_relu.cu deleted file mode 100644 index 1842df45..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; -using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 1, 4, 8, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_hswish.cu deleted file mode 100644 index d38226ec..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 4, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_id.cu deleted file mode 100644 index 110cf890..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 4, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_relu.cu deleted file mode 100644 index c9e3c262..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 4, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_hswish.cu deleted file mode 100644 index 02359e41..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_id.cu deleted file mode 100644 index e09dab8c..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_relu.cu deleted file mode 100644 index 20c5ab71..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_hswish.cu deleted file mode 100644 index a886598c..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_id.cu deleted file mode 100644 index 24734b61..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_relu.cu deleted file mode 100644 index 068a3483..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_hswish.cu deleted file mode 100644 index ccc3e424..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_id.cu deleted file mode 100644 index cc83c6f6..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_relu.cu deleted file mode 100644 index 6e6e2b47..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_hswish.cu deleted file mode 100644 index 55eb7570..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; -using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 1, 4, 8, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_id.cu deleted file mode 100644 index a95fa4dd..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; -using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 1, 4, 8, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_relu.cu deleted file mode 100644 index 38b55080..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; -using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 1, 4, 8, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_hswish.cu deleted file mode 100644 index 28e04a6c..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 4, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_id.cu deleted file mode 100644 index eb20b124..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 4, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_relu.cu deleted file mode 100644 index fe078673..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 4, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_hswish.cu deleted file mode 100644 index 3920d3d0..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_id.cu deleted file mode 100644 index 42094592..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_relu.cu deleted file mode 100644 index 8eb832ba..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_hswish.cu deleted file mode 100644 index 75cc5260..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_id.cu deleted file mode 100644 index 50d7a493..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_relu.cu deleted file mode 100644 index 7736c506..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_hswish.cu deleted file mode 100644 index aa1b8f53..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_id.cu deleted file mode 100644 index 060148d2..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_relu.cu deleted file mode 100644 index 719de8b9..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_hswish.cu deleted file mode 100644 index 8fd2174e..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_id.cu deleted file mode 100644 index 5bee93f7..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_relu.cu deleted file mode 100644 index ee9f02bd..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_hswish.cu deleted file mode 100644 index 589e7cc9..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_id.cu deleted file mode 100644 index 72f32c43..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_relu.cu deleted file mode 100644 index ed025e10..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_hswish.cu deleted file mode 100644 index f982327e..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_id.cu deleted file mode 100644 index 1e8dd6e2..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_relu.cu deleted file mode 100644 index 8c2e92ec..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, false, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_hswish.cu deleted file mode 100644 index ef5a7283..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_id.cu deleted file mode 100644 index 5b5975e2..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_relu.cu deleted file mode 100644 index f9d8b8dd..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_hswish.cu deleted file mode 100644 index a24d15e1..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_id.cu deleted file mode 100644 index e9dc83e2..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_relu.cu deleted file mode 100644 index 7d9fa9d6..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_hswish.cu deleted file mode 100644 index 4c29ade6..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_id.cu deleted file mode 100644 index 18b67eb9..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_relu.cu deleted file mode 100644 index 23df9001..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_hswish.cu deleted file mode 100644 index c28bce2e..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_id.cu deleted file mode 100644 index 18d790d3..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_relu.cu deleted file mode 100644 index fe8a8eab..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_hswish.cu deleted file mode 100644 index 791f2644..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_id.cu deleted file mode 100644 index a0cb8ba6..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_relu.cu deleted file mode 100644 index 1fbebd0d..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_hswish.cu deleted file mode 100644 index a1658f73..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_id.cu deleted file mode 100644 index 1cb1f1cc..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_relu.cu deleted file mode 100644 index 52045ba4..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; -using LayoutDst = cutlass::layout::TensorNCHW; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, 1, int32_t, float, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, float, - LayoutDst, float, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 4, 16, true, - cutlass::arch::OpMultiplyAdd>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..e4e04e84 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..228a55f3 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..42e7b217 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu new file mode 100644 index 00000000..170b469b --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<16, 128, 16>, + cutlass::gemm::GemmShape<16, 128, 16>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 1, + 4, + 8, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..14394cdb --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 4, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..1535407a --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..4e41d46c --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..eb779990 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..2cf18b92 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..70d1149d --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..fca726f2 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..6f3c2bf7 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombination< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..3e7dc036 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombination< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..7aa18106 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombination< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu new file mode 100644 index 00000000..b1c7e2e8 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<16, 128, 16>, + cutlass::gemm::GemmShape<16, 128, 16>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombination< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 1, + 4, + 8, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..51b82ca7 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombination< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 4, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..ab877a14 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombination< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..6fa4962b --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombination< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..b3db80f3 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombination< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..b71c0446 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombination< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..a54b21f2 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombination< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..e8a7ea07 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombination< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..1e2500e1 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..1b139e63 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..5ce46a97 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu new file mode 100644 index 00000000..64669859 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<16, 128, 16>, + cutlass::gemm::GemmShape<16, 128, 16>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 1, + 4, + 8, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..c1bf6561 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 4, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..8145c4ed --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..818d658f --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..21d06921 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..4113852b --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..ca6df441 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..5df08dfa --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..7ae71bac --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..a1c93fbc --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..b4289ce0 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu new file mode 100644 index 00000000..230c1530 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<16, 128, 16>, + cutlass::gemm::GemmShape<16, 128, 16>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 1, + 4, + 8, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..931c330a --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 4, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..a22f895f --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..16e07bb8 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..bbf08a89 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..180f9490 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..da81563f --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..70db02ba --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..9cb0312b --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombination< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..9dcbc0e9 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombination< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..aa06ab81 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombination< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu new file mode 100644 index 00000000..e2ee8d5b --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<16, 128, 16>, + cutlass::gemm::GemmShape<16, 128, 16>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombination< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 1, + 4, + 8, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..b6e41082 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombination< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 4, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..1563792c --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombination< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..5dfc96fa --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombination< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..f0f7aa16 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombination< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..84fab7d2 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombination< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..d26dfab1 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombination< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..05ab9918 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombination< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..26a9fff7 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..514d6798 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..c9537b49 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu new file mode 100644 index 00000000..d3a814d8 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<16, 128, 16>, + cutlass::gemm::GemmShape<16, 128, 16>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 1, + 4, + 8, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..7446a445 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 4, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..842249dd --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..b955bfed --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..48ed0e5b --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..36a1438a --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..7bfded1c --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu new file mode 100644 index 00000000..509dbff0 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_f32_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nchw" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + float, + cutlass::layout::TensorNCHW, + float, + cutlass::layout::TensorNCHW, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + float, + 1, + int32_t, + float, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu index 4af7d5a5..5f413f7a 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu index f60407ca..b3e51ecf 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu index 71218fb4..f1fe8fee 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu index b69d7161..2c11c495 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 8, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu index db3398a5..ae9957f3 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 4, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu index 30a823bd..fa7eab08 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu index c186f0ea..b9af9eac 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu index cbb2f545..8f63112e 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu index 4d025f49..9900626d 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu index d88ea361..0c48730e 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu index 60bc548f..3ec3a99d 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu index a793fdef..abcbbe0f 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu index 41eba0cb..15c6b1bd 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu index b6c4af9a..f9923db3 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu index adb9b856..cd4b2c1c 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 8, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu index d4b3ed4e..6ce490e1 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 4, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu index 154f43e9..cb6a447d 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu index 4df8e99b..4638be48 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu index 14c996d0..00b18304 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu index 32b5cc57..7d62cdee 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu index 3f964980..bde2cae9 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu index e29625af..e86ebdbf 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu index 89a9158f..8f017402 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu index aef15daa..270f05b3 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu index e16227c5..ad695f14 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu index 346ca7f6..b2d58453 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 8, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu index 810561c6..d1ca3656 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 4, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu index 803820bf..9923bb4b 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu index d7f9d9c3..788940cb 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu index 68113c77..bbae6c49 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu index 902fdab8..65e571ff 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu index 65ec5576..3b30ce4d 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu index b51e432d..dabc15a9 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..1a6b0c28 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..6a0c4006 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..23f650f3 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..09359814 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..1ae71af5 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..c0039b40 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4.cu new file mode 100644 index 00000000..b33e34fa --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<16, 128, 16>, + cutlass::gemm::GemmShape<16, 128, 16>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 1, + 4, + 8, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4.cu new file mode 100644 index 00000000..e6a7fd33 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 4, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4.cu new file mode 100644 index 00000000..b615514f --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..5df14d22 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..245921f0 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..a5d7831e --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4.cu new file mode 100644 index 00000000..487b2b12 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..99f78418 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..e00d4cf5 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..2af0e384 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..c85127be --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..3d675e63 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..aef4c3dc --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..98aaf39a --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..07f655c7 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..5e4591f1 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..f5db58c8 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..7119977b --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..0d2dce3d --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..2b283b35 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4.cu new file mode 100644 index 00000000..c3d3fc96 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<16, 128, 16>, + cutlass::gemm::GemmShape<16, 128, 16>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 1, + 4, + 8, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4.cu new file mode 100644 index 00000000..850ff01b --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 4, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4.cu new file mode 100644 index 00000000..37714634 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..3634e7f9 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..4fa7e224 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..167b18ff --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4.cu new file mode 100644 index 00000000..649d3ca6 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..6fcfcde4 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..6990d171 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..41e38dfe --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..8bb0d7a7 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..d0f8a53f --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..a98f7441 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..d524567f --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..371ca3f3 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..21deb1d2 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..197da7dd --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..3108af79 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..e20c66b2 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..681d138b --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4.cu new file mode 100644 index 00000000..6726c54e --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<16, 128, 16>, + cutlass::gemm::GemmShape<16, 128, 16>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 1, + 4, + 8, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4.cu new file mode 100644 index 00000000..e5ebd279 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 4, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4.cu new file mode 100644 index 00000000..d3777052 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..267c16d8 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..8fd7746d --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..56bb24fe --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4.cu new file mode 100644 index 00000000..adbdd0c4 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..602fbd42 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..52a7d863 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..029d6c07 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..97e8c50f --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..264e20d9 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..c2a7d416 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..9e372498 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + false, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..cf813526 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..b7ff053a --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..52e59525 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..e943d4a4 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..048db4ad --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..c1188478 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4.cu new file mode 100644 index 00000000..8419e723 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<16, 128, 16>, + cutlass::gemm::GemmShape<16, 128, 16>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 1, + 4, + 8, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4.cu new file mode 100644 index 00000000..e4e4cd1b --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 4, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4.cu new file mode 100644 index 00000000..5567c7d8 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..aa24cf72 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..5bce7038 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..a0e5066b --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4.cu new file mode 100644 index 00000000..087e8189 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..57ddadc2 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..44120943 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..204ba5d0 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..38ced937 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..72736484 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..d0b49333 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..b0373611 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..0992c97b --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..9a37bc4b --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..1354f552 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..55927c78 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..3c1f051d --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..31233faf --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4.cu new file mode 100644 index 00000000..6adbbb06 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<16, 128, 16>, + cutlass::gemm::GemmShape<16, 128, 16>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 1, + 4, + 8, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4.cu new file mode 100644 index 00000000..fd247cf8 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 4, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4.cu new file mode 100644 index 00000000..9096edb5 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..4f6ce00b --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..4eb167ca --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..5ea27ad9 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4.cu new file mode 100644 index 00000000..428d6cac --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..677c5124 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..fc5ed0bc --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..6d9e2533 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..acfdecd2 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..47928b7a --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..a077119b --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..7eb3b6ef --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..67f88e60 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..e58863fa --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..2afb8753 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..246dce88 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..2a5a1643 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..f11c76bf --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4.cu new file mode 100644 index 00000000..8f591465 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<16, 128, 16>, + cutlass::gemm::GemmShape<16, 128, 16>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 1, + 4, + 8, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4.cu new file mode 100644 index 00000000..60a43259 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 4, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4.cu new file mode 100644 index 00000000..b890da51 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..3fd2cd72 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..1e6323ed --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..f13f09ae --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4.cu new file mode 100644 index 00000000..4fab26dd --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..6a9a80b3 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..f150bf32 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..545fb259 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..3592fbcf --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..fd25f520 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4.cu new file mode 100644 index 00000000..b2ca7d5c --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu new file mode 100644 index 00000000..9c7813d8 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm61, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAdd>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu index 05384974..a8815802 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu index 9b45e053..3d2e4396 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu index da56c4af..0ec64ab7 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu index fb31c9d5..c5f51e5d 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 8, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu index 2f861533..d4e66f20 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 4, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu index ed4ff282..7af638ac 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu index 5a3898bd..291f63ba 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu index 0d0c034a..4782581d 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu index 29a4bc98..504bcdff 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu index 358017ff..427608a9 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu index f913d6a0..4e10da32 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu index 65bf4569..0761465a 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu index 35a155c0..38789e9f 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu index b778d11f..1b34206f 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu index d25af069..79a0ebbf 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 8, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu index 5edf953d..e527fc27 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 4, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu index c52e7751..1fd9b456 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu index 153e6713..c9e707cc 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu index 5e3ec931..3ce84e92 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu index fbbc7b8e..c4fe56c7 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu index efbd0b39..72ca8757 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu index 6bd54281..7e44eb40 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu index b417533b..26ca4d5c 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu index 23bfd1d2..a3cdef8b 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu index a0ea06e1..3be02f58 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu index bfaab36c..b27c4b29 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 8, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu index 072804b6..43d7beeb 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 4, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu index b5c634d6..893c0afa 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu index 9ccac5c9..afc5ccc6 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu index a60595a3..b5bbead4 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu index 6cf4781c..19d24977 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu index affc60c8..3dffec88 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu index 7cb844de..9d025a4e 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu @@ -37,7 +37,7 @@ using Convolution = 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_hswish.cu deleted file mode 100644 index 782717f5..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_id.cu deleted file mode 100644 index 82beee09..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_relu.cu deleted file mode 100644 index daad306d..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_hswish.cu deleted file mode 100644 index 0f33a6ea..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 256, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_id.cu deleted file mode 100644 index 0c941294..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 256, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_relu.cu deleted file mode 100644 index 8ebe8a25..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 256, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_hswish.cu deleted file mode 100644 index d5e3ff9e..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_id.cu deleted file mode 100644 index 3b5ca9cf..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_relu.cu deleted file mode 100644 index 04b23694..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_hswish.cu deleted file mode 100644 index fe3a0ecd..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_id.cu deleted file mode 100644 index 50ce1717..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_relu.cu deleted file mode 100644 index af2c603c..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_hswish.cu deleted file mode 100644 index b3856194..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 256, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_id.cu deleted file mode 100644 index c3ae319f..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 256, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_relu.cu deleted file mode 100644 index 0c2aeef1..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 256, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_hswish.cu deleted file mode 100644 index 5d763bf7..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_id.cu deleted file mode 100644 index 55b4bf1a..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_relu.cu deleted file mode 100644 index a2324d7e..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_hswish.cu deleted file mode 100644 index 188100ea..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_id.cu deleted file mode 100644 index ae13454f..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_relu.cu deleted file mode 100644 index 50c922ed..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_hswish.cu deleted file mode 100644 index 18a28e77..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<32, 16, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_id.cu deleted file mode 100644 index 81aaf985..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<32, 16, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_relu.cu deleted file mode 100644 index a5a28e66..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<32, 16, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_hswish.cu deleted file mode 100644 index 4f392700..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_id.cu deleted file mode 100644 index d57be3e4..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_relu.cu deleted file mode 100644 index 986c50ea..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_hswish.cu deleted file mode 100644 index c7fc2eb9..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_id.cu deleted file mode 100644 index 9130efa7..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_relu.cu deleted file mode 100644 index f5f7935f..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_hswish.cu deleted file mode 100644 index c7965a5e..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_id.cu deleted file mode 100644 index f59e303f..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_relu.cu deleted file mode 100644 index 714c2eef..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_hswish.cu deleted file mode 100644 index 15da7001..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<32, 16, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_id.cu deleted file mode 100644 index 71d9935c..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<32, 16, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_relu.cu deleted file mode 100644 index a8c52b77..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<32, 16, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_hswish.cu deleted file mode 100644 index 12690457..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_id.cu deleted file mode 100644 index ef3af676..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_relu.cu deleted file mode 100644 index 85cb8d05..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_hswish.cu deleted file mode 100644 index a43c9740..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_id.cu deleted file mode 100644 index d8e05bcf..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_relu.cu deleted file mode 100644 index 928381b9..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<32>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_hswish.cu deleted file mode 100644 index 2d7b2e6b..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_id.cu deleted file mode 100644 index fcc0ee3c..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_relu.cu deleted file mode 100644 index c301395e..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_hswish.cu deleted file mode 100644 index 38562844..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 256, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_id.cu deleted file mode 100644 index eebfa8f7..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 256, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_relu.cu deleted file mode 100644 index 7e9db5d5..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 256, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_hswish.cu deleted file mode 100644 index bf2267b9..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_id.cu deleted file mode 100644 index 88047e92..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_relu.cu deleted file mode 100644 index 05a68af2..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_hswish.cu deleted file mode 100644 index 0fdb08eb..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_id.cu deleted file mode 100644 index f7737015..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_relu.cu deleted file mode 100644 index d0ec979c..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_hswish.cu deleted file mode 100644 index 19ef5ff0..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 256, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_id.cu deleted file mode 100644 index 512b1cf3..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 256, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_relu.cu deleted file mode 100644 index 34ac69ef..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 256, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_hswish.cu deleted file mode 100644 index 2b021b30..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_id.cu deleted file mode 100644 index 1bf793ff..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_relu.cu deleted file mode 100644 index 3d3fb71d..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_hswish.cu deleted file mode 100644 index d3ad572e..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_id.cu deleted file mode 100644 index 7382553f..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_relu.cu deleted file mode 100644 index a4160f2d..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_hswish.cu deleted file mode 100644 index 81481ab3..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<16, 32, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_id.cu deleted file mode 100644 index 73d90af2..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<16, 32, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_relu.cu deleted file mode 100644 index 6312d23c..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<16, 32, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_hswish.cu deleted file mode 100644 index 13b7bb4e..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_id.cu deleted file mode 100644 index ab533076..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_relu.cu deleted file mode 100644 index e47929cc..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_hswish.cu deleted file mode 100644 index 02c45ed4..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_id.cu deleted file mode 100644 index b20ae9a5..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_relu.cu deleted file mode 100644 index 04419993..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_hswish.cu deleted file mode 100644 index 69f2369b..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_id.cu deleted file mode 100644 index 936cfd40..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_relu.cu deleted file mode 100644 index 7b54ec47..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_hswish.cu deleted file mode 100644 index eb38598e..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<16, 32, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_id.cu deleted file mode 100644 index 8bb5a17d..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<16, 32, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_relu.cu deleted file mode 100644 index 335442f6..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<16, 32, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_hswish.cu deleted file mode 100644 index f4bf5611..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_id.cu deleted file mode 100644 index e85cb26f..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_relu.cu deleted file mode 100644 index f8ce9147..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 64>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_hswish.cu deleted file mode 100644 index a91494e5..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_hswish.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_id.cu deleted file mode 100644 index 153c1bdf..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_id.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_relu.cu deleted file mode 100644 index 1518e4b6..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_relu.cu +++ /dev/null @@ -1,36 +0,0 @@ -#if !MEGDNN_TEGRA_X1 -// generated by gen_cuda_conv_bias_int8_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" - -using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; -using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; -using LayoutDst = cutlass::layout::TensorNCxHWx<4>; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 64>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; -using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; -using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::conv::device::Convolution< - int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, 16, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, typename Convolution::ExtraParam extra_param); -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32.cu new file mode 100644 index 00000000..167422bf --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..14a01213 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32.cu new file mode 100644 index 00000000..5cfa3b57 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..74c16e57 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32.cu new file mode 100644 index 00000000..0454cb47 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..638946d6 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32.cu new file mode 100644 index 00000000..a21d6ad3 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..72ed7bed --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..2e5ab843 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_32x16x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_32x16x64_2_nc32hw32.cu new file mode 100644 index 00000000..be006ff8 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_32x16x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_32x16x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<32, 16, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32.cu new file mode 100644 index 00000000..2f7c8265 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..4b05fa6e --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32.cu new file mode 100644 index 00000000..0c54c5e9 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..0662b46a --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32.cu new file mode 100644 index 00000000..b8421162 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..ae73171e --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32.cu new file mode 100644 index 00000000..03f894cd --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..b5379a2f --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32.cu new file mode 100644 index 00000000..5d6380f1 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..3ea95e5f --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32.cu new file mode 100644 index 00000000..bfbcfc00 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..b688db3c --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..63f14470 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_32x16x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_32x16x64_2_nc32hw32.cu new file mode 100644 index 00000000..08f6849b --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_32x16x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_32x16x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<32, 16, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32.cu new file mode 100644 index 00000000..6f835a47 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..366677b4 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32.cu new file mode 100644 index 00000000..cc2b54cf --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..82de0bee --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32.cu new file mode 100644 index 00000000..01f87212 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..78df98c3 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32.cu new file mode 100644 index 00000000..400a2ffe --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..f327f180 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32.cu new file mode 100644 index 00000000..825c44f3 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..0c52462d --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32.cu new file mode 100644 index 00000000..f835d111 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..e2ee3f2b --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..83644d87 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_32x16x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_32x16x64_2_nc32hw32.cu new file mode 100644 index 00000000..462f9a75 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_32x16x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_32x16x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<32, 16, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32.cu new file mode 100644 index 00000000..d778c9a0 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..50a93f12 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32.cu new file mode 100644 index 00000000..be55de21 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..cbeb9445 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32.cu new file mode 100644 index 00000000..fb1d5c21 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..381141f5 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32.cu new file mode 100644 index 00000000..3f425729 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..e78bddfc --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32.cu new file mode 100644 index 00000000..59091861 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..810e07f2 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32.cu new file mode 100644 index 00000000..d18b4975 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..3faea271 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..3d8cd19a --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_32x16x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_32x16x64_2_nc32hw32.cu new file mode 100644 index 00000000..71c6b1d5 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_32x16x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_32x16x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<32, 16, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32.cu new file mode 100644 index 00000000..72ba2cc6 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..9bc72ab2 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32.cu new file mode 100644 index 00000000..d0d5c250 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..105e4477 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32.cu new file mode 100644 index 00000000..9dd98512 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..cdbbc226 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32.cu new file mode 100644 index 00000000..3abb44da --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..dbf50af0 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32.cu new file mode 100644 index 00000000..0f86dd24 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..cd261416 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32.cu new file mode 100644 index 00000000..1d3ac372 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..5d7c53b4 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..fb7572d1 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_32x16x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_32x16x64_2_nc32hw32.cu new file mode 100644 index 00000000..29198b66 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_32x16x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_32x16x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<32, 16, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32.cu new file mode 100644 index 00000000..28add755 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..76492138 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32.cu new file mode 100644 index 00000000..720b1d78 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..cb6f23ad --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32.cu new file mode 100644 index 00000000..0b576a2d --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..a49609d6 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32.cu new file mode 100644 index 00000000..8eb24cd9 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..ce73b2c3 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32.cu new file mode 100644 index 00000000..15407bb8 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..727839b6 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32.cu new file mode 100644 index 00000000..3c0428f2 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..14f1f1e9 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..141f4e6c --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_32x16x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_32x16x64_2_nc32hw32.cu new file mode 100644 index 00000000..ae6ce3b8 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_32x16x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_32x16x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<32, 16, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32.cu new file mode 100644 index 00000000..dabb3d06 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..1d7ad4cc --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32.cu new file mode 100644 index 00000000..557459c5 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::layout::TensorNCxHWx<32>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu new file mode 100644 index 00000000..be153e83 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<32>, + int8_t, + cutlass::layout::TensorCxRSKx<32>, + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::layout::TensorNCxHWx<4>, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, + 4, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 16, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu b/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu index 296c397a..d3d701ea 100644 --- a/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu +++ b/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu @@ -21,7 +21,7 @@ using Deconvolution = cutlass::conv::device::Deconvolution< ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::conv::threadblock::ConvolutionDgradNCxHWxThreadblockSwizzle, 1, 4, 8, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_deconvolution_wrapper( const typename Deconvolution::ElementSrc* d_src, const typename Deconvolution::ElementFilter* d_filter, diff --git a/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x64x16_id.cu b/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x64x16_id.cu index 57730346..934d3264 100644 --- a/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x64x16_id.cu +++ b/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x64x16_id.cu @@ -21,7 +21,7 @@ using Deconvolution = cutlass::conv::device::Deconvolution< ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::conv::threadblock::ConvolutionDgradNCxHWxThreadblockSwizzle, 2, 4, 4, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_deconvolution_wrapper( const typename Deconvolution::ElementSrc* d_src, const typename Deconvolution::ElementFilter* d_filter, diff --git a/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_id.cu b/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_id.cu index a22525cd..28512b49 100644 --- a/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_id.cu +++ b/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_id.cu @@ -21,7 +21,7 @@ using Deconvolution = cutlass::conv::device::Deconvolution< ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::conv::threadblock::ConvolutionDgradNCxHWxThreadblockSwizzle, 2, 4, 4, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_deconvolution_wrapper( const typename Deconvolution::ElementSrc* d_src, const typename Deconvolution::ElementFilter* d_filter, diff --git a/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu b/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu index 400b5db2..3fdf2832 100644 --- a/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu +++ b/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu @@ -21,7 +21,7 @@ using Deconvolution = cutlass::conv::device::Deconvolution< ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::conv::threadblock::ConvolutionDgradNCxHWxThreadblockSwizzle, 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_deconvolution_wrapper( const typename Deconvolution::ElementSrc* d_src, const typename Deconvolution::ElementFilter* d_filter, diff --git a/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu b/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu index c149a8e1..20ba90c3 100644 --- a/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu +++ b/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu @@ -21,7 +21,7 @@ using Deconvolution = cutlass::conv::device::Deconvolution< ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::conv::threadblock::ConvolutionDgradNCxHWxThreadblockSwizzle, 2, 4, 16, true, - cutlass::arch::OpMultiplyAddSaturate>; + cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_deconvolution_wrapper( const typename Deconvolution::ElementSrc* d_src, const typename Deconvolution::ElementFilter* d_filter,