diff --git a/dnn/scripts/Makefile b/dnn/scripts/Makefile index 88076331..e852e7b4 100644 --- a/dnn/scripts/Makefile +++ b/dnn/scripts/Makefile @@ -37,21 +37,21 @@ all: ${PARAM_DEFS} ${ELEMWISE_IMPL} ${CUDA_CONV_IMPL} $(CUDA_MATMUL_IMPL) ../src/cuda/elemwise_multi_type/kimpl: gen_elemwise_multi_type_kern_impls.py ./$^ --type cuda $@ -../src/cuda/conv_bias/int8/kimpl: gen_cuda_conv_bias_kern_impls.py cutlass_generator +../src/cuda/conv_bias/int8/kimpl: gen_cuda_conv_bias_kern_impls.py cutlass_generator/generator.py ./gen_cuda_conv_bias_kern_impls.py --type dp4a $@ - python3 ./cutlass_generator/generator.py --operations all --type simt $@ + python3 ./cutlass_generator/generator.py --operations conv2d --type simt $@ -../src/cuda/conv_bias/int8_imma/kimpl: gen_cuda_conv_bias_kern_impls.py cutlass_generator +../src/cuda/conv_bias/int8_imma/kimpl: gen_cuda_conv_bias_kern_impls.py cutlass_generator/generator.py ./gen_cuda_conv_bias_kern_impls.py --type imma $@ python3 ./cutlass_generator/generator.py --operations conv2d --type tensorop8816 $@ ../src/cuda/batch_conv_bias/int8/kimpl: gen_cuda_batch_conv_bias_kern_impls.py ./$^ --type dp4a $@ -../src/cuda/matrix_mul/fp32_simt/kimpl: gen_cutlass_matmul_kern_impls.py - ./$^ $@ +../src/cuda/matrix_mul/fp32_simt/kimpl: cutlass_generator/generator.py + python3 ./cutlass_generator/generator.py --operations gemm --type simt $@ -../src/cuda/matrix_mul/fp32_simt_gemv/kimpl: gen_cutlass_gemv_batched_strided_kern_impls.py - ./$^ $@ +../src/cuda/matrix_mul/fp32_simt_gemv/kimpl: cutlass_generator/generator.py + python3 ./cutlass_generator/generator.py --operations gemv --type simt $@ .PHONY: all diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu index e4e04e84..828ad12a 100644 --- 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu index 228a55f3..5fb1087f 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename 
Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu index 42e7b217..60ed11c6 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t 
stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu index 170b469b..c6b29e41 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + 
typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu index 14394cdb..57c1f91a 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu 
index 1535407a..f97f8c72 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu index 4e41d46c..c9346d31 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void 
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu index eb779990..82895299 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename 
Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu index 2cf18b92..152a17be 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + 
typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu index 70d1149d..eab74ead 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu index fca726f2..c05e4e9d 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu index 6f3c2bf7..e5e41b33 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 
+40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu index 3e7dc036..35c48f8e 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - 
int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu index 7aa18106..50b324d0 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const 
typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu index b1c7e2e8..8e749a6c 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu index 51b82ca7..a81e5ae8 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu index ab877a14..66bfdd0a 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu +++ 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu index 6fa4962b..0e0f2263 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename 
Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu index b3db80f3..f06c69bd 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* 
d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu index b71c0446..7f43a2c2 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename 
Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu index a54b21f2..95ef3609 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu index e8a7ea07..1e2a14b8 100644 --- 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu index 1e2500e1..7106d1f7 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename 
Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu index 1b139e63..ecf89ca6 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - 
typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu index 5ce46a97..c1d06a19 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename 
Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu index 64669859..eeb92cda 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu index 
c1bf6561..defd1f2d 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu index 8145c4ed..79f65f6c 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const 
typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu index 818d658f..cff3a8df 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t 
stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu index 21d06921..0b56bcf5 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename 
Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu index 4113852b..e4d88dd9 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu index 
ca6df441..ea872272 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu index 5df08dfa..6ab659be 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const 
typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu index 7ae71bac..742ed499 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t 
stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu index a1c93fbc..049b42c3 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename 
Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu index b4289ce0..79b40aad 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu index 
230c1530..3072bdb4 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu index 931c330a..842ca304 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename 
Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu index a22f895f..d2c18579 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename 
Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu index 16e07bb8..831e696c 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params 
const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu index bbf08a89..302935c5 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu index 180f9490..26c6ada9 100644 --- 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu index da81563f..3b419be3 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - 
const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu index 70db02ba..e99a6852 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); 
+ const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu index 9cb0312b..a4fad1bf 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + 
cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu index 9dcbc0e9..e3521828 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu index aa06ab81..fbf8c3b9 100644 --- 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu index e2ee8d5b..73cd3839 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename 
Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu index b6e41082..b3d47b02 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename 
Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu index 1563792c..2f26acf8 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename 
Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu index 5dfc96fa..7b9219be 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu index 
f0f7aa16..e5b3eb1e 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu index 84fab7d2..2cd897a8 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - 
const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu index d26dfab1..29e25395 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t 
stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu index 05ab9918..386de077 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename 
Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu index 26a9fff7..27bdaeb5 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu index 514d6798..f3f73e94 
100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu index c9537b49..2bbee6fd 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* 
d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu index d3a814d8..ba873407 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam 
extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu index 7446a445..09082101 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, 
+ typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu index 842249dd..84b1ffba 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu index b955bfed..28a74223 100644 --- 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu index 48ed0e5b..fafc7a18 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const 
typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu index 36a1438a..8e1835f8 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const 
typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu index 7bfded1c..a5f3721e 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename 
Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu index 509dbff0..1401390e 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nchw.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu index 5f413f7a..41d64cb9 100644 --- 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu index b3e51ecf..a41c0294 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - 
const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu index f1fe8fee..566dc284 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); 
+ const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu index 2c11c495..2467cc4e 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t 
stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu index ae9957f3..c4d1697e 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu index fa7eab08..0514f278 100644 --- 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu index b9af9eac..a86e72b1 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const 
typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu index 8f63112e..ae88eaf2 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const 
typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu index 9900626d..ab6c8b00 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + 
typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu index 0c48730e..3ff004f9 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu index 3ec3a99d..de94428e 100644 --- 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu index abcbbe0f..86859c9a 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* 
d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu index 15c6b1bd..724f1026 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename 
Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu index f9923db3..7b28b0d5 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename 
Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu index cd4b2c1c..2f5bfcfc 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu index 
6ce490e1..e5e7a144 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu index cb6a447d..089cb208 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename 
Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu index 4638be48..1bd0a97f 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename 
Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu index 00b18304..4a5a33fc 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename 
Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu index 7d62cdee..c20d5847 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu index 
bde2cae9..747219ed 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu index e86ebdbf..a7b322a5 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename 
Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu index 8f017402..fe01d0b8 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename 
Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu index 270f05b3..1737acc2 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& 
epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu index ad695f14..e8f671ce 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu index b2d58453..d4bcd815 100644 --- 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu index d1ca3656..a925dde6 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename 
Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu index 9923bb4b..fe55edfe 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename 
Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu index 788940cb..9f5e15d5 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename 
Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu index bbae6c49..03b3dd1e 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu index 65e571ff..df33dc24 100644 --- 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu index 3b30ce4d..2cd0a451 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename 
Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu index dabc15a9..c92680aa 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename 
Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4.cu index 1a6b0c28..eb5f8857 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename 
Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu index 6a0c4006..119ca59e 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4.cu index 23f650f3..f06f8f31 100644 --- 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu index 09359814..ec713ff7 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename 
Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4.cu index 1ae71af5..9affaa3c 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename 
Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu index c0039b40..92afe974 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename 
Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4.cu index b33e34fa..9e331632 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4.cu index e6a7fd33..026d72a9 100644 --- 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4.cu index b615514f..f9311df3 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const 
typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu index 5df14d22..65ae55c5 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename 
Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4.cu index 245921f0..d9267ca2 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& 
epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu index a5d7831e..cb957cd3 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4.cu index 487b2b12..53f215f1 100644 --- 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu index 99f78418..c865ecbc 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename 
Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4.cu index e00d4cf5..a6fcb3eb 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename 
Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu index 2af0e384..0008c79f 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename 
Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4.cu index c85127be..36d4ab6e 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu index 
3d675e63..707e4e72 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4.cu index aef4c3dc..77b89b03 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename 
Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu index 98aaf39a..287d21c2 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - 
cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4.cu index 07f655c7..cd828293 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + 
typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu index 5e4591f1..06575697 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4.cu 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4.cu index f5db58c8..27f290f8 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu index 7119977b..d3cfd4b3 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 
+40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4.cu index 0d2dce3d..5da3327b 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - 
typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu index 2b283b35..869fac22 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename 
Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4.cu index c3d3fc96..a8db4021 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4.cu index 850ff01b..c0c23080 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4.cu index 37714634..9271b3b3 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4.cu +++ 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu index 3634e7f9..0e37f1c7 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename 
Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4.cu index 4fa7e224..8d5d9743 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename 
Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu index 167b18ff..ad4aca7b 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename 
Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4.cu index 649d3ca6..27263d12 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu index 6fcfcde4..f1b452ea 100644 --- 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4.cu index 6990d171..859887dd 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename 
Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu index 41e38dfe..a377250d 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& 
epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4.cu index 8bb0d7a7..7b259d59 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + 
typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu index d0f8a53f..73a472e7 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4.cu 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4.cu index a98f7441..8e9ad219 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu index d524567f..9a22a896 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ 
using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4.cu index 371ca3f3..948d86fa 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename 
Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu index 21deb1d2..d03293a2 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename 
Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4.cu index 197da7dd..54c059ad 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu index 3108af79..0ae92d4f 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4.cu index e20c66b2..abe5e521 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4.cu +++ 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu index 681d138b..c1b497ac 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename 
Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4.cu index 6726c54e..cfd0926b 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename 
Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4.cu index e5ebd279..bb989bf7 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif 
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4.cu index d3777052..c48fb744 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu index 267c16d8..68f9d8cc 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu +++ 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4.cu index 8fd7746d..90a17d4d 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename 
Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu index 56bb24fe..b4094622 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* 
d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4.cu index adbdd0c4..05e3e2b6 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu index 602fbd42..ba2ef184 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4.cu index 52a7d863..ed2e4f27 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4.cu +++ 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu index 029d6c07..e2141044 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename 
Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4.cu index 97e8c50f..06e5b01f 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename 
Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu index 264e20d9..9b5a5b06 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam 
extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4.cu index c2a7d416..4fde39a8 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu index 9e372498..8b13b712 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu 
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4.cu index cf813526..8b8f3ac4 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const 
typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu index b7ff053a..c4a85c47 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename 
Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4.cu index 52e59525..971d5016 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop 
#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu index e943d4a4..5d5fb2c6 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4.cu index 048db4ad..ee0c2a2e 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4.cu +++ 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu index c1188478..26d1bd97 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* 
d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4.cu index 8419e723..68585a3a 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const 
typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4.cu index e4e4cd1b..6111f919 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4.cu index 5567c7d8..68794f6b 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu index aa24cf72..16e415f6 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu +++ 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4.cu index 5bce7038..01b92bc1 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename 
Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu index a0e5066b..d0620ca0 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + 
const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4.cu index 087e8189..5cdcce09 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu index 57ddadc2..a633169a 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4.cu index 44120943..9e7ae5fd 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4.cu +++ 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu index 204ba5d0..5897f989 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* 
d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4.cu index 38ced937..9c856339 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const 
typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu index 72736484..3277092d 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4.cu index d0b49333..a1f282e9 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu index b0373611..ba6e5045 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4.cu index 0992c97b..812274a7 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const 
typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu index 9a37bc4b..d37d0edb 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename 
Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4.cu index 1354f552..837f1704 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic 
pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu index 55927c78..f7856b22 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4.cu index 3c1f051d..341fd2e5 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4.cu +++ 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu index 31233faf..a318b38c 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename 
Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4.cu index 6adbbb06..8e017652 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename 
Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4.cu index fd247cf8..79e3d032 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif 
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4.cu index 9096edb5..00847daa 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu index 4f6ce00b..a9aa737e 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu +++ 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4.cu index 4eb167ca..e3c88e3d 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename 
Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu index 5ea27ad9..ecaa42c0 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* 
d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4.cu index 428d6cac..9a6c225d 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu index 677c5124..9eab2b78 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4.cu index fc5ed0bc..67eed08e 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4.cu +++ 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu index 6d9e2533..125506b0 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename 
Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4.cu index acfdecd2..c81ee566 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename 
Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu index 47928b7a..bbc3e062 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam 
extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4.cu index a077119b..3b04adaa 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu index 7eb3b6ef..54730939 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu 
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4.cu index 67f88e60..a639f5bd 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename 
Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu index e58863fa..043533be 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + 
const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4.cu index 2afb8753..c1f70a43 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu index 246dce88..5ad99300 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4.cu index 2a5a1643..6968d4fc 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4.cu +++ 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu index f11c76bf..2bfdea16 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - 
const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4.cu index 8f591465..c458632a 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename 
Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4.cu index 60a43259..f1023cbd 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4.cu index b890da51..26af7b6c 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu index 3fd2cd72..baf6788e 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu +++ 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4.cu index 1e6323ed..8651eeaf 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename 
Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu index f13f09ae..ab66f277 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const 
typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4.cu index 4fab26dd..1adccb33 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu index 6a9a80b3..cfbc9dc7 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4.cu index f150bf32..86073427 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4.cu +++ 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu index 545fb259..624df6b1 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - 
const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4.cu index 3592fbcf..9ce8d067 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename 
Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu index fd25f520..dd78af30 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4.cu index b2ca7d5c..7f2d171b 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu index 9c7813d8..408169dd 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu +++ 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu index a8815802..c1f9f0cc 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - 
const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu index 3d2e4396..42cce868 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + 
const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu index 0ec64ab7..ce2faaf9 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu index c5f51e5d..a8b5ef0f 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu index d4e66f20..0ba74f4c 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu +++ 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu index 7af638ac..61e4ebae 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const 
typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu index 291f63ba..a01dc2b9 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const 
typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu index 4782581d..968403a5 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu index 504bcdff..65b8157f 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu index 427608a9..0742f4c7 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu +++ 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu index 4e10da32..9641af3e 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const 
typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu index 0761465a..ad8152a1 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* 
d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu index 38789e9f..e2612b5d 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop 
#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu index 1b34206f..1b168e3d 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu index 79a0ebbf..13cca83e 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu +++ 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu index e527fc27..d93d7191 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const 
typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu index 1fd9b456..031ad7e8 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, 
+ const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu index c9e707cc..082e7f9c 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu index 3ce84e92..c5c70528 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu index c4fe56c7..01bd0597 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu +++ 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu index 72ca8757..d1643f9a 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, 
- const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu index 7e44eb40..40231a90 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* 
d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu index 26ca4d5c..11cd5722 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff 
--git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu index a3cdef8b..50e30b60 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu index 3be02f58..8ae67566 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu +++ 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu index b27c4b29..73fed81f 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const 
typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu index 43d7beeb..cc8985f3 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename 
Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu index 893c0afa..882ea15c 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu index afc5ccc6..10464380 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu index b5bbead4..768771c2 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu +++ 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu index 19d24977..f09590a2 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename 
Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu index 3dffec88..57fbd2f3 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename 
Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu index 9d025a4e..afbdbbbb 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAdd>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32.cu index 167422bf..8c2e425a 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu index 14a01213..db1c950c 100644 --- 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32.cu index 5cfa3b57..99e63380 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + 
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu index 74c16e57..9b646f5a 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* 
d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32.cu index 0454cb47..f2777718 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename 
Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu index 638946d6..52bf3538 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename 
Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32.cu index a21d6ad3..6adce813 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu 
index 72ed7bed..94d14423 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu index 2e5ab843..9127e532 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using 
Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_32x16x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_32x16x64_2_nc32hw32.cu index be006ff8..1089e31f 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_32x16x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_32x16x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename 
Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32.cu index 2f7c8265..676de64e 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* 
d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu index 4b05fa6e..ac1ea8b0 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, 
+ typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32.cu index 0c54c5e9..be1cd355 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu index 
0662b46a..58b209be 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32.cu index b8421162..0dd9d49a 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = 
cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu index ae73171e..7a9c681b 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename 
Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32.cu index 03f894cd..264516f3 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* 
d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu index b5379a2f..5c7d5a81 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename 
Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32.cu index 5d6380f1..f43d76a6 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu 
b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu index 3ea95e5f..5981be60 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32.cu index bfbcfc00..051fe0aa 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32.cu +++ 
b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu index b688db3c..61c20863 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename 
Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu index 63f14470..94c98d8a 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - 
typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_32x16x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_32x16x64_2_nc32hw32.cu index 08f6849b..259fa2b4 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_32x16x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_32x16x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + 
typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32.cu index 6f835a47..fdb3e383 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu index 366677b4..f499f162 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32.cu index cc2b54cf..4b5ce90c 100644 --- 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu index 82de0bee..2ea46668 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = 
cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32.cu index 01f87212..57a858e8 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* 
d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu index 78df98c3..24be9280 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* 
d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32.cu index 400a2ffe..4861c0ba 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename 
Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu index f327f180..5f5efcc6 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32.cu 
index 825c44f3..a71bf73a 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu index 0c52462d..bc85e8e9 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = 
cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32.cu index f835d111..7f6d69a5 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* 
d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu index e2ee3f2b..bd99ad62 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* 
d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu index 83644d87..1cde8955 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename 
Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_32x16x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_32x16x64_2_nc32hw32.cu index 462f9a75..e9c07ff1 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_32x16x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_32x16x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32.cu index d778c9a0..e5792726 100644 --- 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu index 50a93f12..2f6b5d1a 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template 
void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32.cu index be55de21..f216358a 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename 
Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu index cbeb9445..8cd3ff76 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename 
Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32.cu index fb1d5c21..f1476e65 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC 
diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu index 381141f5..36c0437d 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32.cu index 3f425729..954f44e8 100644 --- 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu index e78bddfc..3bbe65d5 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void 
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32.cu index 59091861..e9ab71c6 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename 
Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu index 810e07f2..47ff8c28 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* 
d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32.cu index d18b4975..f6a8fea0 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff 
--git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu index 3faea271..be9d5f00 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu index 3d8cd19a..7f9b454f 100644 --- 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_32x16x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_32x16x64_2_nc32hw32.cu index 71c6b1d5..e49d4672 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_32x16x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_32x16x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void 
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32.cu index 72ba2cc6..ed2411d3 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename 
Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu index 9bc72ab2..36bd53e1 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* 
d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32.cu index d0d5c250..bf0f2d37 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu index 105e4477..a5463e79 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32.cu index 9dd98512..2f7395fe 100644 --- 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu index cdbbc226..3f3d43b8 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + 
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32.cu index 3abb44da..cfc62fa6 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename 
Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu index dbf50af0..3a6c6eb1 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename 
Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32.cu index 0f86dd24..1599f6d7 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma 
GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu index cd261416..6aefb106 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32.cu index 1d3ac372..54a259e7 100644 --- 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu index 5d7c53b4..7b798cd2 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + 
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu index fb7572d1..5119b75f 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* 
workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_32x16x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_32x16x64_2_nc32hw32.cu index 29198b66..f07ee7af 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_32x16x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_32x16x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* 
d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32.cu index 28add755..a86754dc 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif 
diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu index 76492138..07d3a81d 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32.cu index 720b1d78..b32291d8 100644 --- 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu index cb6f23ad..d5fd8891 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void 
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32.cu index 0b576a2d..a0d811bf 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter 
const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu index a49609d6..d0e151da 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename 
Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32.cu index 8eb24cd9..4f7a3e28 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu index ce73b2c3..b5dff9fb 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32.cu index 15407bb8..650b53b7 100644 --- 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu index 727839b6..a6c1709c 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void 
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32.cu index 3c0428f2..be070f8b 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter 
const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu index 14f1f1e9..0e0f5c4c 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename 
Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu index 141f4e6c..b3bba802 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_16x32x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_32x16x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_32x16x64_2_nc32hw32.cu index ae6ce3b8..fff4773f 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_32x16x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_32x16x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32.cu index dabb3d06..32455afb 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32.cu 
+++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu index 1d7ad4cc..169f7179 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename 
Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32.cu index 557459c5..50bb8921 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename 
Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu index be153e83..7c9302c9 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_nc4hw4.cu @@ -40,16 +40,19 @@ using Convolution = cutlass::arch::OpMultiplyAddSaturate>; + template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter 
const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); + + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl b/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl similarity index 100% rename from dnn/src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl rename to dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl b/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl similarity index 100% rename from dnn/src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl rename to dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x128_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x128_8x2_nn_align1.cu new file mode 100644 index 00000000..8835d03e --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x128_8x2_nn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_128x128_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_128x128_8x2_nn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + 
float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_128x128_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_128x128_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_128x128_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_128x128_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x128_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x128_8x2_nt_align1.cu new file mode 100644 index 00000000..d80e3da0 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x128_8x2_nt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_128x128_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_128x128_8x2_nt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, 
cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_128x128_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_128x128_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_128x128_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_128x128_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x128_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x128_8x2_tn_align1.cu new file mode 100644 index 00000000..90615451 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x128_8x2_tn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_128x128_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_128x128_8x2_tn_align1 = cutlass::gemm::device::Gemm< + 
float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_128x128_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_128x128_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_128x128_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_128x128_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x128_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x128_8x2_tt_align1.cu new file mode 100644 index 00000000..e425c9ee --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x128_8x2_tt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_128x128_8x2_tt_align1 + using 
Operation_cutlass_simt_sgemm_128x128_8x2_tt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_128x128_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_128x128_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_128x128_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_128x128_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_nn_align1.cu new file mode 100644 index 00000000..7ccda810 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_nn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator 
cutlass_simt_sgemm_128x32_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_128x32_8x2_nn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<128, 32, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_128x32_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_128x32_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_128x32_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_128x32_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_nt_align1.cu new file mode 100644 index 00000000..5bbe80ea --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_nt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include 
"src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_128x32_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_128x32_8x2_nt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<128, 32, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_128x32_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_128x32_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_128x32_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_128x32_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_tn_align1.cu new file mode 100644 index 00000000..0d90cfc0 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_tn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic 
ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_128x32_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_128x32_8x2_tn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<128, 32, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_128x32_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_128x32_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_128x32_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_128x32_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_tt_align1.cu new file mode 100644 index 00000000..7ac7f3ec --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_tt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored 
"-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_128x32_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_128x32_8x2_tt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<128, 32, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_128x32_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_128x32_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_128x32_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_128x32_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x64_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x64_8x2_nn_align1.cu new file mode 100644 index 00000000..a8dfa509 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x64_8x2_nn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" 
+#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_128x64_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_128x64_8x2_nn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<128, 64, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_128x64_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_128x64_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_128x64_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_128x64_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x64_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x64_8x2_nt_align1.cu new file mode 100644 index 00000000..f9c45771 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x64_8x2_nt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC 
diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_128x64_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_128x64_8x2_nt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<128, 64, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_128x64_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_128x64_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_128x64_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_128x64_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x64_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x64_8x2_tn_align1.cu new file mode 100644 index 00000000..2cfeb57c --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x64_8x2_tn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic 
ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_128x64_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_128x64_8x2_tn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<128, 64, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_128x64_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_128x64_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_128x64_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_128x64_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x64_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x64_8x2_tt_align1.cu new file mode 100644 index 00000000..09e1459b --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x64_8x2_tt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC 
diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_128x64_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_128x64_8x2_tt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<128, 64, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_128x64_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_128x64_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_128x64_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_128x64_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x128_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x128_8x2_nn_align1.cu new file mode 100644 index 00000000..3c3f2e16 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x128_8x2_nn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore 
warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_16x128_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_16x128_8x2_nn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<16, 128, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_16x128_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_16x128_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_16x128_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_16x128_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x128_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x128_8x2_nt_align1.cu new file mode 100644 index 00000000..960c221e --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x128_8x2_nt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && 
__CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_16x128_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_16x128_8x2_nt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<16, 128, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_16x128_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_16x128_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_16x128_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_16x128_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x128_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x128_8x2_tn_align1.cu new file mode 100644 index 00000000..b7a7c1ee --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x128_8x2_tn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 
9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_16x128_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_16x128_8x2_tn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<16, 128, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_16x128_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_16x128_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_16x128_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_16x128_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x128_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x128_8x2_tt_align1.cu new file mode 100644 index 00000000..17273792 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x128_8x2_tt_align1.cu @@ -0,0 
+1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_16x128_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_16x128_8x2_tt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<16, 128, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_16x128_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_16x128_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_16x128_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_16x128_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_nn_align1.cu new file mode 100644 index 00000000..0a0119f2 --- /dev/null +++ 
b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_nn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_16x32_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_16x32_8x2_nn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<16, 32, 8>, + cutlass::gemm::GemmShape<16, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_16x32_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_16x32_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_16x32_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_16x32_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_nt_align1.cu new file mode 100644 index 
00000000..37c8eaa6 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_nt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_16x32_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_16x32_8x2_nt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<16, 32, 8>, + cutlass::gemm::GemmShape<16, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_16x32_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_16x32_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_16x32_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_16x32_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_tn_align1.cu 
new file mode 100644 index 00000000..9bc2438d --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_tn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_16x32_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_16x32_8x2_tn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<16, 32, 8>, + cutlass::gemm::GemmShape<16, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_16x32_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_16x32_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_16x32_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_16x32_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_tt_align1.cu 
b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_tt_align1.cu new file mode 100644 index 00000000..fe2c5482 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_tt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_16x32_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_16x32_8x2_tt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<16, 32, 8>, + cutlass::gemm::GemmShape<16, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_16x32_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_16x32_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_16x32_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_16x32_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_nn_align1.cu new file mode 100644 index 00000000..7a3efade --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_nn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_16x64_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_16x64_8x2_nn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_16x64_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_16x64_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_16x64_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_16x64_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC 
diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_nt_align1.cu new file mode 100644 index 00000000..53e32ead --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_nt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_16x64_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_16x64_8x2_nt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_16x64_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_16x64_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_16x64_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_16x64_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int 
split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_tn_align1.cu new file mode 100644 index 00000000..cf9dd810 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_tn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_16x64_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_16x64_8x2_tn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_16x64_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_16x64_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_16x64_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_16x64_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + 
cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_tt_align1.cu new file mode 100644 index 00000000..53b44096 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_tt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_16x64_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_16x64_8x2_tt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_16x64_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_16x64_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_16x64_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename 
Operation_cutlass_simt_sgemm_16x64_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_nn_align1.cu new file mode 100644 index 00000000..d88caa76 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_nn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_256x32_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_256x32_8x2_nn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<256, 32, 8>, + cutlass::gemm::GemmShape<64, 16, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_256x32_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_256x32_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_256x32_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + 
cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_256x32_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_nt_align1.cu new file mode 100644 index 00000000..28652ab9 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_nt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_256x32_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_256x32_8x2_nt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<256, 32, 8>, + cutlass::gemm::GemmShape<64, 16, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_256x32_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_256x32_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_256x32_8x2_nt_align1::ElementC* 
d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_256x32_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_tn_align1.cu new file mode 100644 index 00000000..04738c31 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_tn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_256x32_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_256x32_8x2_tn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<256, 32, 8>, + cutlass::gemm::GemmShape<64, 16, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_256x32_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_256x32_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename 
Operation_cutlass_simt_sgemm_256x32_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_256x32_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_tt_align1.cu new file mode 100644 index 00000000..fe4c6356 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_tt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_256x32_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_256x32_8x2_tt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<256, 32, 8>, + cutlass::gemm::GemmShape<64, 16, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_256x32_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename 
Operation_cutlass_simt_sgemm_256x32_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_256x32_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_256x32_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_nn_align1.cu new file mode 100644 index 00000000..4f43a3cc --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_nn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_256x64_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_256x64_8x2_nn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<256, 64, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename 
Operation_cutlass_simt_sgemm_256x64_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_256x64_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_256x64_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_256x64_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_nt_align1.cu new file mode 100644 index 00000000..ba1354c9 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_nt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_256x64_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_256x64_8x2_nt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<256, 64, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_256x64_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_256x64_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_256x64_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_256x64_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_tn_align1.cu new file mode 100644 index 00000000..70fbb154 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_tn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_256x64_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_256x64_8x2_tn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<256, 64, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + 
cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_256x64_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_256x64_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_256x64_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_256x64_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_tt_align1.cu new file mode 100644 index 00000000..81481800 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_tt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_256x64_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_256x64_8x2_tt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<256, 64, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_256x64_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_256x64_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_256x64_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_256x64_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_nn_align1.cu new file mode 100644 index 00000000..a98ac691 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_nn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_32x128_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_32x128_8x2_nn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< 
+ float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_32x128_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_32x128_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_32x128_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_32x128_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_nt_align1.cu new file mode 100644 index 00000000..69004b21 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_nt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_32x128_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_32x128_8x2_nt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + 
cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_32x128_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_32x128_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_32x128_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_32x128_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_tn_align1.cu new file mode 100644 index 00000000..28926368 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_tn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_32x128_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_32x128_8x2_tn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + 
cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_32x128_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_32x128_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_32x128_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_32x128_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_tt_align1.cu new file mode 100644 index 00000000..f9757db1 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_tt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_32x128_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_32x128_8x2_tt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 128, 8>, + 
cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_32x128_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_32x128_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_32x128_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_32x128_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_nn_align1.cu new file mode 100644 index 00000000..56af657e --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_nn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_32x256_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_32x256_8x2_nn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + 
cutlass::gemm::GemmShape<32, 256, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_32x256_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_32x256_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_32x256_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_32x256_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_nt_align1.cu new file mode 100644 index 00000000..46110113 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_nt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_32x256_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_32x256_8x2_nt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + 
cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 256, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_32x256_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_32x256_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_32x256_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_32x256_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_tn_align1.cu new file mode 100644 index 00000000..2fde5da8 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_tn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_32x256_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_32x256_8x2_tn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, 
cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 256, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_32x256_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_32x256_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_32x256_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_32x256_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_tt_align1.cu new file mode 100644 index 00000000..095eeaaf --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_tt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_32x256_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_32x256_8x2_tt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, 
cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 256, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_32x256_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_32x256_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_32x256_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_32x256_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_nn_align1.cu new file mode 100644 index 00000000..a6cd1554 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_nn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_32x32_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_32x32_8x2_nn_align1 = cutlass::gemm::device::Gemm< + float, 
cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_32x32_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_32x32_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_32x32_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_32x32_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_nt_align1.cu new file mode 100644 index 00000000..334ee1d8 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_nt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_32x32_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_32x32_8x2_nt_align1 = 
cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_32x32_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_32x32_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_32x32_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_32x32_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_tn_align1.cu new file mode 100644 index 00000000..572103fd --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_tn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_32x32_8x2_tn_align1 + using 
Operation_cutlass_simt_sgemm_32x32_8x2_tn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_32x32_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_32x32_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_32x32_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_32x32_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_tt_align1.cu new file mode 100644 index 00000000..ca174a9a --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_tt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator 
cutlass_simt_sgemm_32x32_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_32x32_8x2_tt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_32x32_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_32x32_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_32x32_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_32x32_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_nn_align1.cu new file mode 100644 index 00000000..42d2171a --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_nn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" 
+ + + // Gemm operator cutlass_simt_sgemm_32x64_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_32x64_8x2_nn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_32x64_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_32x64_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_32x64_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_32x64_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_nt_align1.cu new file mode 100644 index 00000000..7efcd307 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_nt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include 
"src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_32x64_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_32x64_8x2_nt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_32x64_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_32x64_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_32x64_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_32x64_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_tn_align1.cu new file mode 100644 index 00000000..faa46fb3 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_tn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored 
"-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_32x64_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_32x64_8x2_tn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_32x64_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_32x64_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_32x64_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_32x64_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_tt_align1.cu new file mode 100644 index 00000000..b92ea109 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_tt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" 
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_32x64_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_32x64_8x2_tt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_32x64_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_32x64_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_32x64_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_32x64_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_nn_align1.cu new file mode 100644 index 00000000..c894ca7c --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_nn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic 
ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_64x128_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_64x128_8x2_nn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_64x128_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_64x128_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_64x128_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_64x128_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_nt_align1.cu new file mode 100644 index 00000000..88cfabd3 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_nt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored 
"-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_64x128_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_64x128_8x2_nt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_64x128_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_64x128_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_64x128_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_64x128_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_tn_align1.cu new file mode 100644 index 00000000..68b69eab --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_tn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored 
"-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_64x128_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_64x128_8x2_tn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_64x128_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_64x128_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_64x128_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_64x128_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_tt_align1.cu new file mode 100644 index 00000000..b1ff3810 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_tt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic 
push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_64x128_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_64x128_8x2_tt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_64x128_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_64x128_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_64x128_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_64x128_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_nn_align1.cu new file mode 100644 index 00000000..d967f2f5 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_nn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of 
cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_64x256_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_64x256_8x2_nn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 256, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_64x256_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_64x256_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_64x256_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_64x256_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_nt_align1.cu new file mode 100644 index 00000000..0da5a24b --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_nt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && 
__CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_64x256_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_64x256_8x2_nt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 256, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_64x256_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_64x256_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_64x256_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_64x256_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_tn_align1.cu new file mode 100644 index 00000000..5afa52ed --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_tn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 
9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_64x256_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_64x256_8x2_tn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 256, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_64x256_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_64x256_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_64x256_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_64x256_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_tt_align1.cu new file mode 100644 index 00000000..55537c28 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_tt_align1.cu @@ -0,0 
+1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_64x256_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_64x256_8x2_tt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 256, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_64x256_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_64x256_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_64x256_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_64x256_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_nn_align1.cu new file mode 100644 index 00000000..9967f9ac --- /dev/null +++ 
b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_nn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_64x32_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_64x32_8x2_nn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_64x32_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_64x32_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_64x32_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_64x32_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_nt_align1.cu new file mode 100644 index 
00000000..8622d27a --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_nt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_64x32_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_64x32_8x2_nt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_64x32_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_64x32_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_64x32_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_64x32_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_tn_align1.cu 
new file mode 100644 index 00000000..64fd370f --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_tn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_64x32_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_64x32_8x2_tn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_64x32_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_64x32_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_64x32_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_64x32_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_tt_align1.cu 
b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_tt_align1.cu new file mode 100644 index 00000000..91c8529c --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_tt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_64x32_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_64x32_8x2_tt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_64x32_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_64x32_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_64x32_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_64x32_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_nn_align1.cu new file mode 100644 index 00000000..552e3070 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_nn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_64x64_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_64x64_8x2_nn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_64x64_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_64x64_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_64x64_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_64x64_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC 
diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_nt_align1.cu new file mode 100644 index 00000000..af861cdd --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_nt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_64x64_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_64x64_8x2_nt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_64x64_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_64x64_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_64x64_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_64x64_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int 
split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_tn_align1.cu new file mode 100644 index 00000000..8a51ef22 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_tn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_64x64_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_64x64_8x2_tn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_64x64_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_64x64_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_64x64_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_64x64_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + 
cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_tt_align1.cu new file mode 100644 index 00000000..a17672ef --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_tt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_64x64_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_64x64_8x2_tt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_64x64_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_64x64_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_64x64_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename 
Operation_cutlass_simt_sgemm_64x64_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_nn_align1.cu new file mode 100644 index 00000000..c4b8ac1d --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_nn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_8x32_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_8x32_8x2_nn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<8, 32, 8>, + cutlass::gemm::GemmShape<8, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_8x32_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_8x32_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_8x32_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& 
problem_size, + typename Operation_cutlass_simt_sgemm_8x32_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_nt_align1.cu new file mode 100644 index 00000000..d2d55fdf --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_nt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_8x32_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_8x32_8x2_nt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<8, 32, 8>, + cutlass::gemm::GemmShape<8, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_8x32_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_8x32_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_8x32_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + 
cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_8x32_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_tn_align1.cu new file mode 100644 index 00000000..f311d75c --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_tn_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_8x32_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_8x32_8x2_tn_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<8, 32, 8>, + cutlass::gemm::GemmShape<8, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_8x32_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_8x32_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_8x32_8x2_tn_align1::ElementC* d_C, size_t ldc, + 
int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_8x32_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_tt_align1.cu new file mode 100644 index 00000000..00c1c9c9 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_tt_align1.cu @@ -0,0 +1,49 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_8x32_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_8x32_8x2_tt_align1 = cutlass::gemm::device::Gemm< + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<8, 32, 8>, + cutlass::gemm::GemmShape<8, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 1, + 1, + false, + cutlass::arch::OpMultiplyAdd + + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_8x32_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_8x32_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_8x32_8x2_tt_align1::ElementC* d_C, 
size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_8x32_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nn_align1.cu new file mode 100644 index 00000000..77eb01ca --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename 
Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nt_align1.cu new file mode 100644 index 00000000..f5fa4f0a --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename 
Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tn_align1.cu new file mode 100644 index 00000000..3ac6a2bd --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename 
Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tt_align1.cu new file mode 100644 index 00000000..abac3e7e --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + 
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nn_align1.cu new file mode 100644 index 00000000..672d2f43 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<128, 32, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + 
cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nt_align1.cu new file mode 100644 index 00000000..876263f4 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<128, 32, 
8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tn_align1.cu new file mode 100644 index 00000000..d338c160 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + 
cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<128, 32, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tt_align1.cu new file mode 100644 index 00000000..7f0ab852 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + 
float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<128, 32, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nn_align1.cu new file mode 100644 index 00000000..1d8d9ff1 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nn_align1 = 
cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<128, 64, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nt_align1.cu new file mode 100644 index 00000000..01a16c59 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator 
cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<128, 64, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tn_align1.cu new file mode 100644 index 00000000..8a53c72a --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" 
+#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<128, 64, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tt_align1.cu new file mode 100644 index 00000000..a175bd7d --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic 
ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<128, 64, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nn_align1.cu new file mode 100644 index 00000000..898008db --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" 
+#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<16, 128, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nt_align1.cu new file mode 100644 index 00000000..c4dec94b --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass 
+#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<16, 128, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tn_align1.cu new file mode 100644 index 00000000..ef795a29 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || 
(__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<16, 128, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tt_align1.cu new file mode 100644 index 00000000..f8826de6 --- /dev/null +++ 
b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<16, 128, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nn_align1.cu 
b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nn_align1.cu new file mode 100644 index 00000000..76338c38 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<16, 32, 8>, + cutlass::gemm::GemmShape<16, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nt_align1.cu new file mode 100644 index 00000000..291301bd --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<16, 32, 8>, + cutlass::gemm::GemmShape<16, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, 
int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tn_align1.cu new file mode 100644 index 00000000..3aec7e52 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<16, 32, 8>, + cutlass::gemm::GemmShape<16, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename 
Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tt_align1.cu new file mode 100644 index 00000000..722a3e56 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<16, 32, 8>, + cutlass::gemm::GemmShape<16, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* 
workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nn_align1.cu new file mode 100644 index 00000000..1113e8ae --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename 
Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nt_align1.cu new file mode 100644 index 00000000..d7217f76 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename 
Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tn_align1.cu new file mode 100644 index 00000000..427216e2 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename 
Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tt_align1.cu new file mode 100644 index 00000000..4dce8045 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nn_align1.cu new file mode 100644 index 00000000..ef2be0d9 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<256, 32, 8>, + cutlass::gemm::GemmShape<64, 16, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + 
cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nt_align1.cu new file mode 100644 index 00000000..fee41926 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<256, 32, 
8>, + cutlass::gemm::GemmShape<64, 16, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tn_align1.cu new file mode 100644 index 00000000..f0c20a9c --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + 
cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<256, 32, 8>, + cutlass::gemm::GemmShape<64, 16, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tt_align1.cu new file mode 100644 index 00000000..af24f798 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + 
float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<256, 32, 8>, + cutlass::gemm::GemmShape<64, 16, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nn_align1.cu new file mode 100644 index 00000000..abb555d9 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nn_align1 = 
cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<256, 64, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nt_align1.cu new file mode 100644 index 00000000..9b74fccd --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator 
cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<256, 64, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tn_align1.cu new file mode 100644 index 00000000..6ea860a7 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" 
+#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<256, 64, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tt_align1.cu new file mode 100644 index 00000000..d0c41a87 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic 
ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<256, 64, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nn_align1.cu new file mode 100644 index 00000000..02df0200 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" 
+#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nt_align1.cu new file mode 100644 index 00000000..f351dc22 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass 
+#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tn_align1.cu new file mode 100644 index 00000000..07e3e2f0 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || 
(__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tt_align1.cu new file mode 100644 index 00000000..e4f10562 --- /dev/null +++ 
b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nn_align1.cu 
b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nn_align1.cu new file mode 100644 index 00000000..d95f6269 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 256, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nt_align1.cu new file mode 100644 index 00000000..2b5c3f46 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 256, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + 
cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tn_align1.cu new file mode 100644 index 00000000..f9d2760e --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 256, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename 
Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tt_align1.cu new file mode 100644 index 00000000..e8acc8d1 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 256, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tt_align1::ElementC* d_C, size_t ldc, 
+ int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nn_align1.cu new file mode 100644 index 00000000..a87a1c72 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename 
Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nt_align1.cu new file mode 100644 index 00000000..a088516a --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename 
Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tn_align1.cu new file mode 100644 index 00000000..ca317b8b --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename 
Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tt_align1.cu new file mode 100644 index 00000000..2b90bf0e --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nn_align1.cu new file mode 100644 index 00000000..dcff6500 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + 
cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nt_align1.cu new file mode 100644 index 00000000..8a6a092d --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 64, 8>, + 
cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tn_align1.cu new file mode 100644 index 00000000..617de1e6 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + 
cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tt_align1.cu new file mode 100644 index 00000000..dd2d4ba0 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, 
cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nn_align1.cu new file mode 100644 index 00000000..7eb8fe18 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nn_align1 = 
cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nt_align1.cu new file mode 100644 index 00000000..fd4f4415 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator 
cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tn_align1.cu new file mode 100644 index 00000000..1b5412a8 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" 
+#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tt_align1.cu new file mode 100644 index 00000000..d2f6f7a8 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic 
ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nn_align1.cu new file mode 100644 index 00000000..7b169008 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" 
+#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 256, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nt_align1.cu new file mode 100644 index 00000000..653bcd12 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass 
+#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 256, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tn_align1.cu new file mode 100644 index 00000000..3e62fcdf --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || 
(__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 256, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tt_align1.cu new file mode 100644 index 00000000..ef6b2922 --- /dev/null +++ 
b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 256, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nn_align1.cu 
b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nn_align1.cu new file mode 100644 index 00000000..1a112197 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nt_align1.cu new file mode 100644 index 00000000..c034e358 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, 
int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tn_align1.cu new file mode 100644 index 00000000..7138361d --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename 
Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tt_align1.cu new file mode 100644 index 00000000..b783dc8f --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* 
workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nn_align1.cu new file mode 100644 index 00000000..2dc0c571 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename 
Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nt_align1.cu new file mode 100644 index 00000000..984256e6 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename 
Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tn_align1.cu new file mode 100644 index 00000000..f5b8e9b1 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename 
Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tt_align1.cu new file mode 100644 index 00000000..ed003053 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nn_align1.cu new file mode 100644 index 00000000..51d3e881 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<8, 32, 8>, + cutlass::gemm::GemmShape<8, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + 
cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nt_align1.cu new file mode 100644 index 00000000..b0c00d7c --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<8, 32, 8>, + 
cutlass::gemm::GemmShape<8, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tn_align1.cu new file mode 100644 index 00000000..7ef5c6f2 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tn_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tn_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + 
cutlass::arch::Sm50, + cutlass::gemm::GemmShape<8, 32, 8>, + cutlass::gemm::GemmShape<8, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tn_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tn_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tn_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tt_align1.cu new file mode 100644 index 00000000..494c8af1 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tt_align1.cu @@ -0,0 +1,42 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" + + + // Gemm operator cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tt_align1 + using Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, 
cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<8, 32, 8>, + cutlass::gemm::GemmShape<8, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + float, + 1, + float, + float + > + >; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tt_align1::ElementA* d_A, size_t lda, + const typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tt_align1::ElementB* d_B, size_t ldb, + typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tt_align1::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn.cu deleted file mode 100644 index 38284233..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>; -using WarpShape = 
cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn_splitk_parallel.cu deleted file mode 100644 index a4b4e0c4..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>; -using WarpShape = 
cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt.cu deleted file mode 100644 index d69e3359..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = 
cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt_splitk_parallel.cu deleted file mode 100644 index e78d64a9..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = 
cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn.cu deleted file mode 100644 index 0fe5a161..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - 
float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn_splitk_parallel.cu deleted file mode 100644 index 964ef525..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, 
cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt.cu deleted file mode 100644 index 374a8d73..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, 
EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt_splitk_parallel.cu deleted file mode 100644 index d5795b04..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, 
EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn.cu deleted file mode 100644 index 87d80fd4..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn_splitk_parallel.cu deleted file mode 100644 index 74b75ebd..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename 
Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt.cu deleted file mode 100644 index 230297e8..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, 
size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt_splitk_parallel.cu deleted file mode 100644 index 30dd6ad0..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - 
cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn.cu deleted file mode 100644 index 04f80d33..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename 
Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn_splitk_parallel.cu deleted file mode 100644 index d702c06b..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int 
split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt.cu deleted file mode 100644 index 70fd338f..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt_splitk_parallel.cu deleted file mode 100644 index a54b66d8..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn.cu deleted file mode 100644 index bed908a1..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn_splitk_parallel.cu deleted file mode 100644 index 91c76b87..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt.cu 
b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt.cu deleted file mode 100644 index a063706f..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt_splitk_parallel.cu 
b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt_splitk_parallel.cu deleted file mode 100644 index 161b9e55..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn.cu deleted file 
mode 100644 index 6eae3c18..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn_splitk_parallel.cu deleted file mode 100644 index c8968eb3..00000000 --- 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt.cu deleted file mode 100644 index 3107bc36..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt.cu +++ /dev/null @@ -1,35 +0,0 
@@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt_splitk_parallel.cu deleted file mode 100644 index 22f13797..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if 
__CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn.cu deleted file mode 100644 index ce92e149..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by 
gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn_splitk_parallel.cu deleted file mode 100644 index 5c51f781..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by 
gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt.cu deleted file mode 100644 index 50f5e49f..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored 
"-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt_splitk_parallel.cu deleted file mode 100644 index 9ccf1190..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" 
-#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn.cu deleted file mode 100644 index 28b32c91..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic 
ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn_splitk_parallel.cu deleted file mode 100644 index e25e44e3..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored 
"-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt.cu deleted file mode 100644 index 4e1a9f6c..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = 
cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt_splitk_parallel.cu deleted file mode 100644 index f7f1fb69..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = 
cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn.cu deleted file mode 100644 index 225cdf3b..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 
8>; -using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn_splitk_parallel.cu deleted file mode 100644 index 0050f669..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>; -using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt.cu deleted file mode 100644 index 91c830c6..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = 
cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt_splitk_parallel.cu deleted file mode 100644 index c2fe7bcb..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = 
cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn.cu deleted file mode 100644 index e3ba197f..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, 
cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn_splitk_parallel.cu deleted file mode 100644 index fff368ea..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, 
cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt.cu deleted file mode 100644 index 9e41f582..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
- cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt_splitk_parallel.cu deleted file mode 100644 index 511cd557..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn.cu deleted file mode 100644 index 49de5607..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename 
Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn_splitk_parallel.cu deleted file mode 100644 index 07296250..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - 
typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt.cu deleted file mode 100644 index 872b8ded..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - 
cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt_splitk_parallel.cu deleted file mode 100644 index c7774d64..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params 
const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn.cu deleted file mode 100644 index a6178562..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC 
diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn_splitk_parallel.cu deleted file mode 100644 index fccd72b0..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt.cu deleted file mode 100644 index e5c3e2d2..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt_splitk_parallel.cu deleted file mode 100644 index b2fa6309..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn.cu deleted file mode 100644 index a85bae3d..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn_splitk_parallel.cu deleted file mode 100644 index cb855f55..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt.cu 
b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt.cu deleted file mode 100644 index 4d8cddb5..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt_splitk_parallel.cu 
b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt_splitk_parallel.cu deleted file mode 100644 index ca5408f4..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn.cu deleted file 
mode 100644 index 7880c3cb..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn_splitk_parallel.cu deleted file mode 100644 index 6a77f8c7..00000000 --- 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt.cu deleted file mode 100644 index 6e396c45..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt.cu +++ /dev/null @@ -1,35 +0,0 
@@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt_splitk_parallel.cu deleted file mode 100644 index a3a9ba6c..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if 
__CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn.cu deleted file mode 100644 index 7e4b278b..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by 
gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn_splitk_parallel.cu deleted file mode 100644 index 05437d7d..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by 
gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt.cu deleted file mode 100644 index 61f578ad..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored 
"-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt_splitk_parallel.cu deleted file mode 100644 index 55eac3eb..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" 
-#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn.cu deleted file mode 100644 index 0227b521..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic 
ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn_splitk_parallel.cu deleted file mode 100644 index 5a777e87..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored 
"-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt.cu deleted file mode 100644 index 90a24e98..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = 
cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt_splitk_parallel.cu deleted file mode 100644 index 3b268760..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = 
cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn.cu deleted file mode 100644 index ccb3a6fb..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 
128, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn_splitk_parallel.cu deleted file mode 100644 index 57f71457..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>; -using WarpShape = 
cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt.cu deleted file mode 100644 index 296e163d..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = 
cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt_splitk_parallel.cu deleted file mode 100644 index 7d9dae19..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = 
cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn.cu deleted file mode 100644 index c964aaf8..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - 
float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn_splitk_parallel.cu deleted file mode 100644 index c6be5d7b..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, 
cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt.cu deleted file mode 100644 index 9bb8ea8a..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, 
EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt_splitk_parallel.cu deleted file mode 100644 index d5f9afb3..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, 
EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn.cu deleted file mode 100644 index 18047dc2..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn_splitk_parallel.cu deleted file mode 100644 index 7a66c163..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename 
Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt.cu deleted file mode 100644 index 86899145..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, 
size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt_splitk_parallel.cu deleted file mode 100644 index 78c0283e..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - 
cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn.cu deleted file mode 100644 index c65df06e..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename 
Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn_splitk_parallel.cu deleted file mode 100644 index fcb716b7..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int 
split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt.cu deleted file mode 100644 index ecd87aa3..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt_splitk_parallel.cu deleted file mode 100644 index 0afda5f1..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>; -using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn.cu deleted file mode 100644 index ebb5a2f3..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn_splitk_parallel.cu deleted file mode 100644 index a678b28c..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt.cu 
b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt.cu deleted file mode 100644 index f330b6d7..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt_splitk_parallel.cu 
b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt_splitk_parallel.cu deleted file mode 100644 index 44e8a1b1..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn.cu deleted file mode 
100644 index db6e22b9..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn_splitk_parallel.cu deleted file mode 100644 index 55cee82c..00000000 --- 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt.cu deleted file mode 100644 index 161e1337..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt.cu +++ /dev/null @@ -1,35 +0,0 @@ 
-#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt_splitk_parallel.cu deleted file mode 100644 index 9269ac05..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 
|| (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn.cu deleted file mode 100644 index 929bcdc6..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning 
of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn_splitk_parallel.cu deleted file mode 100644 index 71aa87a1..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic 
push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt.cu deleted file mode 100644 index cf467004..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic 
ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt_splitk_parallel.cu deleted file mode 100644 index 88f5c826..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" 
-#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn.cu deleted file mode 100644 index 9fea5074..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include 
"src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn_splitk_parallel.cu deleted file mode 100644 index 86c8a6e0..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include 
"src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt.cu deleted file mode 100644 index 9976be6b..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using 
LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt_splitk_parallel.cu deleted file mode 100644 index b452d8ed..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = 
cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn.cu deleted file mode 100644 index 32175db5..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 8>; -using WarpShape = 
cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn_splitk_parallel.cu deleted file mode 100644 index a19eb570..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 
64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt.cu deleted file mode 100644 index 1ead9917..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = 
cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt_splitk_parallel.cu deleted file mode 100644 index 71c9cab8..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = 
cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn.cu deleted file mode 100644 index 919aad63..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - 
float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn_splitk_parallel.cu deleted file mode 100644 index 541af533..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, 
cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt.cu deleted file mode 100644 index f4928b1e..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, 
EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt_splitk_parallel.cu deleted file mode 100644 index 593a73b6..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, 
EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn.cu deleted file mode 100644 index 2ff883c0..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 256, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn_splitk_parallel.cu deleted file mode 100644 index c081e366..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 256, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename 
Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt.cu deleted file mode 100644 index f6f214de..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 256, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, 
size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt_splitk_parallel.cu deleted file mode 100644 index 997a5b23..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 256, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - 
cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn.cu deleted file mode 100644 index 67a7c764..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 256, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename 
Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn_splitk_parallel.cu deleted file mode 100644 index f918819a..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 256, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int 
split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt.cu deleted file mode 100644 index a27a779c..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 256, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt_splitk_parallel.cu deleted file mode 100644 index 5d8c09c7..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 256, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn.cu deleted file mode 100644 index 48bd2b06..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn_splitk_parallel.cu deleted file mode 100644 index c81cadd1..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt.cu 
b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt.cu deleted file mode 100644 index 465b736e..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt_splitk_parallel.cu 
b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt_splitk_parallel.cu deleted file mode 100644 index e31365d5..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn.cu deleted file mode 
100644 index 023bd242..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn_splitk_parallel.cu deleted file mode 100644 index 5b89b5f6..00000000 --- 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt.cu deleted file mode 100644 index 927984eb..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt.cu +++ /dev/null @@ -1,35 +0,0 @@ 
-#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt_splitk_parallel.cu deleted file mode 100644 index f4638ce1..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 
|| (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn.cu deleted file mode 100644 index 2bef3b7d..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning 
of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn_splitk_parallel.cu deleted file mode 100644 index 1ed27985..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic 
push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt.cu deleted file mode 100644 index 576d6663..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic 
ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt_splitk_parallel.cu deleted file mode 100644 index a600832e..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" 
-#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn.cu deleted file mode 100644 index c4414a1c..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include 
"src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn_splitk_parallel.cu deleted file mode 100644 index 7f48f96f..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include 
"src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt.cu deleted file mode 100644 index a4831f90..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using 
LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt_splitk_parallel.cu deleted file mode 100644 index ac2dea60..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = 
cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 8>; -using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn.cu deleted file mode 100644 index ce7de93b..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<8, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<8, 32, 
8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn_splitk_parallel.cu deleted file mode 100644 index b8319cd7..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<8, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; -using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt.cu deleted file mode 100644 index 254272c0..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<8, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< 
- float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt_splitk_parallel.cu deleted file mode 100644 index 0b3cdc14..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::RowMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<8, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, 
LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn.cu deleted file mode 100644 index ea150057..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<8, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, 
EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn_splitk_parallel.cu deleted file mode 100644 index bd15a9c3..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::RowMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<8, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template 
void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt.cu deleted file mode 100644 index 58b093d4..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt.cu +++ /dev/null @@ -1,35 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<8, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - 2>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename 
Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt_splitk_parallel.cu deleted file mode 100644 index e4a2e18e..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt_splitk_parallel.cu +++ /dev/null @@ -1,33 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_matrix_mul_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" - -using LayoutA = cutlass::layout::ColumnMajor; -using LayoutB = cutlass::layout::ColumnMajor; -using ThreadBlockShape = cutlass::gemm::GemmShape<8, 32, 8>; -using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; -using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; -using EpilogueOp = cutlass::epilogue::thread::LinearCombination; -using Gemm = cutlass::gemm::device::GemmSplitKParallel< - float, LayoutA, - float, LayoutB, - float, cutlass::layout::RowMajor, float, - cutlass::arch::OpClassSimt, cutlass::arch::Sm50, - ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Gemm::ElementA* d_A, size_t lda, - const typename Gemm::ElementB* d_B, size_t ldb, - 
typename Gemm::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_16_tt_align2x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_16_tt_align2x4.cu new file mode 100644 index 00000000..796a1849 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_16_tt_align2x4.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x128_16_tt_align2x4 + using Operation_cutlass_simt_sgemv_batched_strided_1x128_16_tt_align2x4 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 128, 16>, + cutlass::gemm::GemmShape<1, 4, 2>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_16_tt_align2x4::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_16_tt_align2x4::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x128_16_tt_align2x4::ElementCD* d_C, size_t 
ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2.cu new file mode 100644 index 00000000..ed2bfa09 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2 + using Operation_cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 128, 16>, + cutlass::gemm::GemmShape<1, 2, 4>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git 
a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1.cu new file mode 100644 index 00000000..09c0c11f --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1 + using Operation_cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 128, 2>, + cutlass::gemm::GemmShape<1, 1, 1>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4.cu 
b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4.cu new file mode 100644 index 00000000..09c84e49 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4 + using Operation_cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 128, 32>, + cutlass::gemm::GemmShape<1, 4, 4>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2.cu new file mode 100644 index 00000000..2be9035e --- 
/dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2 + using Operation_cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 128, 4>, + cutlass::gemm::GemmShape<1, 2, 1>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1.cu new file mode 100644 index 00000000..1c751e12 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 
|| (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1 + using Operation_cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 128, 4>, + cutlass::gemm::GemmShape<1, 1, 2>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4.cu new file mode 100644 index 00000000..ece1c4d7 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored 
"-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4 + using Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 128, 8>, + cutlass::gemm::GemmShape<1, 4, 1>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2.cu new file mode 100644 index 00000000..13031cfb --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored 
"-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2 + using Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 128, 8>, + cutlass::gemm::GemmShape<1, 2, 2>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1.cu new file mode 100644 index 00000000..4dda125f --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator 
cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1 + using Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 128, 8>, + cutlass::gemm::GemmShape<1, 1, 4>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4.cu new file mode 100644 index 00000000..d0653427 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4 + using Operation_cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4 = 
cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 32, 128>, + cutlass::gemm::GemmShape<1, 4, 4>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2.cu new file mode 100644 index 00000000..4cfc6d63 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2 + using Operation_cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 32, 16>, + cutlass::gemm::GemmShape<1, 2, 1>, + float, cutlass::layout::RowMajor, + float, 
cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1.cu new file mode 100644 index 00000000..41b3a65d --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1 + using Operation_cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 32, 16>, + cutlass::gemm::GemmShape<1, 1, 2>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + 
cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4.cu new file mode 100644 index 00000000..afe747f9 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4 + using Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 32, 32>, + cutlass::gemm::GemmShape<1, 4, 1>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename 
Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2.cu new file mode 100644 index 00000000..f9b65472 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2 + using Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 32, 32>, + cutlass::gemm::GemmShape<1, 2, 2>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename 
Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1.cu new file mode 100644 index 00000000..7b0f8c1f --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1 + using Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename 
Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4.cu new file mode 100644 index 00000000..7fd28ef7 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4 + using Operation_cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 32, 64>, + cutlass::gemm::GemmShape<1, 4, 2>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic 
pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2.cu new file mode 100644 index 00000000..b61d3b55 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2 + using Operation_cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 32, 64>, + cutlass::gemm::GemmShape<1, 2, 4>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1.cu 
b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1.cu new file mode 100644 index 00000000..b64ee3be --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1 + using Operation_cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4.cu new file mode 100644 index 00000000..efc6f3dd --- /dev/null +++ 
b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4 + using Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 64, 16>, + cutlass::gemm::GemmShape<1, 4, 1>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2.cu new file mode 100644 index 00000000..38eef285 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || 
(__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2 + using Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 64, 16>, + cutlass::gemm::GemmShape<1, 2, 2>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1.cu new file mode 100644 index 00000000..9db9b56a --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored 
"-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1 + using Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 64, 16>, + cutlass::gemm::GemmShape<1, 1, 4>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4.cu new file mode 100644 index 00000000..b3979fad --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored 
"-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4 + using Operation_cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 64, 32>, + cutlass::gemm::GemmShape<1, 4, 2>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2.cu new file mode 100644 index 00000000..29f0f783 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator 
cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2 + using Operation_cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 64, 32>, + cutlass::gemm::GemmShape<1, 2, 4>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1.cu new file mode 100644 index 00000000..0814d571 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1 + using Operation_cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1 = cutlass::gemm::kernel::DefaultGemv< + 
cutlass::gemm::GemmShape<1, 64, 4>, + cutlass::gemm::GemmShape<1, 1, 1>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4.cu new file mode 100644 index 00000000..53567cdf --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4 + using Operation_cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 64, 64>, + cutlass::gemm::GemmShape<1, 4, 4>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, 
cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2.cu new file mode 100644 index 00000000..ab26e0fd --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2 + using Operation_cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 64, 8>, + cutlass::gemm::GemmShape<1, 2, 1>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& 
problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1.cu new file mode 100644 index 00000000..9321d8fc --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1.cu @@ -0,0 +1,31 @@ + +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" + + + // Gemm operator cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1 + using Operation_cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1 = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<1, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 2>, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor, + float, cutlass::layout::RowMajor + >; + + +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename 
Operation_cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x16_1x2x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x16_1x2x4.cu deleted file mode 100644 index 95a8741c..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x16_1x2x4.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 128, 16>; -using ThreadShape = cutlass::gemm::GemmShape<1, 2, 4>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x16_1x4x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x16_1x4x2.cu deleted file mode 100644 index a620831f..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x16_1x4x2.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 128, 16>; -using ThreadShape = cutlass::gemm::GemmShape<1, 4, 2>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x2_1x1x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x2_1x1x1.cu deleted file mode 100644 index b3c0e76d..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x2_1x1x1.cu +++ /dev/null @@ -1,26 
+0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 128, 2>; -using ThreadShape = cutlass::gemm::GemmShape<1, 1, 1>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x32_1x4x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x32_1x4x4.cu deleted file mode 100644 index 0870613f..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x32_1x4x4.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include 
"src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 128, 32>; -using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x4_1x1x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x4_1x1x2.cu deleted file mode 100644 index bcdfb0eb..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x4_1x1x2.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 128, 4>; -using ThreadShape = cutlass::gemm::GemmShape<1, 1, 2>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, 
cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x4_1x2x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x4_1x2x1.cu deleted file mode 100644 index af6d0e49..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x4_1x2x1.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 128, 4>; -using ThreadShape = cutlass::gemm::GemmShape<1, 2, 1>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, 
- cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x8_1x1x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x8_1x1x4.cu deleted file mode 100644 index 37bc33c8..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x8_1x1x4.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 128, 8>; -using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x8_1x2x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x8_1x2x2.cu deleted file mode 100644 index 4ea842e8..00000000 --- 
a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x8_1x2x2.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 128, 8>; -using ThreadShape = cutlass::gemm::GemmShape<1, 2, 2>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x8_1x4x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x8_1x4x1.cu deleted file mode 100644 index bc916cfc..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x128x8_1x4x1.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored 
"-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 128, 8>; -using ThreadShape = cutlass::gemm::GemmShape<1, 4, 1>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x128_1x4x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x128_1x4x4.cu deleted file mode 100644 index 5ed9df14..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x128_1x4x4.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 32, 128>; -using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, 
- float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x16_1x1x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x16_1x1x2.cu deleted file mode 100644 index d38317f2..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x16_1x1x2.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 32, 16>; -using ThreadShape = cutlass::gemm::GemmShape<1, 1, 2>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t 
batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x16_1x2x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x16_1x2x1.cu deleted file mode 100644 index 7ebe415c..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x16_1x2x1.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 32, 16>; -using ThreadShape = cutlass::gemm::GemmShape<1, 2, 1>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x32_1x1x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x32_1x1x4.cu deleted file mode 100644 
index e7647be1..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x32_1x1x4.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 32, 32>; -using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x32_1x2x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x32_1x2x2.cu deleted file mode 100644 index 2e0f0575..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x32_1x2x2.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC 
diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 32, 32>; -using ThreadShape = cutlass::gemm::GemmShape<1, 2, 2>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x32_1x4x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x32_1x4x1.cu deleted file mode 100644 index c8252f5f..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x32_1x4x1.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 32, 32>; -using ThreadShape = cutlass::gemm::GemmShape<1, 4, 1>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, 
- ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x64_1x2x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x64_1x2x4.cu deleted file mode 100644 index bc53eefb..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x64_1x2x4.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 32, 64>; -using ThreadShape = cutlass::gemm::GemmShape<1, 2, 4>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, 
size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x64_1x4x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x64_1x4x2.cu deleted file mode 100644 index 2c818beb..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x64_1x4x2.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 32, 64>; -using ThreadShape = cutlass::gemm::GemmShape<1, 4, 2>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x8_1x1x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x8_1x1x1.cu deleted file mode 
100644 index 4efb152b..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x32x8_1x1x1.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 32, 8>; -using ThreadShape = cutlass::gemm::GemmShape<1, 1, 1>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x16_1x1x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x16_1x1x4.cu deleted file mode 100644 index 1ed408b3..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x16_1x1x4.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC 
diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 16>; -using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x16_1x2x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x16_1x2x2.cu deleted file mode 100644 index ddf70bb8..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x16_1x2x2.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 16>; -using ThreadShape = cutlass::gemm::GemmShape<1, 2, 2>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, 
- ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x16_1x4x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x16_1x4x1.cu deleted file mode 100644 index f35e9ed3..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x16_1x4x1.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 16>; -using ThreadShape = cutlass::gemm::GemmShape<1, 4, 1>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, 
size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x32_1x2x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x32_1x2x4.cu deleted file mode 100644 index 4b2e2fdf..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x32_1x2x4.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 32>; -using ThreadShape = cutlass::gemm::GemmShape<1, 2, 4>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x32_1x4x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x32_1x4x2.cu deleted file mode 
100644 index c7771133..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x32_1x4x2.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 32>; -using ThreadShape = cutlass::gemm::GemmShape<1, 4, 2>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x4_1x1x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x4_1x1x1.cu deleted file mode 100644 index 8ab75b3e..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x4_1x1x1.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC 
diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; -using ThreadShape = cutlass::gemm::GemmShape<1, 1, 1>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x64_1x4x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x64_1x4x4.cu deleted file mode 100644 index 2d281ce7..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x64_1x4x4.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 64>; -using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, 
- ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x8_1x1x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x8_1x1x2.cu deleted file mode 100644 index eeab0c50..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x8_1x1x2.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 8>; -using ThreadShape = cutlass::gemm::GemmShape<1, 1, 2>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, 
size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x8_1x2x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x8_1x2x1.cu deleted file mode 100644 index aef942ea..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/matrix_mul_fp32_simt_gemv_batched_strided_1x64x8_1x2x1.cu +++ /dev/null @@ -1,26 +0,0 @@ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// generated by gen_cutlass_gemv_batched_strided_kern_impls.py -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/matrix_mul/fp32_simt_gemv/matrix_mul_float_simt_gemv_batched_strided_cutlass_wrapper.cuinl" - -using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 8>; -using ThreadShape = cutlass::gemm::GemmShape<1, 2, 1>; -using GemvKernel = cutlass::gemm::kernel::DefaultGemv< - ThreadBlockShape, - ThreadShape, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor>; -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename GemvKernel::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename GemvKernel::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename GemvKernel::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif