GitOrigin-RevId: 7882f9c68c
release-1.5
@@ -37,14 +37,13 @@ all: ${PARAM_DEFS} ${ELEMWISE_IMPL} ${CUDA_CONV_IMPL} $(CUDA_MATMUL_IMPL) | |||||
../src/cuda/elemwise_multi_type/kimpl: gen_elemwise_multi_type_kern_impls.py | ../src/cuda/elemwise_multi_type/kimpl: gen_elemwise_multi_type_kern_impls.py | ||||
./$^ --type cuda $@ | ./$^ --type cuda $@ | ||||
../src/cuda/conv_bias/int8/kimpl: gen_cuda_conv_bias_kern_impls.py gen_cutlass_conv_bias_kern_impls.py cutlass_generator/generator.py | |||||
../src/cuda/conv_bias/int8/kimpl: gen_cuda_conv_bias_kern_impls.py cutlass_generator | |||||
./gen_cuda_conv_bias_kern_impls.py --type dp4a $@ | ./gen_cuda_conv_bias_kern_impls.py --type dp4a $@ | ||||
./gen_cutlass_conv_bias_kern_impls.py --type dp4a $@ | |||||
python3 ./cutlass_generator/generator.py --operations all --type simt $@ | python3 ./cutlass_generator/generator.py --operations all --type simt $@ | ||||
../src/cuda/conv_bias/int8_imma/kimpl: gen_cuda_conv_bias_kern_impls.py gen_cutlass_conv_bias_kern_impls.py | |||||
../src/cuda/conv_bias/int8_imma/kimpl: gen_cuda_conv_bias_kern_impls.py cutlass_generator | |||||
./gen_cuda_conv_bias_kern_impls.py --type imma $@ | ./gen_cuda_conv_bias_kern_impls.py --type imma $@ | ||||
./gen_cutlass_conv_bias_kern_impls.py --type imma $@ | |||||
python3 ./cutlass_generator/generator.py --operations conv2d --type tensorop8816 $@ | |||||
../src/cuda/batch_conv_bias/int8/kimpl: gen_cuda_batch_conv_bias_kern_impls.py | ../src/cuda/batch_conv_bias/int8/kimpl: gen_cuda_batch_conv_bias_kern_impls.py | ||||
./$^ --type dp4a $@ | ./$^ --type dp4a $@ | ||||
@@ -807,9 +807,9 @@ void megdnn::cuda::cutlass_wrapper:: | |||||
const int32_t* d_bias, const uint8_t* d_z, uint8_t* d_dst, | const int32_t* d_bias, const uint8_t* d_z, uint8_t* d_dst, | ||||
int* workspace, const convolution::ConvParam& param, | int* workspace, const convolution::ConvParam& param, | ||||
uint32_t nonlinear_mode, float alpha, float beta, float gamma, | uint32_t nonlinear_mode, float alpha, float beta, float gamma, | ||||
float delta, float theta, float scale, uint8_t src_zero_point, | |||||
const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, | |||||
cudaStream_t stream) { | |||||
float delta, float theta, float /* scale */, | |||||
uint8_t src_zero_point, const GemmCoord& threadblock_shape, | |||||
const GemmCoord& warp_shape, cudaStream_t stream) { | |||||
#define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_, \ | #define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_, \ | ||||
threadblock_k_, warp_m_, warp_n_, \ | threadblock_k_, warp_m_, warp_n_, \ | ||||
warp_k_) \ | warp_k_) \ | ||||
@@ -878,15 +878,6 @@ void megdnn::cuda::cutlass_wrapper:: | |||||
0, delta, theta}; | 0, delta, theta}; | ||||
DISPATCH_KERNEL; | DISPATCH_KERNEL; | ||||
} | } | ||||
case NonlineMode::H_SWISH: { | |||||
using EpilogueOp = cutlass::epilogue::thread:: | |||||
BiasAddLinearCombinationHSwishClamp< | |||||
ElementOutput, 16, ElementAccumulator, ElementBias, | |||||
ElementCompute>; | |||||
typename EpilogueOp::Params epilogue{alpha, beta, gamma, | |||||
scale, delta, theta}; | |||||
DISPATCH_KERNEL; | |||||
} | |||||
default: | default: | ||||
megdnn_assert(false, | megdnn_assert(false, | ||||
"unsupported nonlinear mode for conv bias operator"); | "unsupported nonlinear mode for conv bias operator"); | ||||
@@ -960,8 +951,7 @@ void megdnn::cuda::cutlass_wrapper:: | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | ||||
cutlass::conv::threadblock:: \ | cutlass::conv::threadblock:: \ | ||||
ConvolutionFpropNCxHWxThreadblockSwizzle, \ | ConvolutionFpropNCxHWxThreadblockSwizzle, \ | ||||
stages_, 4, aligned_, true, \ | |||||
cutlass::arch::OpMultiplyAddSaturate>; \ | |||||
stages_, 4, aligned_, true, cutlass::arch::OpMultiplyAdd>; \ | |||||
typename Convolution::ConvolutionParameter conv_param( \ | typename Convolution::ConvolutionParameter conv_param( \ | ||||
param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ | param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ | ||||
param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ | param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ | ||||
@@ -1,65 +0,0 @@ | |||||
/** | |||||
* \file | |||||
* dnn/src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl | |||||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
* | |||||
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
* | |||||
* Unless required by applicable law or agreed to in writing, | |||||
* software distributed under the License is distributed on an | |||||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
* implied. | |||||
*/ | |||||
#include "cutlass/convolution/device/convolution.h" | |||||
#include "src/cuda/conv_bias/cutlass_convolution_wrapper.cuh" | |||||
using namespace megdnn; | |||||
using namespace cuda; | |||||
using namespace cutlass_wrapper; | |||||
template <typename Convolution> | |||||
void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param) { | |||||
typename Convolution::TensorRefSrc tensor_src{ | |||||
const_cast<typename Convolution::ElementSrc*>(d_src), | |||||
Convolution::LayoutSrc::packed( | |||||
{conv_param.N, conv_param.H, conv_param.W, conv_param.C})}; | |||||
typename Convolution::TensorRefFilter tensor_filter{ | |||||
const_cast<typename Convolution::ElementFilter*>(d_filter), | |||||
Convolution::LayoutFilter::packed( | |||||
{conv_param.K, conv_param.R, conv_param.S, conv_param.C})}; | |||||
typename Convolution::TensorRefBias tensor_bias{ | |||||
const_cast<typename Convolution::ElementBias*>(d_bias), | |||||
Convolution::LayoutBias::packed({1, 1, 1, conv_param.K})}; | |||||
typename Convolution::TensorRefDst tensor_z{ | |||||
const_cast<typename Convolution::ElementDst*>(d_z), | |||||
Convolution::LayoutDst::packed( | |||||
{conv_param.N, conv_param.P, conv_param.Q, conv_param.K})}; | |||||
typename Convolution::TensorRefDst tensor_dst{ | |||||
d_dst, | |||||
Convolution::LayoutDst::packed( | |||||
{conv_param.N, conv_param.P, conv_param.Q, conv_param.K})}; | |||||
typename Convolution::Arguments arguments{conv_param, | |||||
tensor_src.non_const_ref(), | |||||
tensor_filter.non_const_ref(), | |||||
tensor_bias.non_const_ref(), | |||||
tensor_z.non_const_ref(), | |||||
tensor_dst.non_const_ref(), | |||||
epilogue, | |||||
{}, | |||||
{}, | |||||
extra_param}; | |||||
Convolution conv_op; | |||||
cutlass_check(conv_op.initialize(arguments, workspace)); | |||||
cutlass_check(conv_op(stream)); | |||||
after_kernel_launch(); | |||||
} | |||||
// vim: syntax=cuda.doxygen |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int4_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<64>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 128>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
cutlass::int4b_t, 16, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
cutlass::int4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::int4b_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 32, 32, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int4_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<64>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 128>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
cutlass::int4b_t, 16, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
cutlass::int4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::int4b_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 32, 32, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int4_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<64>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 128>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
cutlass::int4b_t, 16, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
cutlass::int4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::int4b_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 32, 32, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int4_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<64>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 128>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
cutlass::int4b_t, 16, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
cutlass::int4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::int4b_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 32, 32, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int4_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<64>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 128>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
cutlass::int4b_t, 16, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
cutlass::int4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::int4b_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 32, 32, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int4_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<64>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 128>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
cutlass::int4b_t, 16, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
cutlass::int4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::int4b_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 32, 32, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int4_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<64>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 128>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
cutlass::uint4b_t, 16, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
cutlass::uint4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::uint4b_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 32, 32, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int4_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<64>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 128>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
cutlass::uint4b_t, 16, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
cutlass::uint4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::uint4b_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 32, 32, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int4_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<64>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 128>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
cutlass::uint4b_t, 16, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
cutlass::uint4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::uint4b_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 32, 32, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int4_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<64>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 128>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
cutlass::uint4b_t, 16, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
cutlass::uint4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::uint4b_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 32, 32, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int4_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<64>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 128>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
cutlass::uint4b_t, 16, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
cutlass::uint4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::uint4b_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 32, 32, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int4_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<64>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 128>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
cutlass::uint4b_t, 16, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
cutlass::uint4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::uint4b_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 32, 32, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,55 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||||
// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x128x128_64x64x128_2_nc64hw64" generated by cutlass generator | |||||
using Convolution = | |||||
typename cutlass::conv::device::Convolution< | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNCxHWx<64>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorCxRSKx<64>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNCxHWx<64>, | |||||
int32_t, | |||||
cutlass::layout::TensorNCxHWx<64>, | |||||
int32_t, | |||||
cutlass::conv::ConvType::kConvolution, | |||||
cutlass::arch::OpClassTensorOp, | |||||
cutlass::arch::Sm75, | |||||
cutlass::gemm::GemmShape<128, 128, 128>, | |||||
cutlass::gemm::GemmShape<64, 64, 128>, | |||||
cutlass::gemm::GemmShape<8, 8, 32>, | |||||
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
cutlass::int4b_t, | |||||
16, | |||||
int32_t, | |||||
int32_t, | |||||
float | |||||
>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, | |||||
32, | |||||
32, | |||||
true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, | |||||
typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,55 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||||
// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64" generated by cutlass generator | |||||
using Convolution = | |||||
typename cutlass::conv::device::Convolution< | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNCxHWx<64>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorCxRSKx<64>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNCxHWx<64>, | |||||
int32_t, | |||||
cutlass::layout::TensorNCxHWx<64>, | |||||
int32_t, | |||||
cutlass::conv::ConvType::kConvolution, | |||||
cutlass::arch::OpClassTensorOp, | |||||
cutlass::arch::Sm75, | |||||
cutlass::gemm::GemmShape<256, 128, 128>, | |||||
cutlass::gemm::GemmShape<64, 64, 128>, | |||||
cutlass::gemm::GemmShape<8, 8, 32>, | |||||
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
cutlass::int4b_t, | |||||
16, | |||||
int32_t, | |||||
int32_t, | |||||
float | |||||
>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, | |||||
32, | |||||
32, | |||||
true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, | |||||
typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,55 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||||
// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x128x128_64x64x128_2_nc64hw64" generated by cutlass generator | |||||
using Convolution = | |||||
typename cutlass::conv::device::Convolution< | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNCxHWx<64>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorCxRSKx<64>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNCxHWx<64>, | |||||
int32_t, | |||||
cutlass::layout::TensorNCxHWx<64>, | |||||
int32_t, | |||||
cutlass::conv::ConvType::kConvolution, | |||||
cutlass::arch::OpClassTensorOp, | |||||
cutlass::arch::Sm75, | |||||
cutlass::gemm::GemmShape<128, 128, 128>, | |||||
cutlass::gemm::GemmShape<64, 64, 128>, | |||||
cutlass::gemm::GemmShape<8, 8, 32>, | |||||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
cutlass::int4b_t, | |||||
16, | |||||
int32_t, | |||||
int32_t, | |||||
float | |||||
>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, | |||||
32, | |||||
32, | |||||
true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, | |||||
typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,55 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||||
// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_256x128x128_64x64x128_2_nc64hw64" generated by cutlass generator | |||||
using Convolution = | |||||
typename cutlass::conv::device::Convolution< | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNCxHWx<64>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorCxRSKx<64>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNCxHWx<64>, | |||||
int32_t, | |||||
cutlass::layout::TensorNCxHWx<64>, | |||||
int32_t, | |||||
cutlass::conv::ConvType::kConvolution, | |||||
cutlass::arch::OpClassTensorOp, | |||||
cutlass::arch::Sm75, | |||||
cutlass::gemm::GemmShape<256, 128, 128>, | |||||
cutlass::gemm::GemmShape<64, 64, 128>, | |||||
cutlass::gemm::GemmShape<8, 8, 32>, | |||||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
cutlass::int4b_t, | |||||
16, | |||||
int32_t, | |||||
int32_t, | |||||
float | |||||
>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, | |||||
32, | |||||
32, | |||||
true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, | |||||
typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,55 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||||
// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x128x128_64x64x128_2_nc64hw64" generated by cutlass generator | |||||
using Convolution = | |||||
typename cutlass::conv::device::Convolution< | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNCxHWx<64>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorCxRSKx<64>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNCxHWx<64>, | |||||
int32_t, | |||||
cutlass::layout::TensorNCxHWx<64>, | |||||
int32_t, | |||||
cutlass::conv::ConvType::kConvolution, | |||||
cutlass::arch::OpClassTensorOp, | |||||
cutlass::arch::Sm75, | |||||
cutlass::gemm::GemmShape<128, 128, 128>, | |||||
cutlass::gemm::GemmShape<64, 64, 128>, | |||||
cutlass::gemm::GemmShape<8, 8, 32>, | |||||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
cutlass::int4b_t, | |||||
16, | |||||
int32_t, | |||||
int32_t, | |||||
float | |||||
>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, | |||||
32, | |||||
32, | |||||
true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, | |||||
typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,55 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||||
// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_256x128x128_64x64x128_2_nc64hw64" generated by cutlass generator | |||||
using Convolution = | |||||
typename cutlass::conv::device::Convolution< | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNCxHWx<64>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorCxRSKx<64>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNCxHWx<64>, | |||||
int32_t, | |||||
cutlass::layout::TensorNCxHWx<64>, | |||||
int32_t, | |||||
cutlass::conv::ConvType::kConvolution, | |||||
cutlass::arch::OpClassTensorOp, | |||||
cutlass::arch::Sm75, | |||||
cutlass::gemm::GemmShape<256, 128, 128>, | |||||
cutlass::gemm::GemmShape<64, 64, 128>, | |||||
cutlass::gemm::GemmShape<8, 8, 32>, | |||||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
cutlass::int4b_t, | |||||
16, | |||||
int32_t, | |||||
int32_t, | |||||
float | |||||
>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, | |||||
32, | |||||
32, | |||||
true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, | |||||
typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,55 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||||
// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64" generated by cutlass generator | |||||
using Convolution = | |||||
typename cutlass::conv::device::Convolution< | |||||
cutlass::uint4b_t, | |||||
cutlass::layout::TensorNCxHWx<64>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorCxRSKx<64>, | |||||
cutlass::uint4b_t, | |||||
cutlass::layout::TensorNCxHWx<64>, | |||||
int32_t, | |||||
cutlass::layout::TensorNCxHWx<64>, | |||||
int32_t, | |||||
cutlass::conv::ConvType::kConvolution, | |||||
cutlass::arch::OpClassTensorOp, | |||||
cutlass::arch::Sm75, | |||||
cutlass::gemm::GemmShape<128, 128, 128>, | |||||
cutlass::gemm::GemmShape<64, 64, 128>, | |||||
cutlass::gemm::GemmShape<8, 8, 32>, | |||||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
cutlass::uint4b_t, | |||||
16, | |||||
int32_t, | |||||
int32_t, | |||||
float | |||||
>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, | |||||
32, | |||||
32, | |||||
true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, | |||||
typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,55 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||||
// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_256x128x128_64x64x128_2_nc64hw64" generated by cutlass generator | |||||
using Convolution = | |||||
typename cutlass::conv::device::Convolution< | |||||
cutlass::uint4b_t, | |||||
cutlass::layout::TensorNCxHWx<64>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorCxRSKx<64>, | |||||
cutlass::uint4b_t, | |||||
cutlass::layout::TensorNCxHWx<64>, | |||||
int32_t, | |||||
cutlass::layout::TensorNCxHWx<64>, | |||||
int32_t, | |||||
cutlass::conv::ConvType::kConvolution, | |||||
cutlass::arch::OpClassTensorOp, | |||||
cutlass::arch::Sm75, | |||||
cutlass::gemm::GemmShape<256, 128, 128>, | |||||
cutlass::gemm::GemmShape<64, 64, 128>, | |||||
cutlass::gemm::GemmShape<8, 8, 32>, | |||||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
cutlass::uint4b_t, | |||||
16, | |||||
int32_t, | |||||
int32_t, | |||||
float | |||||
>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, | |||||
32, | |||||
32, | |||||
true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, | |||||
typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,55 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||||
// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64" generated by cutlass generator | |||||
using Convolution = | |||||
typename cutlass::conv::device::Convolution< | |||||
cutlass::uint4b_t, | |||||
cutlass::layout::TensorNCxHWx<64>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorCxRSKx<64>, | |||||
cutlass::uint4b_t, | |||||
cutlass::layout::TensorNCxHWx<64>, | |||||
int32_t, | |||||
cutlass::layout::TensorNCxHWx<64>, | |||||
int32_t, | |||||
cutlass::conv::ConvType::kConvolution, | |||||
cutlass::arch::OpClassTensorOp, | |||||
cutlass::arch::Sm75, | |||||
cutlass::gemm::GemmShape<128, 128, 128>, | |||||
cutlass::gemm::GemmShape<64, 64, 128>, | |||||
cutlass::gemm::GemmShape<8, 8, 32>, | |||||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
cutlass::uint4b_t, | |||||
16, | |||||
int32_t, | |||||
int32_t, | |||||
float | |||||
>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, | |||||
32, | |||||
32, | |||||
true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, | |||||
typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,55 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||||
// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_256x128x128_64x64x128_2_nc64hw64" generated by cutlass generator | |||||
using Convolution = | |||||
typename cutlass::conv::device::Convolution< | |||||
cutlass::uint4b_t, | |||||
cutlass::layout::TensorNCxHWx<64>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorCxRSKx<64>, | |||||
cutlass::uint4b_t, | |||||
cutlass::layout::TensorNCxHWx<64>, | |||||
int32_t, | |||||
cutlass::layout::TensorNCxHWx<64>, | |||||
int32_t, | |||||
cutlass::conv::ConvType::kConvolution, | |||||
cutlass::arch::OpClassTensorOp, | |||||
cutlass::arch::Sm75, | |||||
cutlass::gemm::GemmShape<256, 128, 128>, | |||||
cutlass::gemm::GemmShape<64, 64, 128>, | |||||
cutlass::gemm::GemmShape<8, 8, 32>, | |||||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
cutlass::uint4b_t, | |||||
16, | |||||
int32_t, | |||||
int32_t, | |||||
float | |||||
>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, | |||||
32, | |||||
32, | |||||
true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, | |||||
typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1 +0,0 @@ | |||||
../implicit_gemm_conv_bias_cutlass_wrapper.cuinl |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
1, 4, 8, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
1, 4, 8, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
1, 4, 8, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 4, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 4, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 4, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
1, 4, 8, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
1, 4, 8, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
1, 4, 8, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 4, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 4, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 4, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// generated by gen_cuda_conv_bias_int8_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||||
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||||
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||||
using LayoutDst = cutlass::layout::TensorNCxHWx<32>; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
int8_t, 4, int32_t, int32_t, float>; | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||||
LayoutDst, int32_t, LayoutDst, int32_t, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | |||||
cutlass::arch::OpMultiplyAddSaturate>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,36 +0,0 @@ | |||||
// Generated instantiation: int8 conv-bias kernel, identity activation
// (saturating clamp only), threadblock tile 128x128x32.
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
// Layouts: src NCxHWx<4> (NC4HW4), filter CxRSKx<4>, dst/bias NCxHWx<32>.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<32>;
// Tiling: 128x128x32 threadblock, 64x32x32 warp, 1x1x4 SIMT int8 instruction
// (presumably dp4a on SM61 — confirm with generator).
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Epilogue: bias add + linear combination, saturating clamp to int8 (no
// nonlinearity); int32 accumulator/bias, float scale factors.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
        int8_t, 4, int32_t, int32_t, float>;
// Device-level implicit-GEMM convolution for SM61 SIMT; trailing
// "2, 4, 16, true" are stage count / alignment / const-mem parameters —
// TODO(review): confirm against the Convolution template declaration.
using Convolution = cutlass::conv::device::Convolution<
        int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
        LayoutDst, int32_t, LayoutDst, int32_t,
        cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation of the host-side launch wrapper for this config.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream, typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,36 +0,0 @@ | |||||
// Generated instantiation: int8 conv-bias kernel, ReLU activation with
// saturating clamp, threadblock tile 128x128x32.
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
// Layouts: src NCxHWx<4> (NC4HW4), filter CxRSKx<4>, dst/bias NCxHWx<32>.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<32>;
// Tiling: 128x128x32 threadblock, 64x32x32 warp, 1x1x4 SIMT int8 instruction
// (presumably dp4a on SM61 — confirm with generator).
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Epilogue: bias add + linear combination, ReLU, saturating clamp to int8;
// int32 accumulator/bias, float scale factors.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
        int8_t, 4, int32_t, int32_t, float>;
// Device-level implicit-GEMM convolution for SM61 SIMT; trailing
// "2, 4, 16, true" are stage count / alignment / const-mem parameters —
// TODO(review): confirm against the Convolution template declaration.
using Convolution = cutlass::conv::device::Convolution<
        int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
        LayoutDst, int32_t, LayoutDst, int32_t,
        cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation of the host-side launch wrapper for this config.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream, typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,36 +0,0 @@ | |||||
// Generated instantiation: int8 conv-bias kernel, HSwish activation with
// saturating clamp, threadblock tile 128x32x32.
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
// Layouts: src NCxHWx<4> (NC4HW4), filter CxRSKx<4>, dst/bias NCxHWx<32>.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<32>;
// Tiling: 128x32x32 threadblock, 64x32x32 warp, 1x1x4 SIMT int8 instruction
// (presumably dp4a on SM61 — confirm with generator).
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Epilogue: bias add + linear combination, HSwish, saturating clamp to int8;
// int32 accumulator/bias, float scale factors.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
        int8_t, 4, int32_t, int32_t, float>;
// Device-level implicit-GEMM convolution for SM61 SIMT; trailing
// "2, 4, 16, true" are stage count / alignment / const-mem parameters —
// TODO(review): confirm against the Convolution template declaration.
using Convolution = cutlass::conv::device::Convolution<
        int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
        LayoutDst, int32_t, LayoutDst, int32_t,
        cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation of the host-side launch wrapper for this config.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream, typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,36 +0,0 @@ | |||||
// Generated instantiation: int8 conv-bias kernel, identity activation
// (saturating clamp only), threadblock tile 128x32x32.
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
// Layouts: src NCxHWx<4> (NC4HW4), filter CxRSKx<4>, dst/bias NCxHWx<32>.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<32>;
// Tiling: 128x32x32 threadblock, 64x32x32 warp, 1x1x4 SIMT int8 instruction
// (presumably dp4a on SM61 — confirm with generator).
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Epilogue: bias add + linear combination, saturating clamp to int8 (no
// nonlinearity); int32 accumulator/bias, float scale factors.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
        int8_t, 4, int32_t, int32_t, float>;
// Device-level implicit-GEMM convolution for SM61 SIMT; trailing
// "2, 4, 16, true" are stage count / alignment / const-mem parameters —
// TODO(review): confirm against the Convolution template declaration.
using Convolution = cutlass::conv::device::Convolution<
        int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
        LayoutDst, int32_t, LayoutDst, int32_t,
        cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation of the host-side launch wrapper for this config.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream, typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,36 +0,0 @@ | |||||
// Generated instantiation: int8 conv-bias kernel, ReLU activation with
// saturating clamp, threadblock tile 128x32x32.
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
// Layouts: src NCxHWx<4> (NC4HW4), filter CxRSKx<4>, dst/bias NCxHWx<32>.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<32>;
// Tiling: 128x32x32 threadblock, 64x32x32 warp, 1x1x4 SIMT int8 instruction
// (presumably dp4a on SM61 — confirm with generator).
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Epilogue: bias add + linear combination, ReLU, saturating clamp to int8;
// int32 accumulator/bias, float scale factors.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
        int8_t, 4, int32_t, int32_t, float>;
// Device-level implicit-GEMM convolution for SM61 SIMT; trailing
// "2, 4, 16, true" are stage count / alignment / const-mem parameters —
// TODO(review): confirm against the Convolution template declaration.
using Convolution = cutlass::conv::device::Convolution<
        int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
        LayoutDst, int32_t, LayoutDst, int32_t,
        cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation of the host-side launch wrapper for this config.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream, typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,36 +0,0 @@ | |||||
// Generated instantiation: int8 conv-bias kernel, HSwish activation with
// saturating clamp, threadblock tile 128x64x32.
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
// Layouts: src NCxHWx<4> (NC4HW4), filter CxRSKx<4>, dst/bias NCxHWx<32>.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<32>;
// Tiling: 128x64x32 threadblock, 64x32x32 warp, 1x1x4 SIMT int8 instruction
// (presumably dp4a on SM61 — confirm with generator).
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Epilogue: bias add + linear combination, HSwish, saturating clamp to int8;
// int32 accumulator/bias, float scale factors.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
        int8_t, 4, int32_t, int32_t, float>;
// Device-level implicit-GEMM convolution for SM61 SIMT; trailing
// "2, 4, 16, true" are stage count / alignment / const-mem parameters —
// TODO(review): confirm against the Convolution template declaration.
using Convolution = cutlass::conv::device::Convolution<
        int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
        LayoutDst, int32_t, LayoutDst, int32_t,
        cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation of the host-side launch wrapper for this config.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream, typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,36 +0,0 @@ | |||||
// Generated instantiation: int8 conv-bias kernel, identity activation
// (saturating clamp only), threadblock tile 128x64x32.
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
// Layouts: src NCxHWx<4> (NC4HW4), filter CxRSKx<4>, dst/bias NCxHWx<32>.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<32>;
// Tiling: 128x64x32 threadblock, 64x32x32 warp, 1x1x4 SIMT int8 instruction
// (presumably dp4a on SM61 — confirm with generator).
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Epilogue: bias add + linear combination, saturating clamp to int8 (no
// nonlinearity); int32 accumulator/bias, float scale factors.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
        int8_t, 4, int32_t, int32_t, float>;
// Device-level implicit-GEMM convolution for SM61 SIMT; trailing
// "2, 4, 16, true" are stage count / alignment / const-mem parameters —
// TODO(review): confirm against the Convolution template declaration.
using Convolution = cutlass::conv::device::Convolution<
        int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
        LayoutDst, int32_t, LayoutDst, int32_t,
        cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation of the host-side launch wrapper for this config.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream, typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif