@@ -3,6 +3,7 @@ | |||||
dnn/src/cuda/conv_bias/int8/kimpl/* binary | dnn/src/cuda/conv_bias/int8/kimpl/* binary | ||||
dnn/src/cuda/conv_bias/int8_imma/kimpl/* binary | dnn/src/cuda/conv_bias/int8_imma/kimpl/* binary | ||||
dnn/src/cuda/batch_conv_bias/int8/kimpl/* binary | dnn/src/cuda/batch_conv_bias/int8/kimpl/* binary | ||||
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/* binary | |||||
dnn/src/cuda/sass/prebuilt/map_defs.cpp binary | dnn/src/cuda/sass/prebuilt/map_defs.cpp binary | ||||
tools/mlir/mlir-tblgen filter=lfs diff=lfs merge=lfs -text | tools/mlir/mlir-tblgen filter=lfs diff=lfs merge=lfs -text | ||||
*.caffemodel filter=lfs diff=lfs merge=lfs -text | *.caffemodel filter=lfs diff=lfs merge=lfs -text | ||||
@@ -8,9 +8,10 @@ ELEMWISE_IMPL := ../src/cuda/cond_take/kimpl \ | |||||
../src/naive/elemwise/kimpl \ | ../src/naive/elemwise/kimpl \ | ||||
../src/cuda/elemwise_multi_type/kimpl | ../src/cuda/elemwise_multi_type/kimpl | ||||
CUDA_CONV_IMPL := ../src/cuda/conv_bias/int8/kimpl ../src/cuda/conv_bias/int8_imma/kimpl ../src/cuda/batch_conv_bias/int8/kimpl | |||||
CUDA_CONV_IMPL := ../src/cuda/conv_bias/int8/kimpl ../src/cuda/conv_bias/int8_imma/kimpl ../src/cuda/batch_conv_bias/int8/kimpl | |||||
CUDA_MATMUL_KIMPL := ../src/cuda/matrix_mul/fp32_simt/kimpl | |||||
all: ${PARAM_DEFS} ${ELEMWISE_IMPL} ${CUDA_CONV_IMPL} | |||||
all: ${PARAM_DEFS} ${ELEMWISE_IMPL} ${CUDA_CONV_IMPL} $(CUDA_MATMUL_KIMPL) | |||||
../src/common/elemwise/each_mode.inl: gen_elemwise_each_mode.py | ../src/common/elemwise/each_mode.inl: gen_elemwise_each_mode.py | ||||
./$^ $@ | ./$^ $@ | ||||
@@ -47,4 +48,7 @@ all: ${PARAM_DEFS} ${ELEMWISE_IMPL} ${CUDA_CONV_IMPL} | |||||
../src/cuda/batch_conv_bias/int8/kimpl: gen_cuda_batch_conv_bias_kern_impls.py | ../src/cuda/batch_conv_bias/int8/kimpl: gen_cuda_batch_conv_bias_kern_impls.py | ||||
./$^ --type dp4a $@ | ./$^ --type dp4a $@ | ||||
../src/cuda/matrix_mul/fp32_simt/kimpl: gen_cutlass_matmul_kern_impls.py | |||||
./$^ $@ | |||||
.PHONY: all | .PHONY: all |
@@ -33,12 +33,37 @@ MatrixMulForwardImpl::AlgoPack::AlgoPack() { | |||||
#if !MEGDNN_DISABLE_FLOAT16 | #if !MEGDNN_DISABLE_FLOAT16 | ||||
all_algos.push_back(&bfloat16); | all_algos.push_back(&bfloat16); | ||||
#endif | #endif | ||||
fill_cutlass_algos(); | |||||
for (auto&& algo : simt_float32) { | |||||
all_algos.push_back(&algo); | |||||
} | |||||
for (auto&& algo : all_algos) { | for (auto&& algo : all_algos) { | ||||
m_all_algos_map.emplace(algo->info().desc, algo); | m_all_algos_map.emplace(algo->info().desc, algo); | ||||
} | } | ||||
} | } | ||||
void MatrixMulForwardImpl::AlgoPack::fill_cutlass_algos() { | |||||
using AlgoParam = AlgoFloat32SIMT::AlgoParam; | |||||
simt_float32.emplace_back(AlgoParam{64, 256, 8, 32, 64, 8}); | |||||
simt_float32.emplace_back(AlgoParam{256, 64, 8, 64, 32, 8}); | |||||
simt_float32.emplace_back(AlgoParam{32, 256, 8, 16, 64, 8}); | |||||
simt_float32.emplace_back(AlgoParam{256, 32, 8, 64, 16, 8}); | |||||
simt_float32.emplace_back(AlgoParam{128, 128, 8, 32, 64, 8}); | |||||
simt_float32.emplace_back(AlgoParam{128, 64, 8, 64, 32, 8}); | |||||
simt_float32.emplace_back(AlgoParam{64, 128, 8, 32, 64, 8}); | |||||
simt_float32.emplace_back(AlgoParam{128, 32, 8, 64, 32, 8}); | |||||
simt_float32.emplace_back(AlgoParam{32, 128, 8, 32, 64, 8}); | |||||
simt_float32.emplace_back(AlgoParam{64, 64, 8, 32, 64, 8}); | |||||
simt_float32.emplace_back(AlgoParam{32, 64, 8, 32, 64, 8}); | |||||
simt_float32.emplace_back(AlgoParam{64, 32, 8, 64, 32, 8}); | |||||
simt_float32.emplace_back(AlgoParam{32, 32, 8, 32, 32, 8}); | |||||
simt_float32.emplace_back(AlgoParam{8, 32, 8, 8, 32, 8}); | |||||
simt_float32.emplace_back(AlgoParam{16, 32, 8, 16, 32, 8}); | |||||
simt_float32.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8}); | |||||
simt_float32.emplace_back(AlgoParam{16, 128, 8, 16, 64, 8}); | |||||
} | |||||
MatrixMulForwardImpl::AlgoPack MatrixMulForwardImpl::sm_algo_pack; | MatrixMulForwardImpl::AlgoPack MatrixMulForwardImpl::sm_algo_pack; | ||||
MEGDNN_DEF_GET_ALGO_FROM_DESC(MatrixMulForwardImpl) | MEGDNN_DEF_GET_ALGO_FROM_DESC(MatrixMulForwardImpl) | ||||
@@ -41,7 +41,8 @@ public: | |||||
CUDA_WMMA_UINT4X4X32, | CUDA_WMMA_UINT4X4X32, | ||||
CUDA_CUBLASLT, | CUDA_CUBLASLT, | ||||
CUDA_NAIVE, | CUDA_NAIVE, | ||||
CUDA_BFLOAT16 | |||||
CUDA_BFLOAT16, | |||||
CUDA_FLOAT32_SIMT, | |||||
}; | }; | ||||
using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>; | using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>; | ||||
@@ -165,6 +166,38 @@ private: | |||||
}; | }; | ||||
#endif | #endif | ||||
/**
 * Matrix multiplication algorithm parameterised by a threadblock tile shape
 * and a warp tile shape (see AlgoParam). The algorithm name is derived from
 * those six integers, so every configuration gets a distinct name.
 */
class MatrixMulForwardImpl::AlgoFloat32SIMT final : public AlgoBase {
public:
    //! Tile configuration: threadblock-level and warp-level m/n/k extents.
    struct AlgoParam {
        int threadblock_m, threadblock_n, threadblock_k;
        int warp_m, warp_n, warp_k;
        //! Format as "<tb_m>X<tb_n>X<tb_k>_<warp_m>X<warp_n>X<warp_k>".
        //! Const because it only reads the members, which lets it be called
        //! on const AlgoParam objects as well.
        std::string to_string() const {
            return ssprintf("%dX%dX%d_%dX%dX%d", threadblock_m, threadblock_n,
                            threadblock_k, warp_m, warp_n, warp_k);
        }
    };
    //! \param algo_param tile configuration this instance dispatches to.
    //! m_name is built from m_algo_param, which is declared (and therefore
    //! initialised) before m_name; keep that member order.
    AlgoFloat32SIMT(AlgoParam algo_param)
            : m_algo_param{algo_param},
              m_name{ssprintf("CUTLASS_FLOAT32_SIMT_%s",
                              m_algo_param.to_string().c_str())} {}
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    //! Name of the form "CUTLASS_FLOAT32_SIMT_<AlgoParam::to_string()>".
    const char* name() const override { return m_name.c_str(); }
    void exec(const ExecArgs& args) const override;
    bool is_reproducible() const override { return true; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_FLOAT32_SIMT)

    //! Serialised form of the tile configuration (written with
    //! serialize_write_pod).
    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_algo_param, ret);
        return ret;
    }

private:
    AlgoParam m_algo_param;
    std::string m_name;
};
class MatrixMulForwardImpl::AlgoPack : NonCopyableObj { | class MatrixMulForwardImpl::AlgoPack : NonCopyableObj { | ||||
private: | private: | ||||
AlgoBase::Mapper m_all_algos_map; | AlgoBase::Mapper m_all_algos_map; | ||||
@@ -182,9 +215,11 @@ public: | |||||
#if !MEGDNN_DISABLE_FLOAT16 | #if !MEGDNN_DISABLE_FLOAT16 | ||||
AlgoBFloat16 bfloat16; | AlgoBFloat16 bfloat16; | ||||
#endif | #endif | ||||
std::vector<AlgoFloat32SIMT> simt_float32; | |||||
std::vector<AlgoBase*> all_algos; | std::vector<AlgoBase*> all_algos; | ||||
const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; } | const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; } | ||||
void fill_cutlass_algos(); | |||||
}; | }; | ||||
} // namespace cuda | } // namespace cuda | ||||
@@ -0,0 +1,73 @@ | |||||
/** | |||||
* \file dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp | |||||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
* | |||||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
* | |||||
* Unless required by applicable law or agreed to in writing, | |||||
* software distributed under the License is distributed on an | |||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
* implied. | |||||
*/ | |||||
#include "src/cuda/handle.h" | |||||
#include "src/cuda/matrix_mul/algos.h" | |||||
#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh" | |||||
#include "src/cuda/utils.h" | |||||
using namespace megdnn; | |||||
using namespace cuda; | |||||
using namespace cutlass_wrapper; | |||||
bool MatrixMulForwardImpl::AlgoFloat32SIMT::is_available( | |||||
const SizeArgs& args) const { | |||||
#if CUDA_VERSION >= 9200 | |||||
return args.opr->param().format == param::MatrixMul::Format::DEFAULT && | |||||
args.layout_a.dtype == dtype::Float32() && | |||||
args.layout_b.dtype == dtype::Float32() && | |||||
args.layout_c.dtype == dtype::Float32(); | |||||
#else | |||||
return false; | |||||
#endif | |||||
} | |||||
size_t MatrixMulForwardImpl::AlgoFloat32SIMT::get_workspace_in_bytes( | |||||
const SizeArgs& args) const { | |||||
size_t lda = args.layout_a.stride[0], ldb = args.layout_b.stride[0], | |||||
ldc = args.layout_c.stride[0]; | |||||
auto&& param = args.opr->param(); | |||||
int m = args.layout_c.shape[0], n = args.layout_c.shape[1], | |||||
k = args.layout_a.shape[param.transposeA ? 0 : 1]; | |||||
GemmCoord problem_size{m, n, k}; | |||||
return cutlass_matrix_mul_float32_simt_get_workspace_size( | |||||
param.transposeA, lda, param.transposeB, ldb, ldc, problem_size, | |||||
1.f, 0.f, | |||||
GemmCoord{m_algo_param.threadblock_m, m_algo_param.threadblock_n, | |||||
m_algo_param.threadblock_k}, | |||||
GemmCoord{m_algo_param.warp_m, m_algo_param.warp_n, | |||||
m_algo_param.warp_k}); | |||||
} | |||||
void MatrixMulForwardImpl::AlgoFloat32SIMT::exec(const ExecArgs& args) const { | |||||
size_t lda = args.tensor_a.layout.stride[0], | |||||
ldb = args.tensor_b.layout.stride[0], | |||||
ldc = args.tensor_c.layout.stride[0]; | |||||
auto&& param = args.opr->param(); | |||||
int m = args.tensor_c.layout.shape[0], n = args.tensor_c.layout.shape[1], | |||||
k = args.tensor_a.layout.shape[param.transposeA ? 0 : 1]; | |||||
GemmCoord problem_size{m, n, k}; | |||||
auto&& stream = cuda_stream(args.opr->handle()); | |||||
int* workspace = reinterpret_cast<int*>(args.workspace.raw_ptr); | |||||
return cutlass_matrix_mul_float32_simt( | |||||
args.tensor_a.ptr<dt_float32>(), param.transposeA, lda, | |||||
args.tensor_b.ptr<dt_float32>(), param.transposeB, ldb, | |||||
args.tensor_c.ptr<dt_float32>(), ldc, workspace, problem_size, 1.f, | |||||
0.f, | |||||
GemmCoord{m_algo_param.threadblock_m, m_algo_param.threadblock_n, | |||||
m_algo_param.threadblock_k}, | |||||
GemmCoord{m_algo_param.warp_m, m_algo_param.warp_n, | |||||
m_algo_param.warp_k}, | |||||
stream); | |||||
} | |||||
// vim: syntax=cpp.doxygen |
@@ -0,0 +1,200 @@ | |||||
/** | |||||
* \file dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cu | |||||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
* | |||||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
* | |||||
* Unless required by applicable law or agreed to in writing, | |||||
* software distributed under the License is distributed on an | |||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
* implied. | |||||
*/ | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "cuda.h" | |||||
#if __CUDACC_VER_MAJOR__ > 9 || \ | |||||
(__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
#include "cutlass/gemm/device/gemm.h" | |||||
#endif | |||||
#include "src/common/opr_param_defs_enumv.cuh" | |||||
#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh" | |||||
#pragma GCC diagnostic pop | |||||
using namespace megdnn; | |||||
using namespace cuda; | |||||
using namespace cutlass_wrapper; | |||||
/* ================= cutlass kernel wrapper for f32 matrix mul ================
 */
/*
 * DISPATCH(cb) expands to one `cb(tb_m, tb_n, tb_k, warp_m, warp_n, warp_k);`
 * statement per supported tile configuration, followed by a failing
 * megdnn_assert that is reached only when no cb invocation returned.
 *
 * The expansion site must have `threadblock_shape` and `warp_shape` in scope.
 * The list holds the same 17 configurations, in the same order, as
 * MatrixMulForwardImpl::AlgoPack::fill_cutlass_algos().
 */
#define DISPATCH(cb)                                                        \
    cb(64, 256, 8, 32, 64, 8);                                              \
    cb(256, 64, 8, 64, 32, 8);                                              \
    cb(32, 256, 8, 16, 64, 8);                                              \
    cb(256, 32, 8, 64, 16, 8);                                              \
    cb(128, 128, 8, 32, 64, 8);                                             \
    cb(128, 64, 8, 64, 32, 8);                                              \
    cb(64, 128, 8, 32, 64, 8);                                              \
    cb(128, 32, 8, 64, 32, 8);                                              \
    cb(32, 128, 8, 32, 64, 8);                                              \
    cb(64, 64, 8, 32, 64, 8);                                               \
    cb(32, 64, 8, 32, 64, 8);                                               \
    cb(64, 32, 8, 64, 32, 8);                                               \
    cb(32, 32, 8, 32, 32, 8);                                               \
    cb(8, 32, 8, 8, 32, 8);                                                 \
    cb(16, 32, 8, 16, 32, 8);                                               \
    cb(16, 64, 8, 16, 64, 8);                                               \
    cb(16, 128, 8, 16, 64, 8);                                              \
    megdnn_assert(false,                                                    \
                  "unsupported threadblock shape (%dx%dx%d) and warp shape " \
                  "(%dx%dx%d)",                                             \
                  threadblock_shape.m(), threadblock_shape.n(),             \
                  threadblock_shape.k(), warp_shape.m(), warp_shape.n(),    \
                  warp_shape.k());
#if __CUDACC_VER_MAJOR__ < 9 || \ | |||||
(__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ <= 2) | |||||
void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_float32_simt( | |||||
const float* /* d_A */, bool /* transpose_A */, size_t /* lda */, | |||||
const float* /* d_B */, bool /* transpose_B */, size_t /* ldb */, | |||||
float* /* d_C */, size_t /* ldc */, int* /* workspace */, | |||||
GemmCoord const& /* problem_size */, float /* alpha */, | |||||
float /* beta */, const GemmCoord& /* threadblock_shape */, | |||||
const GemmCoord& /* warp_shape */, cudaStream_t /* stream */) {} | |||||
#else | |||||
void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_float32_simt( | |||||
const float* d_A, bool transpose_A, size_t lda, const float* d_B, | |||||
bool transpose_B, size_t ldb, float* d_C, size_t ldc, int* workspace, | |||||
GemmCoord const& problem_size, float alpha, float beta, | |||||
const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, | |||||
cudaStream_t stream) { | |||||
#define cb(threadblock_m_, threadblock_n_, threadblock_k_, warp_m_, warp_n_, \ | |||||
warp_k_) \ | |||||
if (threadblock_shape.m() == threadblock_m_ && \ | |||||
threadblock_shape.n() == threadblock_n_ && \ | |||||
threadblock_shape.k() == threadblock_k_ && \ | |||||
warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \ | |||||
warp_shape.k() == warp_k_) { \ | |||||
using ThreadBlockShape = \ | |||||
cutlass::gemm::GemmShape<threadblock_m_, threadblock_n_, \ | |||||
threadblock_k_>; \ | |||||
using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; \ | |||||
using Gemm = cutlass::gemm::device::Gemm< \ | |||||
float, LayoutA, float, LayoutB, float, \ | |||||
cutlass::layout::RowMajor, float, cutlass::arch::OpClassSimt, \ | |||||
cutlass::arch::Sm50, ThreadBlockShape, WarpShape, \ | |||||
InstructionShape, EpilogueOp, \ | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, \ | |||||
2>; \ | |||||
return cutlass_matrix_mul_wrapper<Gemm>(d_A, lda, d_B, ldb, d_C, ldc, \ | |||||
workspace, problem_size, \ | |||||
epilogue, stream); \ | |||||
} | |||||
static constexpr int kEpilogueElementsPerAccess = 1; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination< | |||||
float, kEpilogueElementsPerAccess, float, float>; | |||||
typename EpilogueOp::Params epilogue{alpha, beta}; | |||||
if (!transpose_A && !transpose_B) { | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
DISPATCH(cb) | |||||
} else if (!transpose_A && transpose_B) { | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
DISPATCH(cb) | |||||
} else if (transpose_A && !transpose_B) { | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
DISPATCH(cb) | |||||
} else { | |||||
megdnn_assert(transpose_A && transpose_B); | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
DISPATCH(cb) | |||||
} | |||||
#undef cb | |||||
} | |||||
#endif | |||||
#if __CUDACC_VER_MAJOR__ < 9 || \ | |||||
(__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ <= 2) | |||||
size_t megdnn::cuda::cutlass_wrapper:: | |||||
cutlass_matrix_mul_float32_simt_get_workspace_size( | |||||
bool /* transpose_A */, size_t /* lda */, | |||||
bool /* transpose_B */, size_t /* ldb */, size_t /* ldc */, | |||||
GemmCoord const& /* problem_size */, float /* alpha */, | |||||
float /* beta */, const GemmCoord& /* threadblock_shape */, | |||||
const GemmCoord& /* warp_shape */) { | |||||
return 0; | |||||
} | |||||
#else | |||||
size_t megdnn::cuda::cutlass_wrapper:: | |||||
cutlass_matrix_mul_float32_simt_get_workspace_size( | |||||
bool transpose_A, size_t lda, bool transpose_B, size_t ldb, | |||||
size_t ldc, GemmCoord const& problem_size, float alpha, | |||||
float beta, const GemmCoord& threadblock_shape, | |||||
const GemmCoord& warp_shape) { | |||||
#define cb(threadblock_m_, threadblock_n_, threadblock_k_, warp_m_, warp_n_, \ | |||||
warp_k_) \ | |||||
if (threadblock_shape.m() == threadblock_m_ && \ | |||||
threadblock_shape.n() == threadblock_n_ && \ | |||||
threadblock_shape.k() == threadblock_k_ && \ | |||||
warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \ | |||||
warp_shape.k() == warp_k_) { \ | |||||
using ThreadBlockShape = \ | |||||
cutlass::gemm::GemmShape<threadblock_m_, threadblock_n_, \ | |||||
threadblock_k_>; \ | |||||
using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; \ | |||||
using Gemm = cutlass::gemm::device::Gemm< \ | |||||
float, LayoutA, float, LayoutB, float, \ | |||||
cutlass::layout::RowMajor, float, cutlass::arch::OpClassSimt, \ | |||||
cutlass::arch::Sm50, ThreadBlockShape, WarpShape, \ | |||||
InstructionShape, EpilogueOp, \ | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, \ | |||||
2>; \ | |||||
typename Gemm::TensorRefA tensor_A{ \ | |||||
nullptr, Gemm::LayoutA{static_cast<int>(lda)}}; \ | |||||
typename Gemm::TensorRefB tensor_B{ \ | |||||
nullptr, Gemm::LayoutB{static_cast<int>(ldb)}}; \ | |||||
typename Gemm::TensorRefC tensor_C{ \ | |||||
nullptr, Gemm::LayoutC{static_cast<int>(ldc)}}; \ | |||||
typename Gemm::TensorRefD tensor_D{ \ | |||||
nullptr, Gemm::LayoutC{static_cast<int>(ldc)}}; \ | |||||
typename Gemm::Arguments arguments{problem_size, tensor_A, tensor_B, \ | |||||
tensor_C, tensor_D, epilogue, \ | |||||
split_k_slices}; \ | |||||
return Gemm::get_workspace_size(arguments); \ | |||||
} | |||||
static constexpr int kEpilogueElementsPerAccess = 1; | |||||
static constexpr int split_k_slices = 1; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination< | |||||
float, kEpilogueElementsPerAccess, float, float>; | |||||
typename EpilogueOp::Params epilogue{alpha, beta}; | |||||
if (!transpose_A && !transpose_B) { | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
DISPATCH(cb) | |||||
} else if (!transpose_A && transpose_B) { | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
DISPATCH(cb) | |||||
} else if (transpose_A && !transpose_B) { | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
DISPATCH(cb) | |||||
} else { | |||||
megdnn_assert(transpose_A && transpose_B); | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
DISPATCH(cb) | |||||
} | |||||
#undef cb | |||||
} | |||||
#endif | |||||
#undef DISPATCH | |||||
// vim: syntax=cuda.doxygen |
@@ -0,0 +1,47 @@ | |||||
/** | |||||
* \file dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh | |||||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
* | |||||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
* | |||||
* Unless required by applicable law or agreed to in writing, | |||||
* software distributed under the License is distributed on an | |||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
* implied. | |||||
*/ | |||||
#pragma once | |||||
#include "cutlass/gemm/gemm.h" | |||||
#include "src/cuda/utils.cuh" | |||||
namespace megdnn { | |||||
namespace cuda { | |||||
namespace cutlass_wrapper { | |||||
using GemmCoord = cutlass::gemm::GemmCoord; | |||||
template <typename Gemm> | |||||
void cutlass_matrix_mul_wrapper( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, int* workspace, | |||||
GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
void cutlass_matrix_mul_float32_simt( | |||||
const float* d_A, bool transpose_A, size_t lda, const float* d_B, | |||||
bool transpose_B, size_t ldb, float* d_C, size_t ldc, int* workspace, | |||||
GemmCoord const& problem_size, float alpha, float beta, | |||||
const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, | |||||
cudaStream_t stream); | |||||
size_t cutlass_matrix_mul_float32_simt_get_workspace_size( | |||||
bool transpose_A, size_t lda, bool transpose_B, size_t ldb, size_t ldc, | |||||
GemmCoord const& problem_size, float alpha, float beta, | |||||
const GemmCoord& threadblock_shape, const GemmCoord& warp_shape); | |||||
} // namespace cutlass_wrapper | |||||
} // namespace cuda | |||||
} // namespace megdnn | |||||
// vim: syntax=cuda.doxygen |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

// Configuration instantiated by this file:
//   A: RowMajor, B: RowMajor, threadblock 128x128x8, warp 32x64x8
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::RowMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>;
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::Gemm<
        float, LayoutA,                            // operand A
        float, LayoutB,                            // operand B
        float, cutlass::layout::RowMajor, float,   // operand C, accumulator
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
        2>;

// Explicit instantiation of the typed wrapper for the Gemm type above.
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const typename Gemm::ElementA*, size_t,
        const typename Gemm::ElementB*, size_t,
        typename Gemm::ElementC*, size_t,
        int*,
        cutlass::gemm::GemmCoord const&,
        typename Gemm::EpilogueOutputOp::Params const&,
        cudaStream_t);
#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

// Configuration instantiated by this file:
//   A: RowMajor, B: ColumnMajor, threadblock 128x128x8, warp 32x64x8
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>;
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::Gemm<
        float, LayoutA,                            // operand A
        float, LayoutB,                            // operand B
        float, cutlass::layout::RowMajor, float,   // operand C, accumulator
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
        2>;

// Explicit instantiation of the typed wrapper for the Gemm type above.
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const typename Gemm::ElementA*, size_t,
        const typename Gemm::ElementB*, size_t,
        typename Gemm::ElementC*, size_t,
        int*,
        cutlass::gemm::GemmCoord const&,
        typename Gemm::EpilogueOutputOp::Params const&,
        cudaStream_t);
#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

// Configuration instantiated by this file:
//   A: ColumnMajor, B: RowMajor, threadblock 128x128x8, warp 32x64x8
using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::RowMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>;
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::Gemm<
        float, LayoutA,                            // operand A
        float, LayoutB,                            // operand B
        float, cutlass::layout::RowMajor, float,   // operand C, accumulator
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
        2>;

// Explicit instantiation of the typed wrapper for the Gemm type above.
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const typename Gemm::ElementA*, size_t,
        const typename Gemm::ElementB*, size_t,
        typename Gemm::ElementC*, size_t,
        int*,
        cutlass::gemm::GemmCoord const&,
        typename Gemm::EpilogueOutputOp::Params const&,
        cudaStream_t);
#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

// Configuration instantiated by this file:
//   A: ColumnMajor, B: ColumnMajor, threadblock 128x128x8, warp 32x64x8
using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>;
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::Gemm<
        float, LayoutA,                            // operand A
        float, LayoutB,                            // operand B
        float, cutlass::layout::RowMajor, float,   // operand C, accumulator
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
        2>;

// Explicit instantiation of the typed wrapper for the Gemm type above.
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const typename Gemm::ElementA*, size_t,
        const typename Gemm::ElementB*, size_t,
        typename Gemm::ElementC*, size_t,
        int*,
        cutlass::gemm::GemmCoord const&,
        typename Gemm::EpilogueOutputOp::Params const&,
        cudaStream_t);
#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

// Configuration instantiated by this file:
//   A: RowMajor, B: RowMajor, threadblock 128x32x8, warp 64x32x8
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::RowMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::Gemm<
        float, LayoutA,                            // operand A
        float, LayoutB,                            // operand B
        float, cutlass::layout::RowMajor, float,   // operand C, accumulator
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
        2>;

// Explicit instantiation of the typed wrapper for the Gemm type above.
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const typename Gemm::ElementA*, size_t,
        const typename Gemm::ElementB*, size_t,
        typename Gemm::ElementC*, size_t,
        int*,
        cutlass::gemm::GemmCoord const&,
        typename Gemm::EpilogueOutputOp::Params const&,
        cudaStream_t);
#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

// Configuration instantiated by this file:
//   A: RowMajor, B: ColumnMajor, threadblock 128x32x8, warp 64x32x8
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::Gemm<
        float, LayoutA,                            // operand A
        float, LayoutB,                            // operand B
        float, cutlass::layout::RowMajor, float,   // operand C, accumulator
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
        2>;

// Explicit instantiation of the typed wrapper for the Gemm type above.
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const typename Gemm::ElementA*, size_t,
        const typename Gemm::ElementB*, size_t,
        typename Gemm::ElementC*, size_t,
        int*,
        cutlass::gemm::GemmCoord const&,
        typename Gemm::EpilogueOutputOp::Params const&,
        cudaStream_t);
#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

// Configuration instantiated by this file:
//   A: ColumnMajor, B: RowMajor, threadblock 128x32x8, warp 64x32x8
using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::RowMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::Gemm<
        float, LayoutA,                            // operand A
        float, LayoutB,                            // operand B
        float, cutlass::layout::RowMajor, float,   // operand C, accumulator
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
        2>;

// Explicit instantiation of the typed wrapper for the Gemm type above.
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const typename Gemm::ElementA*, size_t,
        const typename Gemm::ElementB*, size_t,
        typename Gemm::ElementC*, size_t,
        int*,
        cutlass::gemm::GemmCoord const&,
        typename Gemm::EpilogueOutputOp::Params const&,
        cudaStream_t);
#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
// Explicit instantiation of cutlass_matrix_mul_wrapper for one float SIMT GEMM
// configuration: LayoutA = ColumnMajor, LayoutB = ColumnMajor, RowMajor C,
// ThreadBlockShape = GemmShape<128, 32, 8>, WarpShape = GemmShape<64, 32, 8>.
// The guard compiles this file's contents only with CUDA compiler 9.2 or newer.
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matmul_kern_impls.py (the script the Makefile rule for fp32_simt/kimpl runs) | |||||
// suppress these GCC warnings while compiling the cutlass headers; restored by the pop below | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
// Kernel configuration for this instantiation (operand layouts and tile shapes).
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
// NOTE(review): argument order follows cutlass::gemm::device::Gemm; the trailing 2 is
// presumably the pipeline stage count -- confirm against the vendored cutlass headers.
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
// Explicit instantiation definition: emits the wrapper's code for this Gemm type in
// this translation unit (the template is presumably defined in the .cuinl included above).
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
// Explicit instantiation of cutlass_matrix_mul_wrapper for one float SIMT GEMM
// configuration: LayoutA = RowMajor, LayoutB = RowMajor, RowMajor C,
// ThreadBlockShape = GemmShape<128, 64, 8>, WarpShape = GemmShape<64, 32, 8>.
// The guard compiles this file's contents only with CUDA compiler 9.2 or newer.
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matmul_kern_impls.py (the script the Makefile rule for fp32_simt/kimpl runs) | |||||
// suppress these GCC warnings while compiling the cutlass headers; restored by the pop below | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
// Kernel configuration for this instantiation (operand layouts and tile shapes).
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
// NOTE(review): argument order follows cutlass::gemm::device::Gemm; the trailing 2 is
// presumably the pipeline stage count -- confirm against the vendored cutlass headers.
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
// Explicit instantiation definition: emits the wrapper's code for this Gemm type in
// this translation unit (the template is presumably defined in the .cuinl included above).
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
// Explicit instantiation of cutlass_matrix_mul_wrapper for one float SIMT GEMM
// configuration: LayoutA = RowMajor, LayoutB = ColumnMajor, RowMajor C,
// ThreadBlockShape = GemmShape<128, 64, 8>, WarpShape = GemmShape<64, 32, 8>.
// The guard compiles this file's contents only with CUDA compiler 9.2 or newer.
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matmul_kern_impls.py (the script the Makefile rule for fp32_simt/kimpl runs) | |||||
// suppress these GCC warnings while compiling the cutlass headers; restored by the pop below | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
// Kernel configuration for this instantiation (operand layouts and tile shapes).
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
// NOTE(review): argument order follows cutlass::gemm::device::Gemm; the trailing 2 is
// presumably the pipeline stage count -- confirm against the vendored cutlass headers.
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
// Explicit instantiation definition: emits the wrapper's code for this Gemm type in
// this translation unit (the template is presumably defined in the .cuinl included above).
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
// Explicit instantiation of cutlass_matrix_mul_wrapper for one float SIMT GEMM
// configuration: LayoutA = ColumnMajor, LayoutB = RowMajor, RowMajor C,
// ThreadBlockShape = GemmShape<128, 64, 8>, WarpShape = GemmShape<64, 32, 8>.
// The guard compiles this file's contents only with CUDA compiler 9.2 or newer.
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matmul_kern_impls.py (the script the Makefile rule for fp32_simt/kimpl runs) | |||||
// suppress these GCC warnings while compiling the cutlass headers; restored by the pop below | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
// Kernel configuration for this instantiation (operand layouts and tile shapes).
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
// NOTE(review): argument order follows cutlass::gemm::device::Gemm; the trailing 2 is
// presumably the pipeline stage count -- confirm against the vendored cutlass headers.
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
// Explicit instantiation definition: emits the wrapper's code for this Gemm type in
// this translation unit (the template is presumably defined in the .cuinl included above).
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
// Explicit instantiation of cutlass_matrix_mul_wrapper for one float SIMT GEMM
// configuration: LayoutA = ColumnMajor, LayoutB = ColumnMajor, RowMajor C,
// ThreadBlockShape = GemmShape<128, 64, 8>, WarpShape = GemmShape<64, 32, 8>.
// The guard compiles this file's contents only with CUDA compiler 9.2 or newer.
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matmul_kern_impls.py (the script the Makefile rule for fp32_simt/kimpl runs) | |||||
// suppress these GCC warnings while compiling the cutlass headers; restored by the pop below | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
// Kernel configuration for this instantiation (operand layouts and tile shapes).
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
// NOTE(review): argument order follows cutlass::gemm::device::Gemm; the trailing 2 is
// presumably the pipeline stage count -- confirm against the vendored cutlass headers.
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
// Explicit instantiation definition: emits the wrapper's code for this Gemm type in
// this translation unit (the template is presumably defined in the .cuinl included above).
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
// Explicit instantiation of cutlass_matrix_mul_wrapper for one float SIMT GEMM
// configuration: LayoutA = RowMajor, LayoutB = RowMajor, RowMajor C,
// ThreadBlockShape = GemmShape<16, 128, 8>, WarpShape = GemmShape<16, 64, 8>.
// The guard compiles this file's contents only with CUDA compiler 9.2 or newer.
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matmul_kern_impls.py (the script the Makefile rule for fp32_simt/kimpl runs) | |||||
// suppress these GCC warnings while compiling the cutlass headers; restored by the pop below | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
// Kernel configuration for this instantiation (operand layouts and tile shapes).
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
// NOTE(review): argument order follows cutlass::gemm::device::Gemm; the trailing 2 is
// presumably the pipeline stage count -- confirm against the vendored cutlass headers.
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
// Explicit instantiation definition: emits the wrapper's code for this Gemm type in
// this translation unit (the template is presumably defined in the .cuinl included above).
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
// Explicit instantiation of cutlass_matrix_mul_wrapper for one float SIMT GEMM
// configuration: LayoutA = RowMajor, LayoutB = ColumnMajor, RowMajor C,
// ThreadBlockShape = GemmShape<16, 128, 8>, WarpShape = GemmShape<16, 64, 8>.
// The guard compiles this file's contents only with CUDA compiler 9.2 or newer.
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matmul_kern_impls.py (the script the Makefile rule for fp32_simt/kimpl runs) | |||||
// suppress these GCC warnings while compiling the cutlass headers; restored by the pop below | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
// Kernel configuration for this instantiation (operand layouts and tile shapes).
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
// NOTE(review): argument order follows cutlass::gemm::device::Gemm; the trailing 2 is
// presumably the pipeline stage count -- confirm against the vendored cutlass headers.
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
// Explicit instantiation definition: emits the wrapper's code for this Gemm type in
// this translation unit (the template is presumably defined in the .cuinl included above).
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
// Explicit instantiation of cutlass_matrix_mul_wrapper for one float SIMT GEMM
// configuration: LayoutA = ColumnMajor, LayoutB = RowMajor, RowMajor C,
// ThreadBlockShape = GemmShape<16, 128, 8>, WarpShape = GemmShape<16, 64, 8>.
// The guard compiles this file's contents only with CUDA compiler 9.2 or newer.
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matmul_kern_impls.py (the script the Makefile rule for fp32_simt/kimpl runs) | |||||
// suppress these GCC warnings while compiling the cutlass headers; restored by the pop below | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
// Kernel configuration for this instantiation (operand layouts and tile shapes).
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
// NOTE(review): argument order follows cutlass::gemm::device::Gemm; the trailing 2 is
// presumably the pipeline stage count -- confirm against the vendored cutlass headers.
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
// Explicit instantiation definition: emits the wrapper's code for this Gemm type in
// this translation unit (the template is presumably defined in the .cuinl included above).
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
// Explicit instantiation of cutlass_matrix_mul_wrapper for one float SIMT GEMM
// configuration: LayoutA = ColumnMajor, LayoutB = ColumnMajor, RowMajor C,
// ThreadBlockShape = GemmShape<16, 128, 8>, WarpShape = GemmShape<16, 64, 8>.
// The guard compiles this file's contents only with CUDA compiler 9.2 or newer.
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matmul_kern_impls.py (the script the Makefile rule for fp32_simt/kimpl runs) | |||||
// suppress these GCC warnings while compiling the cutlass headers; restored by the pop below | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
// Kernel configuration for this instantiation (operand layouts and tile shapes).
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
// NOTE(review): argument order follows cutlass::gemm::device::Gemm; the trailing 2 is
// presumably the pipeline stage count -- confirm against the vendored cutlass headers.
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
// Explicit instantiation definition: emits the wrapper's code for this Gemm type in
// this translation unit (the template is presumably defined in the .cuinl included above).
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
// Explicit instantiation of cutlass_matrix_mul_wrapper for one float SIMT GEMM
// configuration: LayoutA = RowMajor, LayoutB = RowMajor, RowMajor C,
// ThreadBlockShape = GemmShape<16, 32, 8>, WarpShape = GemmShape<16, 32, 8>.
// The guard compiles this file's contents only with CUDA compiler 9.2 or newer.
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matmul_kern_impls.py (the script the Makefile rule for fp32_simt/kimpl runs) | |||||
// suppress these GCC warnings while compiling the cutlass headers; restored by the pop below | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
// Kernel configuration for this instantiation (operand layouts and tile shapes).
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
// NOTE(review): argument order follows cutlass::gemm::device::Gemm; the trailing 2 is
// presumably the pipeline stage count -- confirm against the vendored cutlass headers.
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
// Explicit instantiation definition: emits the wrapper's code for this Gemm type in
// this translation unit (the template is presumably defined in the .cuinl included above).
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
// Explicit instantiation of cutlass_matrix_mul_wrapper for one float SIMT GEMM
// configuration: LayoutA = RowMajor, LayoutB = ColumnMajor, RowMajor C,
// ThreadBlockShape = GemmShape<16, 32, 8>, WarpShape = GemmShape<16, 32, 8>.
// The guard compiles this file's contents only with CUDA compiler 9.2 or newer.
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matmul_kern_impls.py (the script the Makefile rule for fp32_simt/kimpl runs) | |||||
// suppress these GCC warnings while compiling the cutlass headers; restored by the pop below | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
// Kernel configuration for this instantiation (operand layouts and tile shapes).
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
// NOTE(review): argument order follows cutlass::gemm::device::Gemm; the trailing 2 is
// presumably the pipeline stage count -- confirm against the vendored cutlass headers.
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
// Explicit instantiation definition: emits the wrapper's code for this Gemm type in
// this translation unit (the template is presumably defined in the .cuinl included above).
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
// Explicit instantiation of cutlass_matrix_mul_wrapper for one float SIMT GEMM
// configuration: LayoutA = ColumnMajor, LayoutB = RowMajor, RowMajor C,
// ThreadBlockShape = GemmShape<16, 32, 8>, WarpShape = GemmShape<16, 32, 8>.
// The guard compiles this file's contents only with CUDA compiler 9.2 or newer.
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matmul_kern_impls.py (the script the Makefile rule for fp32_simt/kimpl runs) | |||||
// suppress these GCC warnings while compiling the cutlass headers; restored by the pop below | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
// Kernel configuration for this instantiation (operand layouts and tile shapes).
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
// NOTE(review): argument order follows cutlass::gemm::device::Gemm; the trailing 2 is
// presumably the pipeline stage count -- confirm against the vendored cutlass headers.
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
// Explicit instantiation definition: emits the wrapper's code for this Gemm type in
// this translation unit (the template is presumably defined in the .cuinl included above).
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
// Explicit instantiation of cutlass_matrix_mul_wrapper for one float SIMT GEMM
// configuration: LayoutA = ColumnMajor, LayoutB = ColumnMajor, RowMajor C,
// ThreadBlockShape = GemmShape<16, 32, 8>, WarpShape = GemmShape<16, 32, 8>.
// The guard compiles this file's contents only with CUDA compiler 9.2 or newer.
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matmul_kern_impls.py (the script the Makefile rule for fp32_simt/kimpl runs) | |||||
// suppress these GCC warnings while compiling the cutlass headers; restored by the pop below | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
// Kernel configuration for this instantiation (operand layouts and tile shapes).
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
// NOTE(review): argument order follows cutlass::gemm::device::Gemm; the trailing 2 is
// presumably the pipeline stage count -- confirm against the vendored cutlass headers.
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
// Explicit instantiation definition: emits the wrapper's code for this Gemm type in
// this translation unit (the template is presumably defined in the .cuinl included above).
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
// Explicit instantiation of cutlass_matrix_mul_wrapper for one float SIMT GEMM
// configuration: LayoutA = RowMajor, LayoutB = RowMajor, RowMajor C,
// ThreadBlockShape = GemmShape<16, 64, 8>, WarpShape = GemmShape<16, 64, 8>.
// The guard compiles this file's contents only with CUDA compiler 9.2 or newer.
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matmul_kern_impls.py (the script the Makefile rule for fp32_simt/kimpl runs) | |||||
// suppress these GCC warnings while compiling the cutlass headers; restored by the pop below | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
// Kernel configuration for this instantiation (operand layouts and tile shapes).
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
// NOTE(review): argument order follows cutlass::gemm::device::Gemm; the trailing 2 is
// presumably the pipeline stage count -- confirm against the vendored cutlass headers.
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
// Explicit instantiation definition: emits the wrapper's code for this Gemm type in
// this translation unit (the template is presumably defined in the .cuinl included above).
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
// Explicit instantiation of cutlass_matrix_mul_wrapper for one float SIMT GEMM
// configuration: LayoutA = RowMajor, LayoutB = ColumnMajor, RowMajor C,
// ThreadBlockShape = GemmShape<16, 64, 8>, WarpShape = GemmShape<16, 64, 8>.
// The guard compiles this file's contents only with CUDA compiler 9.2 or newer.
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matmul_kern_impls.py (the script the Makefile rule for fp32_simt/kimpl runs) | |||||
// suppress these GCC warnings while compiling the cutlass headers; restored by the pop below | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
// Kernel configuration for this instantiation (operand layouts and tile shapes).
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
// NOTE(review): argument order follows cutlass::gemm::device::Gemm; the trailing 2 is
// presumably the pipeline stage count -- confirm against the vendored cutlass headers.
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
// Explicit instantiation definition: emits the wrapper's code for this Gemm type in
// this translation unit (the template is presumably defined in the .cuinl included above).
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
// Explicit instantiation of cutlass_matrix_mul_wrapper for one float SIMT GEMM
// configuration: LayoutA = ColumnMajor, LayoutB = RowMajor, RowMajor C,
// ThreadBlockShape = GemmShape<16, 64, 8>, WarpShape = GemmShape<16, 64, 8>.
// The guard compiles this file's contents only with CUDA compiler 9.2 or newer.
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matmul_kern_impls.py (the script the Makefile rule for fp32_simt/kimpl runs) | |||||
// suppress these GCC warnings while compiling the cutlass headers; restored by the pop below | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
// Kernel configuration for this instantiation (operand layouts and tile shapes).
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
// NOTE(review): argument order follows cutlass::gemm::device::Gemm; the trailing 2 is
// presumably the pipeline stage count -- confirm against the vendored cutlass headers.
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
// Explicit instantiation definition: emits the wrapper's code for this Gemm type in
// this translation unit (the template is presumably defined in the .cuinl included above).
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
// Explicit instantiation of cutlass_matrix_mul_wrapper for one float SIMT GEMM
// configuration: LayoutA = ColumnMajor, LayoutB = ColumnMajor, RowMajor C,
// ThreadBlockShape = GemmShape<16, 64, 8>, WarpShape = GemmShape<16, 64, 8>.
// The guard compiles this file's contents only with CUDA compiler 9.2 or newer.
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matmul_kern_impls.py (the script the Makefile rule for fp32_simt/kimpl runs) | |||||
// suppress these GCC warnings while compiling the cutlass headers; restored by the pop below | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
// Kernel configuration for this instantiation (operand layouts and tile shapes).
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
// NOTE(review): argument order follows cutlass::gemm::device::Gemm; the trailing 2 is
// presumably the pipeline stage count -- confirm against the vendored cutlass headers.
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
// Explicit instantiation definition: emits the wrapper's code for this Gemm type in
// this translation unit (the template is presumably defined in the .cuinl included above).
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
// Explicit instantiation of cutlass_matrix_mul_wrapper for one float SIMT GEMM
// configuration: LayoutA = RowMajor, LayoutB = RowMajor, RowMajor C,
// ThreadBlockShape = GemmShape<256, 32, 8>, WarpShape = GemmShape<64, 16, 8>.
// The guard compiles this file's contents only with CUDA compiler 9.2 or newer.
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matmul_kern_impls.py (the script the Makefile rule for fp32_simt/kimpl runs) | |||||
// suppress these GCC warnings while compiling the cutlass headers; restored by the pop below | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
// Kernel configuration for this instantiation (operand layouts and tile shapes).
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
// NOTE(review): argument order follows cutlass::gemm::device::Gemm; the trailing 2 is
// presumably the pipeline stage count -- confirm against the vendored cutlass headers.
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
// Explicit instantiation definition: emits the wrapper's code for this Gemm type in
// this translation unit (the template is presumably defined in the .cuinl included above).
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::Gemm< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
2>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// NOTE(review): the Makefile rule for the fp32_simt kimpl directory runs
// gen_cutlass_matmul_kern_impls.py; confirm which script name is current.
//
// Explicit instantiation of cutlass_matrix_mul_wrapper for one fp32 SIMT GEMM:
// A column-major, B column-major, threadblock tile 32x32x8, warp tile 32x32x8.

// Warnings raised by the cutlass headers are suppressed until the pop below.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using Gemm = cutlass::gemm::device::Gemm<
        float, cutlass::layout::ColumnMajor,  // A
        float, cutlass::layout::ColumnMajor,  // B
        float, cutlass::layout::RowMajor, float,
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        cutlass::gemm::GemmShape<32, 32, 8>,  // threadblock tile
        cutlass::gemm::GemmShape<32, 32, 8>,  // warp tile
        cutlass::gemm::GemmShape<1, 1, 1>,    // instruction shape
        cutlass::epilogue::thread::LinearCombination<float, 1, float, float>,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const typename Gemm::ElementA* d_A,
        size_t lda,
        const typename Gemm::ElementB* d_B,
        size_t ldb,
        typename Gemm::ElementC* d_C,
        size_t ldc,
        int* workspace,
        cutlass::gemm::GemmCoord const& problem_size,
        typename Gemm::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream);

#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// NOTE(review): the Makefile rule for the fp32_simt kimpl directory runs
// gen_cutlass_matmul_kern_impls.py; confirm which script name is current.
//
// Explicit instantiation of cutlass_matrix_mul_wrapper for one fp32 SIMT GEMM:
// A row-major, B row-major, threadblock tile 32x64x8, warp tile 32x64x8.

// Warnings raised by the cutlass headers are suppressed until the pop below.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using Gemm = cutlass::gemm::device::Gemm<
        float, cutlass::layout::RowMajor,     // A
        float, cutlass::layout::RowMajor,     // B
        float, cutlass::layout::RowMajor, float,
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        cutlass::gemm::GemmShape<32, 64, 8>,  // threadblock tile
        cutlass::gemm::GemmShape<32, 64, 8>,  // warp tile
        cutlass::gemm::GemmShape<1, 1, 1>,    // instruction shape
        cutlass::epilogue::thread::LinearCombination<float, 1, float, float>,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const typename Gemm::ElementA* d_A,
        size_t lda,
        const typename Gemm::ElementB* d_B,
        size_t ldb,
        typename Gemm::ElementC* d_C,
        size_t ldc,
        int* workspace,
        cutlass::gemm::GemmCoord const& problem_size,
        typename Gemm::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream);

#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// NOTE(review): the Makefile rule for the fp32_simt kimpl directory runs
// gen_cutlass_matmul_kern_impls.py; confirm which script name is current.
//
// Explicit instantiation of cutlass_matrix_mul_wrapper for one fp32 SIMT GEMM:
// A row-major, B column-major, threadblock tile 32x64x8, warp tile 32x64x8.

// Warnings raised by the cutlass headers are suppressed until the pop below.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using Gemm = cutlass::gemm::device::Gemm<
        float, cutlass::layout::RowMajor,     // A
        float, cutlass::layout::ColumnMajor,  // B
        float, cutlass::layout::RowMajor, float,
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        cutlass::gemm::GemmShape<32, 64, 8>,  // threadblock tile
        cutlass::gemm::GemmShape<32, 64, 8>,  // warp tile
        cutlass::gemm::GemmShape<1, 1, 1>,    // instruction shape
        cutlass::epilogue::thread::LinearCombination<float, 1, float, float>,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const typename Gemm::ElementA* d_A,
        size_t lda,
        const typename Gemm::ElementB* d_B,
        size_t ldb,
        typename Gemm::ElementC* d_C,
        size_t ldc,
        int* workspace,
        cutlass::gemm::GemmCoord const& problem_size,
        typename Gemm::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream);

#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// NOTE(review): the Makefile rule for the fp32_simt kimpl directory runs
// gen_cutlass_matmul_kern_impls.py; confirm which script name is current.
//
// Explicit instantiation of cutlass_matrix_mul_wrapper for one fp32 SIMT GEMM:
// A column-major, B row-major, threadblock tile 32x64x8, warp tile 32x64x8.

// Warnings raised by the cutlass headers are suppressed until the pop below.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using Gemm = cutlass::gemm::device::Gemm<
        float, cutlass::layout::ColumnMajor,  // A
        float, cutlass::layout::RowMajor,     // B
        float, cutlass::layout::RowMajor, float,
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        cutlass::gemm::GemmShape<32, 64, 8>,  // threadblock tile
        cutlass::gemm::GemmShape<32, 64, 8>,  // warp tile
        cutlass::gemm::GemmShape<1, 1, 1>,    // instruction shape
        cutlass::epilogue::thread::LinearCombination<float, 1, float, float>,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const typename Gemm::ElementA* d_A,
        size_t lda,
        const typename Gemm::ElementB* d_B,
        size_t ldb,
        typename Gemm::ElementC* d_C,
        size_t ldc,
        int* workspace,
        cutlass::gemm::GemmCoord const& problem_size,
        typename Gemm::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream);

#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// NOTE(review): the Makefile rule for the fp32_simt kimpl directory runs
// gen_cutlass_matmul_kern_impls.py; confirm which script name is current.
//
// Explicit instantiation of cutlass_matrix_mul_wrapper for one fp32 SIMT GEMM:
// A column-major, B column-major, threadblock tile 32x64x8, warp tile 32x64x8.

// Warnings raised by the cutlass headers are suppressed until the pop below.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using Gemm = cutlass::gemm::device::Gemm<
        float, cutlass::layout::ColumnMajor,  // A
        float, cutlass::layout::ColumnMajor,  // B
        float, cutlass::layout::RowMajor, float,
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        cutlass::gemm::GemmShape<32, 64, 8>,  // threadblock tile
        cutlass::gemm::GemmShape<32, 64, 8>,  // warp tile
        cutlass::gemm::GemmShape<1, 1, 1>,    // instruction shape
        cutlass::epilogue::thread::LinearCombination<float, 1, float, float>,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const typename Gemm::ElementA* d_A,
        size_t lda,
        const typename Gemm::ElementB* d_B,
        size_t ldb,
        typename Gemm::ElementC* d_C,
        size_t ldc,
        int* workspace,
        cutlass::gemm::GemmCoord const& problem_size,
        typename Gemm::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream);

#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// NOTE(review): the Makefile rule for the fp32_simt kimpl directory runs
// gen_cutlass_matmul_kern_impls.py; confirm which script name is current.
//
// Explicit instantiation of cutlass_matrix_mul_wrapper for one fp32 SIMT GEMM:
// A row-major, B row-major, threadblock tile 64x128x8, warp tile 32x64x8.

// Warnings raised by the cutlass headers are suppressed until the pop below.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using Gemm = cutlass::gemm::device::Gemm<
        float, cutlass::layout::RowMajor,      // A
        float, cutlass::layout::RowMajor,      // B
        float, cutlass::layout::RowMajor, float,
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        cutlass::gemm::GemmShape<64, 128, 8>,  // threadblock tile
        cutlass::gemm::GemmShape<32, 64, 8>,   // warp tile
        cutlass::gemm::GemmShape<1, 1, 1>,     // instruction shape
        cutlass::epilogue::thread::LinearCombination<float, 1, float, float>,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const typename Gemm::ElementA* d_A,
        size_t lda,
        const typename Gemm::ElementB* d_B,
        size_t ldb,
        typename Gemm::ElementC* d_C,
        size_t ldc,
        int* workspace,
        cutlass::gemm::GemmCoord const& problem_size,
        typename Gemm::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream);

#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// NOTE(review): the Makefile rule for the fp32_simt kimpl directory runs
// gen_cutlass_matmul_kern_impls.py; confirm which script name is current.
//
// Explicit instantiation of cutlass_matrix_mul_wrapper for one fp32 SIMT GEMM:
// A row-major, B column-major, threadblock tile 64x128x8, warp tile 32x64x8.

// Warnings raised by the cutlass headers are suppressed until the pop below.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using Gemm = cutlass::gemm::device::Gemm<
        float, cutlass::layout::RowMajor,      // A
        float, cutlass::layout::ColumnMajor,   // B
        float, cutlass::layout::RowMajor, float,
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        cutlass::gemm::GemmShape<64, 128, 8>,  // threadblock tile
        cutlass::gemm::GemmShape<32, 64, 8>,   // warp tile
        cutlass::gemm::GemmShape<1, 1, 1>,     // instruction shape
        cutlass::epilogue::thread::LinearCombination<float, 1, float, float>,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const typename Gemm::ElementA* d_A,
        size_t lda,
        const typename Gemm::ElementB* d_B,
        size_t ldb,
        typename Gemm::ElementC* d_C,
        size_t ldc,
        int* workspace,
        cutlass::gemm::GemmCoord const& problem_size,
        typename Gemm::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream);

#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// NOTE(review): the Makefile rule for the fp32_simt kimpl directory runs
// gen_cutlass_matmul_kern_impls.py; confirm which script name is current.
//
// Explicit instantiation of cutlass_matrix_mul_wrapper for one fp32 SIMT GEMM:
// A column-major, B row-major, threadblock tile 64x128x8, warp tile 32x64x8.

// Warnings raised by the cutlass headers are suppressed until the pop below.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using Gemm = cutlass::gemm::device::Gemm<
        float, cutlass::layout::ColumnMajor,   // A
        float, cutlass::layout::RowMajor,      // B
        float, cutlass::layout::RowMajor, float,
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        cutlass::gemm::GemmShape<64, 128, 8>,  // threadblock tile
        cutlass::gemm::GemmShape<32, 64, 8>,   // warp tile
        cutlass::gemm::GemmShape<1, 1, 1>,     // instruction shape
        cutlass::epilogue::thread::LinearCombination<float, 1, float, float>,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const typename Gemm::ElementA* d_A,
        size_t lda,
        const typename Gemm::ElementB* d_B,
        size_t ldb,
        typename Gemm::ElementC* d_C,
        size_t ldc,
        int* workspace,
        cutlass::gemm::GemmCoord const& problem_size,
        typename Gemm::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream);

#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// NOTE(review): the Makefile rule for the fp32_simt kimpl directory runs
// gen_cutlass_matmul_kern_impls.py; confirm which script name is current.
//
// Explicit instantiation of cutlass_matrix_mul_wrapper for one fp32 SIMT GEMM:
// A column-major, B column-major, threadblock tile 64x128x8, warp tile 32x64x8.

// Warnings raised by the cutlass headers are suppressed until the pop below.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using Gemm = cutlass::gemm::device::Gemm<
        float, cutlass::layout::ColumnMajor,   // A
        float, cutlass::layout::ColumnMajor,   // B
        float, cutlass::layout::RowMajor, float,
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        cutlass::gemm::GemmShape<64, 128, 8>,  // threadblock tile
        cutlass::gemm::GemmShape<32, 64, 8>,   // warp tile
        cutlass::gemm::GemmShape<1, 1, 1>,     // instruction shape
        cutlass::epilogue::thread::LinearCombination<float, 1, float, float>,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const typename Gemm::ElementA* d_A,
        size_t lda,
        const typename Gemm::ElementB* d_B,
        size_t ldb,
        typename Gemm::ElementC* d_C,
        size_t ldc,
        int* workspace,
        cutlass::gemm::GemmCoord const& problem_size,
        typename Gemm::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream);

#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// NOTE(review): the Makefile rule for the fp32_simt kimpl directory runs
// gen_cutlass_matmul_kern_impls.py; confirm which script name is current.
//
// Explicit instantiation of cutlass_matrix_mul_wrapper for one fp32 SIMT GEMM:
// A row-major, B row-major, threadblock tile 64x256x8, warp tile 32x64x8.

// Warnings raised by the cutlass headers are suppressed until the pop below.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using Gemm = cutlass::gemm::device::Gemm<
        float, cutlass::layout::RowMajor,      // A
        float, cutlass::layout::RowMajor,      // B
        float, cutlass::layout::RowMajor, float,
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        cutlass::gemm::GemmShape<64, 256, 8>,  // threadblock tile
        cutlass::gemm::GemmShape<32, 64, 8>,   // warp tile
        cutlass::gemm::GemmShape<1, 1, 1>,     // instruction shape
        cutlass::epilogue::thread::LinearCombination<float, 1, float, float>,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const typename Gemm::ElementA* d_A,
        size_t lda,
        const typename Gemm::ElementB* d_B,
        size_t ldb,
        typename Gemm::ElementC* d_C,
        size_t ldc,
        int* workspace,
        cutlass::gemm::GemmCoord const& problem_size,
        typename Gemm::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream);

#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// NOTE(review): the Makefile rule for the fp32_simt kimpl directory runs
// gen_cutlass_matmul_kern_impls.py; confirm which script name is current.
//
// Explicit instantiation of cutlass_matrix_mul_wrapper for one fp32 SIMT GEMM:
// A row-major, B column-major, threadblock tile 64x256x8, warp tile 32x64x8.

// Warnings raised by the cutlass headers are suppressed until the pop below.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using Gemm = cutlass::gemm::device::Gemm<
        float, cutlass::layout::RowMajor,      // A
        float, cutlass::layout::ColumnMajor,   // B
        float, cutlass::layout::RowMajor, float,
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        cutlass::gemm::GemmShape<64, 256, 8>,  // threadblock tile
        cutlass::gemm::GemmShape<32, 64, 8>,   // warp tile
        cutlass::gemm::GemmShape<1, 1, 1>,     // instruction shape
        cutlass::epilogue::thread::LinearCombination<float, 1, float, float>,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const typename Gemm::ElementA* d_A,
        size_t lda,
        const typename Gemm::ElementB* d_B,
        size_t ldb,
        typename Gemm::ElementC* d_C,
        size_t ldc,
        int* workspace,
        cutlass::gemm::GemmCoord const& problem_size,
        typename Gemm::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream);

#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// NOTE(review): the Makefile rule for the fp32_simt kimpl directory runs
// gen_cutlass_matmul_kern_impls.py; confirm which script name is current.
//
// Explicit instantiation of cutlass_matrix_mul_wrapper for one fp32 SIMT GEMM:
// A column-major, B row-major, threadblock tile 64x256x8, warp tile 32x64x8.

// Warnings raised by the cutlass headers are suppressed until the pop below.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using Gemm = cutlass::gemm::device::Gemm<
        float, cutlass::layout::ColumnMajor,   // A
        float, cutlass::layout::RowMajor,      // B
        float, cutlass::layout::RowMajor, float,
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        cutlass::gemm::GemmShape<64, 256, 8>,  // threadblock tile
        cutlass::gemm::GemmShape<32, 64, 8>,   // warp tile
        cutlass::gemm::GemmShape<1, 1, 1>,     // instruction shape
        cutlass::epilogue::thread::LinearCombination<float, 1, float, float>,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const typename Gemm::ElementA* d_A,
        size_t lda,
        const typename Gemm::ElementB* d_B,
        size_t ldb,
        typename Gemm::ElementC* d_C,
        size_t ldc,
        int* workspace,
        cutlass::gemm::GemmCoord const& problem_size,
        typename Gemm::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream);

#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// NOTE(review): the Makefile rule for the fp32_simt kimpl directory runs
// gen_cutlass_matmul_kern_impls.py; confirm which script name is current.
//
// Explicit instantiation of cutlass_matrix_mul_wrapper for one fp32 SIMT GEMM:
// A column-major, B column-major, threadblock tile 64x256x8, warp tile 32x64x8.

// Warnings raised by the cutlass headers are suppressed until the pop below.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using Gemm = cutlass::gemm::device::Gemm<
        float, cutlass::layout::ColumnMajor,   // A
        float, cutlass::layout::ColumnMajor,   // B
        float, cutlass::layout::RowMajor, float,
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        cutlass::gemm::GemmShape<64, 256, 8>,  // threadblock tile
        cutlass::gemm::GemmShape<32, 64, 8>,   // warp tile
        cutlass::gemm::GemmShape<1, 1, 1>,     // instruction shape
        cutlass::epilogue::thread::LinearCombination<float, 1, float, float>,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const typename Gemm::ElementA* d_A,
        size_t lda,
        const typename Gemm::ElementB* d_B,
        size_t ldb,
        typename Gemm::ElementC* d_C,
        size_t ldc,
        int* workspace,
        cutlass::gemm::GemmCoord const& problem_size,
        typename Gemm::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream);

#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// NOTE(review): the Makefile rule for the fp32_simt kimpl directory runs
// gen_cutlass_matmul_kern_impls.py; confirm which script name is current.
//
// Explicit instantiation of cutlass_matrix_mul_wrapper for one fp32 SIMT GEMM:
// A row-major, B row-major, threadblock tile 64x32x8, warp tile 64x32x8.

// Warnings raised by the cutlass headers are suppressed until the pop below.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using Gemm = cutlass::gemm::device::Gemm<
        float, cutlass::layout::RowMajor,     // A
        float, cutlass::layout::RowMajor,     // B
        float, cutlass::layout::RowMajor, float,
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        cutlass::gemm::GemmShape<64, 32, 8>,  // threadblock tile
        cutlass::gemm::GemmShape<64, 32, 8>,  // warp tile
        cutlass::gemm::GemmShape<1, 1, 1>,    // instruction shape
        cutlass::epilogue::thread::LinearCombination<float, 1, float, float>,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const typename Gemm::ElementA* d_A,
        size_t lda,
        const typename Gemm::ElementB* d_B,
        size_t ldb,
        typename Gemm::ElementC* d_C,
        size_t ldc,
        int* workspace,
        cutlass::gemm::GemmCoord const& problem_size,
        typename Gemm::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream);

#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// NOTE(review): the Makefile rule for the fp32_simt kimpl directory runs
// gen_cutlass_matmul_kern_impls.py; confirm which script name is current.
//
// Explicit instantiation of cutlass_matrix_mul_wrapper for one fp32 SIMT GEMM:
// A row-major, B column-major, threadblock tile 64x32x8, warp tile 64x32x8.

// Warnings raised by the cutlass headers are suppressed until the pop below.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using Gemm = cutlass::gemm::device::Gemm<
        float, cutlass::layout::RowMajor,     // A
        float, cutlass::layout::ColumnMajor,  // B
        float, cutlass::layout::RowMajor, float,
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        cutlass::gemm::GemmShape<64, 32, 8>,  // threadblock tile
        cutlass::gemm::GemmShape<64, 32, 8>,  // warp tile
        cutlass::gemm::GemmShape<1, 1, 1>,    // instruction shape
        cutlass::epilogue::thread::LinearCombination<float, 1, float, float>,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const typename Gemm::ElementA* d_A,
        size_t lda,
        const typename Gemm::ElementB* d_B,
        size_t ldb,
        typename Gemm::ElementC* d_C,
        size_t ldc,
        int* workspace,
        cutlass::gemm::GemmCoord const& problem_size,
        typename Gemm::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream);

#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// NOTE(review): the Makefile rule for the fp32_simt kimpl directory runs
// gen_cutlass_matmul_kern_impls.py; confirm which script name is current.
//
// Explicit instantiation of cutlass_matrix_mul_wrapper for one fp32 SIMT GEMM:
// A column-major, B row-major, threadblock tile 64x32x8, warp tile 64x32x8.

// Warnings raised by the cutlass headers are suppressed until the pop below.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using Gemm = cutlass::gemm::device::Gemm<
        float, cutlass::layout::ColumnMajor,  // A
        float, cutlass::layout::RowMajor,     // B
        float, cutlass::layout::RowMajor, float,
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        cutlass::gemm::GemmShape<64, 32, 8>,  // threadblock tile
        cutlass::gemm::GemmShape<64, 32, 8>,  // warp tile
        cutlass::gemm::GemmShape<1, 1, 1>,    // instruction shape
        cutlass::epilogue::thread::LinearCombination<float, 1, float, float>,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const typename Gemm::ElementA* d_A,
        size_t lda,
        const typename Gemm::ElementB* d_B,
        size_t ldb,
        typename Gemm::ElementC* d_C,
        size_t ldc,
        int* workspace,
        cutlass::gemm::GemmCoord const& problem_size,
        typename Gemm::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream);

#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// NOTE(review): the Makefile rule for the fp32_simt kimpl directory runs
// gen_cutlass_matmul_kern_impls.py; confirm which script name is current.
//
// Explicit instantiation of cutlass_matrix_mul_wrapper for one fp32 SIMT GEMM:
// A column-major, B column-major, threadblock tile 64x32x8, warp tile 64x32x8.

// Warnings raised by the cutlass headers are suppressed until the pop below.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using Gemm = cutlass::gemm::device::Gemm<
        float, cutlass::layout::ColumnMajor,  // A
        float, cutlass::layout::ColumnMajor,  // B
        float, cutlass::layout::RowMajor, float,
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        cutlass::gemm::GemmShape<64, 32, 8>,  // threadblock tile
        cutlass::gemm::GemmShape<64, 32, 8>,  // warp tile
        cutlass::gemm::GemmShape<1, 1, 1>,    // instruction shape
        cutlass::epilogue::thread::LinearCombination<float, 1, float, float>,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const typename Gemm::ElementA* d_A,
        size_t lda,
        const typename Gemm::ElementB* d_B,
        size_t ldb,
        typename Gemm::ElementC* d_C,
        size_t ldc,
        int* workspace,
        cutlass::gemm::GemmCoord const& problem_size,
        typename Gemm::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream);

#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// NOTE(review): the Makefile rule for the fp32_simt kimpl directory runs
// gen_cutlass_matmul_kern_impls.py; confirm which script name is current.
//
// Explicit instantiation of cutlass_matrix_mul_wrapper for one fp32 SIMT GEMM:
// A row-major, B row-major, threadblock tile 64x64x8, warp tile 32x64x8.

// Warnings raised by the cutlass headers are suppressed until the pop below.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using Gemm = cutlass::gemm::device::Gemm<
        float, cutlass::layout::RowMajor,     // A
        float, cutlass::layout::RowMajor,     // B
        float, cutlass::layout::RowMajor, float,
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        cutlass::gemm::GemmShape<64, 64, 8>,  // threadblock tile
        cutlass::gemm::GemmShape<32, 64, 8>,  // warp tile
        cutlass::gemm::GemmShape<1, 1, 1>,    // instruction shape
        cutlass::epilogue::thread::LinearCombination<float, 1, float, float>,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const typename Gemm::ElementA* d_A,
        size_t lda,
        const typename Gemm::ElementB* d_B,
        size_t ldb,
        typename Gemm::ElementC* d_C,
        size_t ldc,
        int* workspace,
        cutlass::gemm::GemmCoord const& problem_size,
        typename Gemm::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream);

#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// NOTE(review): the Makefile rule for the fp32_simt kimpl directory runs
// gen_cutlass_matmul_kern_impls.py; confirm which script name is current.
//
// Explicit instantiation of cutlass_matrix_mul_wrapper for one fp32 SIMT GEMM:
// A row-major, B column-major, threadblock tile 64x64x8, warp tile 32x64x8.

// Warnings raised by the cutlass headers are suppressed until the pop below.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using Gemm = cutlass::gemm::device::Gemm<
        float, cutlass::layout::RowMajor,     // A
        float, cutlass::layout::ColumnMajor,  // B
        float, cutlass::layout::RowMajor, float,
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        cutlass::gemm::GemmShape<64, 64, 8>,  // threadblock tile
        cutlass::gemm::GemmShape<32, 64, 8>,  // warp tile
        cutlass::gemm::GemmShape<1, 1, 1>,    // instruction shape
        cutlass::epilogue::thread::LinearCombination<float, 1, float, float>,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const typename Gemm::ElementA* d_A,
        size_t lda,
        const typename Gemm::ElementB* d_B,
        size_t ldb,
        typename Gemm::ElementC* d_C,
        size_t ldc,
        int* workspace,
        cutlass::gemm::GemmCoord const& problem_size,
        typename Gemm::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream);

#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// Machine-generated explicit instantiation of the CUTLASS fp32 SIMT matmul
// wrapper; regenerate it instead of editing by hand.
// NOTE(review): the scripts Makefile builds fp32_simt/kimpl with
// gen_cutlass_matmul_kern_impls.py -- confirm that is the generator.
// The guard above limits this file to nvcc 9.2 or newer.
// CUTLASS headers trigger the GCC warnings silenced below.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
// Configuration: A column-major, B row-major, threadblock shape 64x64x8,
// warp shape 32x64x8.
using Gemm = cutlass::gemm::device::Gemm<
        float, cutlass::layout::ColumnMajor,  // A
        float, cutlass::layout::RowMajor,     // B
        float, cutlass::layout::RowMajor, float,
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        cutlass::gemm::GemmShape<64, 64, 8>,  // threadblock shape
        cutlass::gemm::GemmShape<32, 64, 8>,  // warp shape
        cutlass::gemm::GemmShape<1, 1, 1>,    // instruction shape
        // epilogue op
        cutlass::epilogue::thread::LinearCombination<float, 1, float, float>,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
        2>;
// Gemm is a concrete type here, so no `typename` disambiguator is required.
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const Gemm::ElementA* d_A, size_t lda,
        const Gemm::ElementB* d_B, size_t ldb,
        Gemm::ElementC* d_C, size_t ldc,
        int* workspace,
        cutlass::gemm::GemmCoord const& problem_size,
        Gemm::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream);
#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// Machine-generated explicit instantiation of the CUTLASS fp32 SIMT matmul
// wrapper; regenerate it instead of editing by hand.
// NOTE(review): the scripts Makefile builds fp32_simt/kimpl with
// gen_cutlass_matmul_kern_impls.py -- confirm that is the generator.
// The guard above limits this file to nvcc 9.2 or newer.
// CUTLASS headers trigger the GCC warnings silenced below.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
// Configuration: A column-major, B column-major, threadblock shape 64x64x8,
// warp shape 32x64x8.
using Gemm = cutlass::gemm::device::Gemm<
        float, cutlass::layout::ColumnMajor,  // A
        float, cutlass::layout::ColumnMajor,  // B
        float, cutlass::layout::RowMajor, float,
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        cutlass::gemm::GemmShape<64, 64, 8>,  // threadblock shape
        cutlass::gemm::GemmShape<32, 64, 8>,  // warp shape
        cutlass::gemm::GemmShape<1, 1, 1>,    // instruction shape
        // epilogue op
        cutlass::epilogue::thread::LinearCombination<float, 1, float, float>,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
        2>;
// Gemm is a concrete type here, so no `typename` disambiguator is required.
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const Gemm::ElementA* d_A, size_t lda,
        const Gemm::ElementB* d_B, size_t ldb,
        Gemm::ElementC* d_C, size_t ldc,
        int* workspace,
        cutlass::gemm::GemmCoord const& problem_size,
        Gemm::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream);
#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// Machine-generated explicit instantiation of the CUTLASS fp32 SIMT matmul
// wrapper; regenerate it instead of editing by hand.
// NOTE(review): the scripts Makefile builds fp32_simt/kimpl with
// gen_cutlass_matmul_kern_impls.py -- confirm that is the generator.
// The guard above limits this file to nvcc 9.2 or newer.
// CUTLASS headers trigger the GCC warnings silenced below.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
// Configuration: A row-major, B row-major, threadblock shape 8x32x8,
// warp shape 8x32x8.
using Gemm = cutlass::gemm::device::Gemm<
        float, cutlass::layout::RowMajor,   // A
        float, cutlass::layout::RowMajor,   // B
        float, cutlass::layout::RowMajor, float,
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        cutlass::gemm::GemmShape<8, 32, 8>,  // threadblock shape
        cutlass::gemm::GemmShape<8, 32, 8>,  // warp shape
        cutlass::gemm::GemmShape<1, 1, 1>,   // instruction shape
        // epilogue op
        cutlass::epilogue::thread::LinearCombination<float, 1, float, float>,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
        2>;
// Gemm is a concrete type here, so no `typename` disambiguator is required.
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const Gemm::ElementA* d_A, size_t lda,
        const Gemm::ElementB* d_B, size_t ldb,
        Gemm::ElementC* d_C, size_t ldc,
        int* workspace,
        cutlass::gemm::GemmCoord const& problem_size,
        Gemm::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream);
#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// Machine-generated explicit instantiation of the CUTLASS fp32 SIMT matmul
// wrapper; regenerate it instead of editing by hand.
// NOTE(review): the scripts Makefile builds fp32_simt/kimpl with
// gen_cutlass_matmul_kern_impls.py -- confirm that is the generator.
// The guard above limits this file to nvcc 9.2 or newer.
// CUTLASS headers trigger the GCC warnings silenced below.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
// Configuration: A row-major, B column-major, threadblock shape 8x32x8,
// warp shape 8x32x8.
using Gemm = cutlass::gemm::device::Gemm<
        float, cutlass::layout::RowMajor,     // A
        float, cutlass::layout::ColumnMajor,  // B
        float, cutlass::layout::RowMajor, float,
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        cutlass::gemm::GemmShape<8, 32, 8>,  // threadblock shape
        cutlass::gemm::GemmShape<8, 32, 8>,  // warp shape
        cutlass::gemm::GemmShape<1, 1, 1>,   // instruction shape
        // epilogue op
        cutlass::epilogue::thread::LinearCombination<float, 1, float, float>,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
        2>;
// Gemm is a concrete type here, so no `typename` disambiguator is required.
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const Gemm::ElementA* d_A, size_t lda,
        const Gemm::ElementB* d_B, size_t ldb,
        Gemm::ElementC* d_C, size_t ldc,
        int* workspace,
        cutlass::gemm::GemmCoord const& problem_size,
        Gemm::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream);
#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// Machine-generated explicit instantiation of the CUTLASS fp32 SIMT matmul
// wrapper; regenerate it instead of editing by hand.
// NOTE(review): the scripts Makefile builds fp32_simt/kimpl with
// gen_cutlass_matmul_kern_impls.py -- confirm that is the generator.
// The guard above limits this file to nvcc 9.2 or newer.
// CUTLASS headers trigger the GCC warnings silenced below.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
// Configuration: A column-major, B row-major, threadblock shape 8x32x8,
// warp shape 8x32x8.
using Gemm = cutlass::gemm::device::Gemm<
        float, cutlass::layout::ColumnMajor,  // A
        float, cutlass::layout::RowMajor,     // B
        float, cutlass::layout::RowMajor, float,
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        cutlass::gemm::GemmShape<8, 32, 8>,  // threadblock shape
        cutlass::gemm::GemmShape<8, 32, 8>,  // warp shape
        cutlass::gemm::GemmShape<1, 1, 1>,   // instruction shape
        // epilogue op
        cutlass::epilogue::thread::LinearCombination<float, 1, float, float>,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
        2>;
// Gemm is a concrete type here, so no `typename` disambiguator is required.
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const Gemm::ElementA* d_A, size_t lda,
        const Gemm::ElementB* d_B, size_t ldb,
        Gemm::ElementC* d_C, size_t ldc,
        int* workspace,
        cutlass::gemm::GemmCoord const& problem_size,
        Gemm::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream);
#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// Machine-generated explicit instantiation of the CUTLASS fp32 SIMT matmul
// wrapper; regenerate it instead of editing by hand.
// NOTE(review): the scripts Makefile builds fp32_simt/kimpl with
// gen_cutlass_matmul_kern_impls.py -- confirm that is the generator.
// The guard above limits this file to nvcc 9.2 or newer.
// CUTLASS headers trigger the GCC warnings silenced below.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
// Configuration: A column-major, B column-major, threadblock shape 8x32x8,
// warp shape 8x32x8.
using Gemm = cutlass::gemm::device::Gemm<
        float, cutlass::layout::ColumnMajor,  // A
        float, cutlass::layout::ColumnMajor,  // B
        float, cutlass::layout::RowMajor, float,
        cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        cutlass::gemm::GemmShape<8, 32, 8>,  // threadblock shape
        cutlass::gemm::GemmShape<8, 32, 8>,  // warp shape
        cutlass::gemm::GemmShape<1, 1, 1>,   // instruction shape
        // epilogue op
        cutlass::epilogue::thread::LinearCombination<float, 1, float, float>,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
        2>;
// Gemm is a concrete type here, so no `typename` disambiguator is required.
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const Gemm::ElementA* d_A, size_t lda,
        const Gemm::ElementB* d_B, size_t ldb,
        Gemm::ElementC* d_C, size_t ldc,
        int* workspace,
        cutlass::gemm::GemmCoord const& problem_size,
        Gemm::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream);
#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,52 @@ | |||||
/** | |||||
 * \file
 * dnn/src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
* | |||||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
* | |||||
* Unless required by applicable law or agreed to in writing, | |||||
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. | |||||
*/ | |||||
#include "cutlass/gemm/device/gemm.h" | |||||
#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh" | |||||
using namespace megdnn; | |||||
using namespace cuda; | |||||
using namespace cutlass_wrapper; | |||||
template <typename Gemm> | |||||
void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, int* workspace, | |||||
GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream) { | |||||
typename Gemm::TensorRefA tensor_a{ | |||||
const_cast<typename Gemm::ElementA*>(d_A), | |||||
typename Gemm::LayoutA{static_cast<int>(lda)}}; | |||||
typename Gemm::TensorRefB tensor_b{ | |||||
const_cast<typename Gemm::ElementB*>(d_B), | |||||
typename Gemm::LayoutB{static_cast<int>(ldb)}}; | |||||
typename Gemm::TensorRefC tensor_c{ | |||||
nullptr, typename Gemm::LayoutC{static_cast<int>(ldc)}}; | |||||
typename Gemm::TensorRefD tensor_d{ | |||||
d_C, typename Gemm::LayoutC{static_cast<int>(ldc)}}; | |||||
typename Gemm::Arguments arguments{problem_size, | |||||
tensor_a, | |||||
tensor_b, | |||||
tensor_c, | |||||
tensor_d.non_const_ref(), | |||||
epilogue, | |||||
1}; | |||||
Gemm gemm_op; | |||||
cutlass_check(gemm_op.initialize(arguments, workspace)); | |||||
cutlass_check(gemm_op(stream)); | |||||
after_kernel_launch(); | |||||
} | |||||
// vim: syntax=cuda.doxygen |
@@ -41,6 +41,7 @@ public: | |||||
#if !MEGDNN_DISABLE_FLOAT16 | #if !MEGDNN_DISABLE_FLOAT16 | ||||
class AlgoBFloat16; | class AlgoBFloat16; | ||||
#endif | #endif | ||||
class AlgoFloat32SIMT; | |||||
class AlgoPack; | class AlgoPack; | ||||
static const AlgoPack& algo_pack() { | static const AlgoPack& algo_pack() { | ||||
@@ -0,0 +1,337 @@ | |||||
/** | |||||
* \file dnn/test/cuda/cutlass_matmul.cpp | |||||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
* | |||||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
* | |||||
* Unless required by applicable law or agreed to in writing, | |||||
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. | |||||
*/ | |||||
#include <cuda.h>

#include <cstdio>
#include <cstring>
#include <functional>
#include <memory>
#include <regex>
#include <string>
#include <utility>
#include <vector>

#include "megdnn/oprs/linalg.h"
#include "src/common/utils.h"
#include "test/common/checker.h"
#include "test/common/matrix_mul.h"
#include "test/common/tensor.h"
#include "test/common/workspace_wrapper.h"
#include "test/cuda/benchmark.h"
#include "test/cuda/fixture.h"
#include "test/cuda/utils.h"
#if CUDA_VERSION >= 9020 | |||||
namespace megdnn { | |||||
namespace test { | |||||
namespace { | |||||
/*!
 * \brief Check that a matmul algorithm produces the same rows when A is
 *      replicated along M.
 *
 * For every case a reference product C = A * B is computed on the device.
 * A is then repeated 8 times along M (A_prime) and the product for A_prime is
 * compared against 8 stacked copies of C, first through the Checker (epsilon
 * 1e-6) and then by running the operator directly and calling check_with
 * with a comparator that demands exact equality.
 *
 * \param handle_cuda handle used to create operators and tensors
 * \param A_dtype,B_dtype,C_dtype tensor dtypes; only Float32 is accepted
 *      (asserted)
 * \param algo algorithm name; an algorithm whose name starts with it is
 *      forced on the reference operator. May be null, in which case no
 *      algorithm is forced and the execution policy is left unset.
 * \param args test cases; every case must have mask == 0 (asserted). n and k
 *      of each case are multiplied by 8 before use.
 * \param format must be Format::DEFAULT (asserted)
 * \param filter optional predicate; cases for which it returns true are
 *      skipped
 */
void test_multibatchsize(
        Handle* handle_cuda, DType A_dtype, DType B_dtype, DType C_dtype,
        const char* algo, const std::vector<matrix_mul::TestArg>& args,
        param::MatrixMul::Format format = param::MatrixMul::Format::DEFAULT,
        const std::function<bool(const matrix_mul::TestArg&)>& filter = {}) {
    Checker<MatrixMulForward> checker(handle_cuda, false);
    if (algo) {
        checker.set_before_exec_callback(AlgoChecker<MatrixMulForward>(algo));
    }
    std::unique_ptr<RNG> rng;
    if (A_dtype.enumv() == DTypeEnum::Float32) {
        rng = std::make_unique<UniformFloatRNG>(-1, 1);
        megdnn_assert(B_dtype.enumv() == DTypeEnum::Float32 &&
                      C_dtype.enumv() == DTypeEnum::Float32);
    }
    megdnn_assert(rng != nullptr);
    // Comparator handed to SyncedTensor: results must match bit for bit.
    struct Compare {
        bool is_same(dt_float32 expected, dt_float32 actual) const {
            return expected == actual;
        }
    };
    using Tensor = SyncedTensor<dt_float32, Compare>;
    // Fill lhs with back-to-back copies of rhs; the byte size of lhs must be
    // a multiple of the byte size of rhs (8x for the tensors used below).
    auto copy = [](Tensor& lhs, Tensor& rhs) {
        size_t chunk = rhs.layout().span().dist_byte();
        size_t tot = lhs.layout().span().dist_byte();
        megdnn_assert(tot % chunk == 0);
        char* pointer_lhs = reinterpret_cast<char*>(lhs.ptr_mutable_host());
        const char* pointer_rhs = reinterpret_cast<const char*>(rhs.ptr_host());
        for (size_t i = 0; i < tot; i += chunk) {
            std::memcpy(pointer_lhs + i, pointer_rhs, chunk);
        }
    };
    // Built once instead of once per candidate algorithm. The pattern is only
    // used when an algorithm name was supplied; constructing std::string from
    // a null pointer would be undefined behaviour.
    const std::regex algo_pattern(
            algo ? "(" + std::string(algo) + ")(.*)" : std::string());
    // Select the requested algorithm on `opr` (if any) and run a x b -> c on
    // the device with a freshly sized workspace.
    auto exec_with_algo = [&](auto& opr, Tensor& a, Tensor& b, Tensor& c) {
        opr.execution_policy().algo.reset();
        if (algo) {
            for (auto i : opr.get_all_algorithms_info(a.layout(), b.layout(),
                                                      c.layout())) {
                if (std::regex_match(i.name.c_str(), algo_pattern)) {
                    opr.execution_policy().algo = i;
                    break;
                }
            }
            megdnn_assert(opr.execution_policy().algo.valid());
        }
        size_t ws_size = opr.get_workspace_in_bytes(a.layout(), b.layout(),
                                                    c.layout());
        WorkspaceWrapper ws_reference(handle_cuda, ws_size);
        opr.exec(a.tensornd_dev(), b.tensornd_dev(), c.tensornd_dev(),
                 ws_reference.workspace());
    };
    using Param = param::MatrixMul;
    megdnn_assert(format == Param::Format::DEFAULT);
    for (auto&& arg : args) {
        megdnn_assert(arg.mask == 0x0);
        if (filter && filter(arg))
            continue;
        // make m, n, k big enough
        size_t m = arg.m, n = (arg.n << 3), k = (arg.k << 3);
        size_t m_prime = (m << 3);
        TensorShape A{m, k}, B{k, n}, C{m, n};
        TensorShape A_prime{m_prime, k}, C_prime{m_prime, n};
        Tensor A_tensor{handle_cuda, {A, A_dtype}},
                B_tensor{handle_cuda, {B, B_dtype}},
                C_tensor{handle_cuda, {C, C_dtype}},
                A_tensor_prime{handle_cuda, {A_prime, A_dtype}},
                C_tensor_prime{handle_cuda, {C_prime, C_dtype}},
                C_tensor_batch{handle_cuda, {C_prime, C_dtype}};
        rng->gen(A_tensor.tensornd_host());
        rng->gen(B_tensor.tensornd_host());
        copy(A_tensor_prime, A_tensor);
        auto opr_reference = handle_cuda->create_operator<MatrixMulForward>();
        // Reference result for the small problem.
        exec_with_algo(*opr_reference, A_tensor, B_tensor, C_tensor);
        // Expected output of the large problem: the small result repeated.
        copy(C_tensor_prime, C_tensor);
        checker.set_dtype(0, A_dtype)
                .set_dtype(1, B_dtype)
                .set_dtype(2, C_dtype)
                .set_epsilon(1e-6)
                .exect({A_tensor_prime.tensornd_host(),
                        B_tensor.tensornd_host(),
                        {}},
                       {{}, {}, C_tensor_prime.tensornd_host()});
        // Run the large problem directly and compare with the stacked copies.
        exec_with_algo(*opr_reference, A_tensor_prime, B_tensor,
                       C_tensor_batch);
        C_tensor_batch.check_with(C_tensor_prime);
    }
}
#if MEGDNN_WITH_BENCHMARK | |||||
// One benchmark case: the GEMM extents m, n, k plus a transpose mask
// (bit 0 selects transposeA, bit 1 selects transposeB in
// benchmark_matrix_mul). The mask defaults to "no transpose".
struct BenchArgs {
    size_t m;
    size_t n;
    size_t k;
    size_t mask = 0x0;
};
std::vector<BenchArgs> get_square_matmul_args() { | |||||
std::vector<BenchArgs> args; | |||||
args.emplace_back(BenchArgs{128, 128, 128}); | |||||
args.emplace_back(BenchArgs{256, 256, 256}); | |||||
args.emplace_back(BenchArgs{512, 512, 512}); | |||||
args.emplace_back(BenchArgs{1024, 1024, 1024}); | |||||
args.emplace_back(BenchArgs{2048, 2048, 2048}); | |||||
args.emplace_back(BenchArgs{4096, 4096, 4096}); | |||||
return args; | |||||
} | |||||
std::vector<BenchArgs> get_feat_model_args() { | |||||
std::vector<BenchArgs> args; | |||||
args.emplace_back(BenchArgs{2, 4096, 4096}); | |||||
args.emplace_back(BenchArgs{2, 1024, 6912}); | |||||
args.emplace_back(BenchArgs{2, 3456, 3456}); | |||||
args.emplace_back(BenchArgs{2, 2304, 2304}); | |||||
args.emplace_back(BenchArgs{1, 256, 8192}); | |||||
args.emplace_back(BenchArgs{2, 864, 864}); | |||||
args.emplace_back(BenchArgs{2, 9, 64}); | |||||
args.emplace_back(BenchArgs{4, 4096, 4096}); | |||||
args.emplace_back(BenchArgs{4, 1024, 6912}); | |||||
args.emplace_back(BenchArgs{4, 3456, 3456}); | |||||
args.emplace_back(BenchArgs{4, 2304, 2304}); | |||||
args.emplace_back(BenchArgs{2, 256, 8192}); | |||||
args.emplace_back(BenchArgs{4, 864, 864}); | |||||
args.emplace_back(BenchArgs{4, 9, 64}); | |||||
args.emplace_back(BenchArgs{8, 4096, 4096}); | |||||
args.emplace_back(BenchArgs{8, 1024, 6912}); | |||||
args.emplace_back(BenchArgs{8, 3456, 3456}); | |||||
args.emplace_back(BenchArgs{8, 2304, 2304}); | |||||
args.emplace_back(BenchArgs{4, 256, 8192}); | |||||
args.emplace_back(BenchArgs{8, 864, 864}); | |||||
args.emplace_back(BenchArgs{4, 9, 64}); | |||||
args.emplace_back(BenchArgs{16, 4096, 4096}); | |||||
args.emplace_back(BenchArgs{16, 1024, 6912}); | |||||
args.emplace_back(BenchArgs{16, 3456, 3456}); | |||||
args.emplace_back(BenchArgs{16, 2304, 2304}); | |||||
args.emplace_back(BenchArgs{8, 256, 8192}); | |||||
args.emplace_back(BenchArgs{16, 864, 864}); | |||||
args.emplace_back(BenchArgs{8, 9, 64}); | |||||
args.emplace_back(BenchArgs{32, 4096, 4096}); | |||||
args.emplace_back(BenchArgs{32, 1024, 6912}); | |||||
args.emplace_back(BenchArgs{32, 3456, 3456}); | |||||
args.emplace_back(BenchArgs{32, 2304, 2304}); | |||||
args.emplace_back(BenchArgs{16, 256, 8192}); | |||||
args.emplace_back(BenchArgs{32, 864, 864}); | |||||
args.emplace_back(BenchArgs{32, 9, 64}); | |||||
args.emplace_back(BenchArgs{64, 4096, 4096}); | |||||
args.emplace_back(BenchArgs{64, 1024, 6912}); | |||||
args.emplace_back(BenchArgs{64, 3456, 3456}); | |||||
args.emplace_back(BenchArgs{64, 2304, 2304}); | |||||
args.emplace_back(BenchArgs{32, 256, 8192}); | |||||
args.emplace_back(BenchArgs{64, 864, 864}); | |||||
args.emplace_back(BenchArgs{64, 9, 64}); | |||||
args.emplace_back(BenchArgs{128, 4096, 4096}); | |||||
args.emplace_back(BenchArgs{128, 1024, 6912}); | |||||
args.emplace_back(BenchArgs{128, 3456, 3456}); | |||||
args.emplace_back(BenchArgs{128, 2304, 2304}); | |||||
args.emplace_back(BenchArgs{64, 256, 8192}); | |||||
args.emplace_back(BenchArgs{128, 864, 864}); | |||||
args.emplace_back(BenchArgs{128, 9, 64}); | |||||
return args; | |||||
} | |||||
/*!
 * \brief Time a matmul algorithm against CUBLAS for a list of shapes and
 *      print one summary line per shape.
 *
 * \param handle handle the benchmarkers run on
 * \param args shapes to benchmark; bit 0 / bit 1 of BenchArgs::mask select
 *      transposeA / transposeB
 * \param A_dtype,B_dtype,C_dtype tensor dtypes; A and B must share the same
 *      dtype enum (asserted)
 * \param algo algorithm name passed to algo_benchmark; when null the
 *      benchmarker runs without an explicit algorithm
 * \param format matmul format stored in the operator param
 */
void benchmark_matrix_mul(
        Handle* handle, const std::vector<BenchArgs>& args, DType A_dtype,
        DType B_dtype, DType C_dtype, const char* algo = nullptr,
        param::MatrixMul::Format format = param::MatrixMul::Format::DEFAULT) {
    megdnn_assert(A_dtype.enumv() == B_dtype.enumv());
    CUBenchmarker<MatrixMulForward> benchmarker(handle);
    CUBenchmarker<MatrixMulForward> benchmarker_cublas(handle);
    size_t RUNS = 1000;
    benchmarker.set_display(false).set_times(RUNS);
    benchmarker_cublas.set_display(false).set_times(RUNS);
    benchmarker_cublas.set_before_exec_callback(
            AlgoChecker<MatrixMulForward>("CUBLAS"));
    benchmarker.set_dtype(0, A_dtype)
            .set_dtype(1, B_dtype)
            .set_dtype(2, C_dtype);
    benchmarker_cublas.set_dtype(0, A_dtype)
            .set_dtype(1, B_dtype)
            .set_dtype(2, C_dtype);
    // algo defaults to nullptr; passing a null pointer to printf("%s") is
    // undefined behaviour, so print a placeholder name instead.
    const char* algo_name = algo ? algo : "default";
    using Param = MatrixMul::Param;
    for (auto&& arg : args) {
        size_t m = arg.m, n = arg.n, k = arg.k;
        Param param;
        param.transposeA = arg.mask & 0x1;
        param.transposeB = arg.mask & 0x2;
        param.format = format;
        // Storage shapes of A and B are swapped when they are transposed.
        size_t A0 = m, A1 = k, B0 = k, B1 = n;
        if (param.transposeA) {
            std::swap(A0, A1);
        }
        if (param.transposeB) {
            std::swap(B0, B1);
        }
        benchmarker.set_param(param);
        TensorShape A{A0, A1}, B{B0, B1}, C{m, n};
        // Both timings are divided by RUNS, the repeat count configured above.
        float time_in_ms = 0.f;
        if (algo) {
            time_in_ms =
                    algo_benchmark<MatrixMulForward, OprProxy<MatrixMulForward>,
                                   CUTimer>(benchmarker, {A, B, C}, algo) /
                    RUNS;
        } else {
            time_in_ms = benchmarker.execs({A, B, C}) / RUNS;
        }
        benchmarker_cublas.set_param(param);
        auto time_in_ms_cublas = benchmarker_cublas.execs({A, B, C}) / RUNS;
        // 2*m*n*k operations, scaled by 1e12 for the "Tops" columns.
        float flo = 2.0 * m * n * k / (1e12);
        printf("A=%s, B=%s, C=%s, time(algo=%s)=%.2f %.2fTops, "
               "time(cublas)=%.2f %.2fTops, "
               "perf(algo=%s)/perf(cublas)=%.2f\n",
               A.to_string().c_str(), B.to_string().c_str(),
               C.to_string().c_str(), algo_name, time_in_ms,
               (flo / (time_in_ms * 1e-3)), time_in_ms_cublas,
               (flo / (time_in_ms_cublas * 1e-3)), algo_name,
               time_in_ms_cublas / time_in_ms);
    }
}
#endif | |||||
} // namespace | |||||
// Runs the multi-batch-size consistency check for the 128x128x8 / 32x64x8
// fp32 SIMT kernel over the mask-free matmul test cases.
TEST_F(CUDA, CUTLASS_GEMM_MULTI_BATCHSIZE) {
    const auto cases = matrix_mul::get_matmul_args_no_mask();
    const auto f32 = dtype::Float32();
    test_multibatchsize(handle_cuda(), f32, f32, f32,
                        "CUTLASS_FLOAT32_SIMT_128X128X8_32X64X8", cases,
                        param::MatrixMul::Format::DEFAULT);
}
// Table of kernel configurations under test. Each row is
// (test id, tile sizes m/n/k x 2); the two triples are pasted into the
// algorithm name as "<m>X<n>X<k>_<m>X<n>X<k>". NOTE(review): presumably the
// first triple is the threadblock shape and the second the warp shape --
// confirm against the algorithm naming code.
#define MEGDNN_FOREACH_CUTLASS_KERNEL(emit) \
    emit(1, 64, 256, 8, 32, 64, 8)          \
    emit(2, 256, 64, 8, 64, 32, 8)          \
    emit(3, 32, 256, 8, 16, 64, 8)          \
    emit(4, 256, 32, 8, 64, 16, 8)          \
    emit(5, 128, 128, 8, 32, 64, 8)         \
    emit(6, 128, 64, 8, 64, 32, 8)          \
    emit(7, 64, 128, 8, 32, 64, 8)          \
    emit(8, 128, 32, 8, 64, 32, 8)          \
    emit(9, 32, 128, 8, 32, 64, 8)          \
    emit(10, 64, 64, 8, 32, 64, 8)          \
    emit(11, 32, 64, 8, 32, 64, 8)          \
    emit(12, 64, 32, 8, 64, 32, 8)          \
    emit(13, 32, 32, 8, 32, 32, 8)          \
    emit(14, 8, 32, 8, 8, 32, 8)            \
    emit(15, 16, 32, 8, 16, 32, 8)          \
    emit(16, 16, 64, 8, 16, 64, 8)          \
    emit(17, 16, 128, 8, 16, 64, 8)
// Defines TEST_F(CUDA, CUTLASS_GEMM_<id>) which runs the generic matmul
// check with the algorithm name assembled from the tile sizes.
#define cb(id, tb_m, tb_n, tb_k, warp_m, warp_n, warp_k)                    \
    TEST_F(CUDA, CUTLASS_GEMM_##id) {                                       \
        matrix_mul::check_matrix_mul<MatrixMulForward>(                     \
                dtype::Float32(), dtype::Float32(), dtype::Float32(),       \
                handle_cuda(),                                              \
                "CUTLASS_FLOAT32_SIMT_" #tb_m "X" #tb_n "X" #tb_k           \
                "_" #warp_m "X" #warp_n "X" #warp_k);                       \
    }
MEGDNN_FOREACH_CUTLASS_KERNEL(cb)
#undef cb
#undef MEGDNN_FOREACH_CUTLASS_KERNEL
#if MEGDNN_WITH_BENCHMARK | |||||
// Benchmarks the fp32 SIMT CUTLASS matmul against CUBLAS on square shapes.
TEST_F(CUDA, BENCHMARK_CUTLASS_MATMUL) {
    const auto shapes = get_square_matmul_args();
    const auto f32 = dtype::Float32();
    benchmark_matrix_mul(handle_cuda(), shapes, f32, f32, f32,
                         "CUTLASS_FLOAT32_SIMT");
}
// Benchmarks the fp32 SIMT CUTLASS matmul against CUBLAS on the feat-model
// shapes.
TEST_F(CUDA, BENCHMARK_CUTLASS_MATMUL_FEAT) {
    const auto shapes = get_feat_model_args();
    const auto f32 = dtype::Float32();
    benchmark_matrix_mul(handle_cuda(), shapes, f32, f32, f32,
                         "CUTLASS_FLOAT32_SIMT");
}
#endif | |||||
} // namespace test | |||||
} // namespace megdnn | |||||
#endif | |||||
// vim: syntax=cpp.doxygen |