GitOrigin-RevId: 650209e35f
tags/v1.3.0
@@ -9,9 +9,9 @@ ELEMWISE_IMPL := ../src/cuda/cond_take/kimpl \ | |||||
../src/cuda/elemwise_multi_type/kimpl | ../src/cuda/elemwise_multi_type/kimpl | ||||
CUDA_CONV_IMPL := ../src/cuda/conv_bias/int8/kimpl ../src/cuda/conv_bias/int8_imma/kimpl ../src/cuda/batch_conv_bias/int8/kimpl | CUDA_CONV_IMPL := ../src/cuda/conv_bias/int8/kimpl ../src/cuda/conv_bias/int8_imma/kimpl ../src/cuda/batch_conv_bias/int8/kimpl | ||||
CUDA_MATMUL_KIMPL := ../src/cuda/matrix_mul/fp32_simt/kimpl | |||||
CUDA_MATMUL_IMPL := ../src/cuda/matrix_mul/fp32_simt/kimpl | |||||
all: ${PARAM_DEFS} ${ELEMWISE_IMPL} ${CUDA_CONV_IMPL} $(CUDA_MATMUL_KIMPL) | |||||
all: ${PARAM_DEFS} ${ELEMWISE_IMPL} ${CUDA_CONV_IMPL} $(CUDA_MATMUL_IMPL) | |||||
../src/common/elemwise/each_mode.inl: gen_elemwise_each_mode.py | ../src/common/elemwise/each_mode.inl: gen_elemwise_each_mode.py | ||||
./$^ $@ | ./$^ $@ | ||||
@@ -37,6 +37,9 @@ MatrixMulForwardImpl::AlgoPack::AlgoPack() { | |||||
for (auto&& algo : simt_float32) { | for (auto&& algo : simt_float32) { | ||||
all_algos.push_back(&algo); | all_algos.push_back(&algo); | ||||
} | } | ||||
for (auto&& algo : simt_float32_split_k) { | |||||
all_algos.push_back(&algo); | |||||
} | |||||
for (auto&& algo : all_algos) { | for (auto&& algo : all_algos) { | ||||
m_all_algos_map.emplace(algo->info().desc, algo); | m_all_algos_map.emplace(algo->info().desc, algo); | ||||
@@ -62,6 +65,23 @@ void MatrixMulForwardImpl::AlgoPack::fill_cutlass_algos() { | |||||
simt_float32.emplace_back(AlgoParam{16, 32, 8, 16, 32, 8}); | simt_float32.emplace_back(AlgoParam{16, 32, 8, 16, 32, 8}); | ||||
simt_float32.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8}); | simt_float32.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8}); | ||||
simt_float32.emplace_back(AlgoParam{16, 128, 8, 16, 64, 8}); | simt_float32.emplace_back(AlgoParam{16, 128, 8, 16, 64, 8}); | ||||
simt_float32_split_k.emplace_back(AlgoParam{64, 256, 8, 32, 64, 8}); | |||||
simt_float32_split_k.emplace_back(AlgoParam{256, 64, 8, 64, 32, 8}); | |||||
simt_float32_split_k.emplace_back(AlgoParam{32, 256, 8, 16, 64, 8}); | |||||
simt_float32_split_k.emplace_back(AlgoParam{256, 32, 8, 64, 16, 8}); | |||||
simt_float32_split_k.emplace_back(AlgoParam{128, 128, 8, 32, 64, 8}); | |||||
simt_float32_split_k.emplace_back(AlgoParam{128, 64, 8, 64, 32, 8}); | |||||
simt_float32_split_k.emplace_back(AlgoParam{64, 128, 8, 32, 64, 8}); | |||||
simt_float32_split_k.emplace_back(AlgoParam{128, 32, 8, 64, 32, 8}); | |||||
simt_float32_split_k.emplace_back(AlgoParam{32, 128, 8, 32, 64, 8}); | |||||
simt_float32_split_k.emplace_back(AlgoParam{64, 64, 8, 32, 64, 8}); | |||||
simt_float32_split_k.emplace_back(AlgoParam{32, 64, 8, 32, 64, 8}); | |||||
simt_float32_split_k.emplace_back(AlgoParam{64, 32, 8, 64, 32, 8}); | |||||
simt_float32_split_k.emplace_back(AlgoParam{32, 32, 8, 32, 32, 8}); | |||||
simt_float32_split_k.emplace_back(AlgoParam{8, 32, 8, 8, 32, 8}); | |||||
simt_float32_split_k.emplace_back(AlgoParam{16, 32, 8, 16, 32, 8}); | |||||
simt_float32_split_k.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8}); | |||||
simt_float32_split_k.emplace_back(AlgoParam{16, 128, 8, 16, 64, 8}); | |||||
} | } | ||||
MatrixMulForwardImpl::AlgoPack MatrixMulForwardImpl::sm_algo_pack; | MatrixMulForwardImpl::AlgoPack MatrixMulForwardImpl::sm_algo_pack; | ||||
@@ -43,6 +43,7 @@ public: | |||||
CUDA_NAIVE, | CUDA_NAIVE, | ||||
CUDA_BFLOAT16, | CUDA_BFLOAT16, | ||||
CUDA_FLOAT32_SIMT, | CUDA_FLOAT32_SIMT, | ||||
CUDA_FLOAT32_SIMT_SPLIT_K, | |||||
}; | }; | ||||
using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>; | using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>; | ||||
@@ -198,6 +199,31 @@ private: | |||||
std::string m_name; | std::string m_name; | ||||
}; | }; | ||||
/*!
 * \brief Float32 SIMT matrix multiplication algorithm using split-k.
 *
 * Each instance is bound to one tile configuration (AlgoParam); the
 * configuration is embedded in the algorithm name and serialized by param().
 */
class MatrixMulForwardImpl::AlgoFloat32SIMTSplitK final : public AlgoBase {
public:
    //! tile configuration type, shared with the non-split-k SIMT algorithm
    using AlgoParam = MatrixMulForwardImpl::AlgoFloat32SIMT::AlgoParam;

    //! \param algo_param tile configuration this algorithm instance runs with
    AlgoFloat32SIMTSplitK(AlgoParam algo_param)
            : m_algo_param{algo_param},
              // m_algo_param is declared before m_name, so it is already
              // initialized when the name is formatted from it
              m_name{ssprintf("CUTLASS_FLOAT32_SIMT_SPLIT_K_%s",
                              m_algo_param.to_string().c_str())} {}

    //! human readable name: fixed prefix followed by the tile configuration
    const char* name() const override { return m_name.c_str(); }

    //! whether this algorithm can handle the given problem
    bool is_available(const SizeArgs& args) const override;

    //! workspace size in bytes required by exec() for the given problem
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;

    //! run the matrix multiplication
    void exec(const ExecArgs& args) const override;

    bool is_reproducible() const override { return true; }

    MEGDNN_DECL_ALGO_TYPE(CUDA_FLOAT32_SIMT_SPLIT_K)

    //! serialized form of the tile configuration
    std::string param() const override {
        std::string serialized;
        serialize_write_pod(m_algo_param, serialized);
        return serialized;
    }

private:
    AlgoParam m_algo_param;
    std::string m_name;
};
class MatrixMulForwardImpl::AlgoPack : NonCopyableObj { | class MatrixMulForwardImpl::AlgoPack : NonCopyableObj { | ||||
private: | private: | ||||
AlgoBase::Mapper m_all_algos_map; | AlgoBase::Mapper m_all_algos_map; | ||||
@@ -216,6 +242,7 @@ public: | |||||
AlgoBFloat16 bfloat16; | AlgoBFloat16 bfloat16; | ||||
#endif | #endif | ||||
std::vector<AlgoFloat32SIMT> simt_float32; | std::vector<AlgoFloat32SIMT> simt_float32; | ||||
std::vector<AlgoFloat32SIMTSplitK> simt_float32_split_k; | |||||
std::vector<AlgoBase*> all_algos; | std::vector<AlgoBase*> all_algos; | ||||
const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; } | const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; } | ||||
@@ -0,0 +1,76 @@ | |||||
/**
 * \file dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "src/cuda/handle.h" | |||||
#include "src/cuda/matrix_mul/algos.h" | |||||
#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh" | |||||
#include "src/cuda/utils.h" | |||||
using namespace megdnn; | |||||
using namespace cuda; | |||||
using namespace cutlass_wrapper; | |||||
bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available( | |||||
const SizeArgs& args) const { | |||||
auto&& param = args.opr->param(); | |||||
int m = args.layout_c.shape[0], n = args.layout_c.shape[1], | |||||
k = args.layout_a.shape[param.transposeA ? 0 : 1]; | |||||
return args.opr->param().format == param::MatrixMul::Format::DEFAULT && | |||||
args.layout_a.dtype == dtype::Float32() && | |||||
args.layout_b.dtype == dtype::Float32() && | |||||
args.layout_c.dtype == dtype::Float32() && k > std::max(m, n); | |||||
} | |||||
size_t MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::get_workspace_in_bytes( | |||||
const SizeArgs& args) const { | |||||
size_t lda = args.layout_a.stride[0], ldb = args.layout_b.stride[0], | |||||
ldc = args.layout_c.stride[0]; | |||||
auto&& param = args.opr->param(); | |||||
int m = args.layout_c.shape[0], n = args.layout_c.shape[1], | |||||
k = args.layout_a.shape[param.transposeA ? 0 : 1]; | |||||
GemmCoord problem_size{m, n, k}; | |||||
int split_k_slices = k / std::max(m, n); | |||||
return cutlass_matrix_mul_float32_simt_get_workspace_size( | |||||
param.transposeA, lda, param.transposeB, ldb, ldc, problem_size, | |||||
1.f, 0.f, | |||||
GemmCoord{m_algo_param.threadblock_m, m_algo_param.threadblock_n, | |||||
m_algo_param.threadblock_k}, | |||||
GemmCoord{m_algo_param.warp_m, m_algo_param.warp_n, | |||||
m_algo_param.warp_k}, | |||||
split_k_slices); | |||||
} | |||||
void MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::exec( | |||||
const ExecArgs& args) const { | |||||
size_t lda = args.tensor_a.layout.stride[0], | |||||
ldb = args.tensor_b.layout.stride[0], | |||||
ldc = args.tensor_c.layout.stride[0]; | |||||
auto&& param = args.opr->param(); | |||||
int m = args.tensor_c.layout.shape[0], n = args.tensor_c.layout.shape[1], | |||||
k = args.tensor_a.layout.shape[param.transposeA ? 0 : 1]; | |||||
GemmCoord problem_size{m, n, k}; | |||||
int split_k_slices = k / std::max(m, n); | |||||
auto&& stream = cuda_stream(args.opr->handle()); | |||||
int* workspace = reinterpret_cast<int*>(args.workspace.raw_ptr); | |||||
return cutlass_matrix_mul_float32_simt( | |||||
args.tensor_a.ptr<dt_float32>(), param.transposeA, lda, | |||||
args.tensor_b.ptr<dt_float32>(), param.transposeB, ldb, | |||||
args.tensor_c.ptr<dt_float32>(), ldc, workspace, problem_size, 1.f, | |||||
0.f, | |||||
GemmCoord{m_algo_param.threadblock_m, m_algo_param.threadblock_n, | |||||
m_algo_param.threadblock_k}, | |||||
GemmCoord{m_algo_param.warp_m, m_algo_param.warp_n, | |||||
m_algo_param.warp_k}, | |||||
stream, split_k_slices); | |||||
} | |||||
// vim: syntax=cpp.doxygen |
@@ -18,6 +18,7 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || \ | #if __CUDACC_VER_MAJOR__ > 9 || \ | ||||
(__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | ||||
#include "cutlass/gemm/device/gemm.h" | #include "cutlass/gemm/device/gemm.h" | ||||
#include "cutlass/gemm/device/gemm_splitk_parallel.h" | |||||
#endif | #endif | ||||
#include "src/common/opr_param_defs_enumv.cuh" | #include "src/common/opr_param_defs_enumv.cuh" | ||||
#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh" | #include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh" | ||||
@@ -62,14 +63,20 @@ void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_float32_simt( | |||||
float* /* d_C */, size_t /* ldc */, int* /* workspace */, | float* /* d_C */, size_t /* ldc */, int* /* workspace */, | ||||
GemmCoord const& /* problem_size */, float /* alpha */, | GemmCoord const& /* problem_size */, float /* alpha */, | ||||
float /* beta */, const GemmCoord& /* threadblock_shape */, | float /* beta */, const GemmCoord& /* threadblock_shape */, | ||||
const GemmCoord& /* warp_shape */, cudaStream_t /* stream */) {} | |||||
const GemmCoord& /* warp_shape */, cudaStream_t /* stream */, | |||||
int /* split_k_slices */) {} | |||||
#else | #else | ||||
void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_float32_simt( | void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_float32_simt( | ||||
const float* d_A, bool transpose_A, size_t lda, const float* d_B, | const float* d_A, bool transpose_A, size_t lda, const float* d_B, | ||||
bool transpose_B, size_t ldb, float* d_C, size_t ldc, int* workspace, | bool transpose_B, size_t ldb, float* d_C, size_t ldc, int* workspace, | ||||
GemmCoord const& problem_size, float alpha, float beta, | GemmCoord const& problem_size, float alpha, float beta, | ||||
const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, | const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, | ||||
cudaStream_t stream) { | |||||
cudaStream_t stream, int split_k_slices) { | |||||
static constexpr int kEpilogueElementsPerAccess = 1; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination< | |||||
float, kEpilogueElementsPerAccess, float, float>; | |||||
typename EpilogueOp::Params epilogue{alpha, beta}; | |||||
if (split_k_slices == 1) { | |||||
#define cb(threadblock_m_, threadblock_n_, threadblock_k_, warp_m_, warp_n_, \ | #define cb(threadblock_m_, threadblock_n_, threadblock_k_, warp_m_, warp_n_, \ | ||||
warp_k_) \ | warp_k_) \ | ||||
if (threadblock_shape.m() == threadblock_m_ && \ | if (threadblock_shape.m() == threadblock_m_ && \ | ||||
@@ -93,29 +100,67 @@ void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_float32_simt( | |||||
workspace, problem_size, \ | workspace, problem_size, \ | ||||
epilogue, stream); \ | epilogue, stream); \ | ||||
} | } | ||||
static constexpr int kEpilogueElementsPerAccess = 1; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination< | |||||
float, kEpilogueElementsPerAccess, float, float>; | |||||
typename EpilogueOp::Params epilogue{alpha, beta}; | |||||
if (!transpose_A && !transpose_B) { | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
DISPATCH(cb) | |||||
} else if (!transpose_A && transpose_B) { | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
DISPATCH(cb) | |||||
} else if (transpose_A && !transpose_B) { | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
DISPATCH(cb) | |||||
if (!transpose_A && !transpose_B) { | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
DISPATCH(cb) | |||||
} else if (!transpose_A && transpose_B) { | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
DISPATCH(cb) | |||||
} else if (transpose_A && !transpose_B) { | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
DISPATCH(cb) | |||||
} else { | |||||
megdnn_assert(transpose_A && transpose_B); | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
DISPATCH(cb) | |||||
} | |||||
#undef cb | |||||
} else { | } else { | ||||
megdnn_assert(transpose_A && transpose_B); | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
DISPATCH(cb) | |||||
#define cb(threadblock_m_, threadblock_n_, threadblock_k_, warp_m_, warp_n_, \ | |||||
warp_k_) \ | |||||
if (threadblock_shape.m() == threadblock_m_ && \ | |||||
threadblock_shape.n() == threadblock_n_ && \ | |||||
threadblock_shape.k() == threadblock_k_ && \ | |||||
warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \ | |||||
warp_shape.k() == warp_k_) { \ | |||||
using ThreadBlockShape = \ | |||||
cutlass::gemm::GemmShape<threadblock_m_, threadblock_n_, \ | |||||
threadblock_k_>; \ | |||||
using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; \ | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< \ | |||||
float, LayoutA, float, LayoutB, float, \ | |||||
cutlass::layout::RowMajor, float, cutlass::arch::OpClassSimt, \ | |||||
cutlass::arch::Sm50, ThreadBlockShape, WarpShape, \ | |||||
InstructionShape, EpilogueOp>; \ | |||||
return cutlass_matrix_mul_wrapper<Gemm>( \ | |||||
d_A, lda, d_B, ldb, d_C, ldc, workspace, problem_size, \ | |||||
epilogue, stream, split_k_slices); \ | |||||
} | } | ||||
if (!transpose_A && !transpose_B) { | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
DISPATCH(cb) | |||||
} else if (!transpose_A && transpose_B) { | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
DISPATCH(cb) | |||||
} else if (transpose_A && !transpose_B) { | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
DISPATCH(cb) | |||||
} else { | |||||
megdnn_assert(transpose_A && transpose_B); | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
DISPATCH(cb) | |||||
} | |||||
#undef cb | #undef cb | ||||
} | |||||
} | } | ||||
#endif | #endif | ||||
@@ -127,7 +172,7 @@ size_t megdnn::cuda::cutlass_wrapper:: | |||||
bool /* transpose_B */, size_t /* ldb */, size_t /* ldc */, | bool /* transpose_B */, size_t /* ldb */, size_t /* ldc */, | ||||
GemmCoord const& /* problem_size */, float /* alpha */, | GemmCoord const& /* problem_size */, float /* alpha */, | ||||
float /* beta */, const GemmCoord& /* threadblock_shape */, | float /* beta */, const GemmCoord& /* threadblock_shape */, | ||||
const GemmCoord& /* warp_shape */) { | |||||
const GemmCoord& /* warp_shape */, int /* split_k_slices */) { | |||||
return 0; | return 0; | ||||
} | } | ||||
#else | #else | ||||
@@ -136,7 +181,12 @@ size_t megdnn::cuda::cutlass_wrapper:: | |||||
bool transpose_A, size_t lda, bool transpose_B, size_t ldb, | bool transpose_A, size_t lda, bool transpose_B, size_t ldb, | ||||
size_t ldc, GemmCoord const& problem_size, float alpha, | size_t ldc, GemmCoord const& problem_size, float alpha, | ||||
float beta, const GemmCoord& threadblock_shape, | float beta, const GemmCoord& threadblock_shape, | ||||
const GemmCoord& warp_shape) { | |||||
const GemmCoord& warp_shape, int split_k_slices) { | |||||
static constexpr int kEpilogueElementsPerAccess = 1; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination< | |||||
float, kEpilogueElementsPerAccess, float, float>; | |||||
typename EpilogueOp::Params epilogue{alpha, beta}; | |||||
if (split_k_slices == 1) { | |||||
#define cb(threadblock_m_, threadblock_n_, threadblock_k_, warp_m_, warp_n_, \ | #define cb(threadblock_m_, threadblock_n_, threadblock_k_, warp_m_, warp_n_, \ | ||||
warp_k_) \ | warp_k_) \ | ||||
if (threadblock_shape.m() == threadblock_m_ && \ | if (threadblock_shape.m() == threadblock_m_ && \ | ||||
@@ -169,30 +219,80 @@ size_t megdnn::cuda::cutlass_wrapper:: | |||||
split_k_slices}; \ | split_k_slices}; \ | ||||
return Gemm::get_workspace_size(arguments); \ | return Gemm::get_workspace_size(arguments); \ | ||||
} | } | ||||
static constexpr int kEpilogueElementsPerAccess = 1; | |||||
static constexpr int split_k_slices = 1; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination< | |||||
float, kEpilogueElementsPerAccess, float, float>; | |||||
typename EpilogueOp::Params epilogue{alpha, beta}; | |||||
if (!transpose_A && !transpose_B) { | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
DISPATCH(cb) | |||||
} else if (!transpose_A && transpose_B) { | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
DISPATCH(cb) | |||||
} else if (transpose_A && !transpose_B) { | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
DISPATCH(cb) | |||||
if (!transpose_A && !transpose_B) { | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
DISPATCH(cb) | |||||
} else if (!transpose_A && transpose_B) { | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
DISPATCH(cb) | |||||
} else if (transpose_A && !transpose_B) { | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
DISPATCH(cb) | |||||
} else { | |||||
megdnn_assert(transpose_A && transpose_B); | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
DISPATCH(cb) | |||||
} | |||||
#undef cb | |||||
} else { | } else { | ||||
megdnn_assert(transpose_A && transpose_B); | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
DISPATCH(cb) | |||||
#define cb(threadblock_m_, threadblock_n_, threadblock_k_, warp_m_, warp_n_, \ | |||||
warp_k_) \ | |||||
if (threadblock_shape.m() == threadblock_m_ && \ | |||||
threadblock_shape.n() == threadblock_n_ && \ | |||||
threadblock_shape.k() == threadblock_k_ && \ | |||||
warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \ | |||||
warp_shape.k() == warp_k_) { \ | |||||
using ThreadBlockShape = \ | |||||
cutlass::gemm::GemmShape<threadblock_m_, threadblock_n_, \ | |||||
threadblock_k_>; \ | |||||
using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; \ | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< \ | |||||
float, LayoutA, float, LayoutB, float, \ | |||||
cutlass::layout::RowMajor, float, cutlass::arch::OpClassSimt, \ | |||||
cutlass::arch::Sm50, ThreadBlockShape, WarpShape, \ | |||||
InstructionShape, EpilogueOp>; \ | |||||
using TensorRefA = cutlass::TensorRef<typename Gemm::ElementA const, \ | |||||
typename Gemm::LayoutA>; \ | |||||
using TensorRefB = cutlass::TensorRef<typename Gemm::ElementB const, \ | |||||
typename Gemm::LayoutB>; \ | |||||
using TensorRefC = cutlass::TensorRef<typename Gemm::ElementC const, \ | |||||
typename Gemm::LayoutC>; \ | |||||
using TensorRefD = cutlass::TensorRef<typename Gemm::ElementC, \ | |||||
typename Gemm::LayoutC>; \ | |||||
TensorRefA tensor_A{nullptr, Gemm::LayoutA{static_cast<int>(lda)}}; \ | |||||
TensorRefB tensor_B{nullptr, Gemm::LayoutB{static_cast<int>(ldb)}}; \ | |||||
TensorRefC tensor_C{nullptr, Gemm::LayoutC{static_cast<int>(ldc)}}; \ | |||||
TensorRefD tensor_D{nullptr, Gemm::LayoutC{static_cast<int>(ldc)}}; \ | |||||
typename Gemm::Arguments arguments{problem_size, tensor_A, tensor_B, \ | |||||
tensor_C, tensor_D, epilogue, \ | |||||
split_k_slices}; \ | |||||
return Gemm::get_workspace_size(arguments); \ | |||||
} | } | ||||
if (!transpose_A && !transpose_B) { | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
DISPATCH(cb) | |||||
} else if (!transpose_A && transpose_B) { | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
DISPATCH(cb) | |||||
} else if (transpose_A && !transpose_B) { | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
DISPATCH(cb) | |||||
} else { | |||||
megdnn_assert(transpose_A && transpose_B); | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
DISPATCH(cb) | |||||
} | |||||
#undef cb | #undef cb | ||||
} | |||||
} | } | ||||
#endif | #endif | ||||
@@ -26,19 +26,19 @@ void cutlass_matrix_mul_wrapper( | |||||
typename Gemm::ElementC* d_C, size_t ldc, int* workspace, | typename Gemm::ElementC* d_C, size_t ldc, int* workspace, | ||||
GemmCoord const& problem_size, | GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices = 1); | |||||
void cutlass_matrix_mul_float32_simt( | void cutlass_matrix_mul_float32_simt( | ||||
const float* d_A, bool transpose_A, size_t lda, const float* d_B, | const float* d_A, bool transpose_A, size_t lda, const float* d_B, | ||||
bool transpose_B, size_t ldb, float* d_C, size_t ldc, int* workspace, | bool transpose_B, size_t ldb, float* d_C, size_t ldc, int* workspace, | ||||
GemmCoord const& problem_size, float alpha, float beta, | GemmCoord const& problem_size, float alpha, float beta, | ||||
const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, | const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices = 1); | |||||
size_t cutlass_matrix_mul_float32_simt_get_workspace_size( | size_t cutlass_matrix_mul_float32_simt_get_workspace_size( | ||||
bool transpose_A, size_t lda, bool transpose_B, size_t ldb, size_t ldc, | bool transpose_A, size_t lda, bool transpose_B, size_t ldb, size_t ldc, | ||||
GemmCoord const& problem_size, float alpha, float beta, | GemmCoord const& problem_size, float alpha, float beta, | ||||
const GemmCoord& threadblock_shape, const GemmCoord& warp_shape); | |||||
const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, int split_k_slices = 1); | |||||
} // namespace cutlass_wrapper | } // namespace cutlass_wrapper | ||||
} // namespace cuda | } // namespace cuda | ||||
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::RowMajor; | using LayoutA = cutlass::layout::RowMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

// Explicit instantiation of cutlass_matrix_mul_wrapper for the split-k
// parallel float32 SIMT GEMM with A row-major, B row-major,
// threadblock tile 128x128x8 and warp tile 32x64x8.

// tile shapes
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>;
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

// operand layouts
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::RowMajor;

// epilogue functor
using EpilogueOp =
        cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;

using Gemm = cutlass::gemm::device::GemmSplitKParallel<
        float, LayoutA, float, LayoutB, float, cutlass::layout::RowMajor,
        float, cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;

template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const typename Gemm::ElementA* d_A, size_t lda,
        const typename Gemm::ElementB* d_B, size_t ldb,
        typename Gemm::ElementC* d_C, size_t ldc, int* workspace,
        cutlass::gemm::GemmCoord const& problem_size,
        typename Gemm::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream, int split_k_slices);
#pragma GCC diagnostic pop
#endif
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::RowMajor; | using LayoutA = cutlass::layout::RowMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

// Explicit instantiation of cutlass_matrix_mul_wrapper for the split-k
// parallel float32 SIMT GEMM with A row-major, B column-major,
// threadblock tile 128x128x8 and warp tile 32x64x8.

// tile shapes
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>;
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

// operand layouts
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;

// epilogue functor
using EpilogueOp =
        cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;

using Gemm = cutlass::gemm::device::GemmSplitKParallel<
        float, LayoutA, float, LayoutB, float, cutlass::layout::RowMajor,
        float, cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;

template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const typename Gemm::ElementA* d_A, size_t lda,
        const typename Gemm::ElementB* d_B, size_t ldb,
        typename Gemm::ElementC* d_C, size_t ldc, int* workspace,
        cutlass::gemm::GemmCoord const& problem_size,
        typename Gemm::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream, int split_k_slices);
#pragma GCC diagnostic pop
#endif
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::ColumnMajor; | using LayoutA = cutlass::layout::ColumnMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

// Explicit instantiation of cutlass_matrix_mul_wrapper for the split-k
// parallel float32 SIMT GEMM with A column-major, B row-major,
// threadblock tile 128x128x8 and warp tile 32x64x8.

// tile shapes
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>;
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

// operand layouts
using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::RowMajor;

// epilogue functor
using EpilogueOp =
        cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;

using Gemm = cutlass::gemm::device::GemmSplitKParallel<
        float, LayoutA, float, LayoutB, float, cutlass::layout::RowMajor,
        float, cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;

template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
        const typename Gemm::ElementA* d_A, size_t lda,
        const typename Gemm::ElementB* d_B, size_t ldb,
        typename Gemm::ElementC* d_C, size_t ldc, int* workspace,
        cutlass::gemm::GemmCoord const& problem_size,
        typename Gemm::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream, int split_k_slices);
#pragma GCC diagnostic pop
#endif
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::ColumnMajor; | using LayoutA = cutlass::layout::ColumnMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::RowMajor; | using LayoutA = cutlass::layout::RowMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::RowMajor; | using LayoutA = cutlass::layout::RowMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::ColumnMajor; | using LayoutA = cutlass::layout::ColumnMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::ColumnMajor; | using LayoutA = cutlass::layout::ColumnMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::RowMajor; | using LayoutA = cutlass::layout::RowMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::RowMajor; | using LayoutA = cutlass::layout::RowMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::ColumnMajor; | using LayoutA = cutlass::layout::ColumnMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::ColumnMajor; | using LayoutA = cutlass::layout::ColumnMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::RowMajor; | using LayoutA = cutlass::layout::RowMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::RowMajor; | using LayoutA = cutlass::layout::RowMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::ColumnMajor; | using LayoutA = cutlass::layout::ColumnMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::ColumnMajor; | using LayoutA = cutlass::layout::ColumnMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::RowMajor; | using LayoutA = cutlass::layout::RowMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::RowMajor; | using LayoutA = cutlass::layout::RowMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::ColumnMajor; | using LayoutA = cutlass::layout::ColumnMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::ColumnMajor; | using LayoutA = cutlass::layout::ColumnMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::RowMajor; | using LayoutA = cutlass::layout::RowMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::RowMajor; | using LayoutA = cutlass::layout::RowMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::ColumnMajor; | using LayoutA = cutlass::layout::ColumnMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::ColumnMajor; | using LayoutA = cutlass::layout::ColumnMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::RowMajor; | using LayoutA = cutlass::layout::RowMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
// Explicit instantiation of cutlass_matrix_mul_wrapper for one float
// GemmSplitKParallel configuration: A RowMajor, B RowMajor,
// threadblock tile 256x32x8, warp tile 64x16x8.
// Everything below is compiled only when the CUDA compiler version is >= 9.2.
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// NOTE(review): hand edits are presumably lost when the generator is re-run.
// ignore warning of cutlass | |||||
// The push is paired with the pop right before #endif, so the four
// suppressions below are undone at the end of this unit.
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
// Layouts of the A and B operands.
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
// Tile sizes as GemmShape<M, N, K>: per threadblock, then per warp.
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; | |||||
// 1x1x1 instruction shape; the Gemm below selects OpClassSimt.
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
// Linear-combination epilogue: float output, 1 element per access,
// float accumulator and float compute type.
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
// Device-level GemmSplitKParallel: float A/B, float RowMajor C, float
// accumulation, SIMT operator class, Sm50 arch tag.
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
// Instantiate the wrapper for this Gemm. The template definition presumably
// comes from the .cuinl included above (confirm). The signature takes the
// operand pointers with leading dimensions, a workspace pointer, the problem
// size, the epilogue params, the stream and the split-K slice count.
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::RowMajor; | using LayoutA = cutlass::layout::RowMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
// Explicit instantiation of cutlass_matrix_mul_wrapper for one float
// GemmSplitKParallel configuration: A RowMajor, B ColumnMajor,
// threadblock tile 256x32x8, warp tile 64x16x8.
// Everything below is compiled only when the CUDA compiler version is >= 9.2.
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// NOTE(review): hand edits are presumably lost when the generator is re-run.
// ignore warning of cutlass | |||||
// The push is paired with the pop right before #endif, so the four
// suppressions below are undone at the end of this unit.
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
// Layouts of the A and B operands.
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
// Tile sizes as GemmShape<M, N, K>: per threadblock, then per warp.
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; | |||||
// 1x1x1 instruction shape; the Gemm below selects OpClassSimt.
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
// Linear-combination epilogue: float output, 1 element per access,
// float accumulator and float compute type.
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
// Device-level GemmSplitKParallel: float A/B, float RowMajor C, float
// accumulation, SIMT operator class, Sm50 arch tag.
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
// Instantiate the wrapper for this Gemm. The template definition presumably
// comes from the .cuinl included above (confirm). The signature takes the
// operand pointers with leading dimensions, a workspace pointer, the problem
// size, the epilogue params, the stream and the split-K slice count.
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::ColumnMajor; | using LayoutA = cutlass::layout::ColumnMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
// Explicit instantiation of cutlass_matrix_mul_wrapper for one float
// GemmSplitKParallel configuration: A ColumnMajor, B RowMajor,
// threadblock tile 256x32x8, warp tile 64x16x8.
// Everything below is compiled only when the CUDA compiler version is >= 9.2.
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// NOTE(review): hand edits are presumably lost when the generator is re-run.
// ignore warning of cutlass | |||||
// The push is paired with the pop right before #endif, so the four
// suppressions below are undone at the end of this unit.
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
// Layouts of the A and B operands.
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
// Tile sizes as GemmShape<M, N, K>: per threadblock, then per warp.
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; | |||||
// 1x1x1 instruction shape; the Gemm below selects OpClassSimt.
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
// Linear-combination epilogue: float output, 1 element per access,
// float accumulator and float compute type.
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
// Device-level GemmSplitKParallel: float A/B, float RowMajor C, float
// accumulation, SIMT operator class, Sm50 arch tag.
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
// Instantiate the wrapper for this Gemm. The template definition presumably
// comes from the .cuinl included above (confirm). The signature takes the
// operand pointers with leading dimensions, a workspace pointer, the problem
// size, the epilogue params, the stream and the split-K slice count.
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::ColumnMajor; | using LayoutA = cutlass::layout::ColumnMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
// Explicit instantiation of cutlass_matrix_mul_wrapper for one float
// GemmSplitKParallel configuration: A ColumnMajor, B ColumnMajor,
// threadblock tile 256x32x8, warp tile 64x16x8.
// Everything below is compiled only when the CUDA compiler version is >= 9.2.
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// NOTE(review): hand edits are presumably lost when the generator is re-run.
// ignore warning of cutlass | |||||
// The push is paired with the pop right before #endif, so the four
// suppressions below are undone at the end of this unit.
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
// Layouts of the A and B operands.
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
// Tile sizes as GemmShape<M, N, K>: per threadblock, then per warp.
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; | |||||
// 1x1x1 instruction shape; the Gemm below selects OpClassSimt.
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
// Linear-combination epilogue: float output, 1 element per access,
// float accumulator and float compute type.
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
// Device-level GemmSplitKParallel: float A/B, float RowMajor C, float
// accumulation, SIMT operator class, Sm50 arch tag.
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
// Instantiate the wrapper for this Gemm. The template definition presumably
// comes from the .cuinl included above (confirm). The signature takes the
// operand pointers with leading dimensions, a workspace pointer, the problem
// size, the epilogue params, the stream and the split-K slice count.
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::RowMajor; | using LayoutA = cutlass::layout::RowMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
// Explicit instantiation of cutlass_matrix_mul_wrapper for one float
// GemmSplitKParallel configuration: A RowMajor, B RowMajor,
// threadblock tile 256x64x8, warp tile 64x32x8.
// Everything below is compiled only when the CUDA compiler version is >= 9.2.
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// NOTE(review): hand edits are presumably lost when the generator is re-run.
// ignore warning of cutlass | |||||
// The push is paired with the pop right before #endif, so the four
// suppressions below are undone at the end of this unit.
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
// Layouts of the A and B operands.
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
// Tile sizes as GemmShape<M, N, K>: per threadblock, then per warp.
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
// 1x1x1 instruction shape; the Gemm below selects OpClassSimt.
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
// Linear-combination epilogue: float output, 1 element per access,
// float accumulator and float compute type.
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
// Device-level GemmSplitKParallel: float A/B, float RowMajor C, float
// accumulation, SIMT operator class, Sm50 arch tag.
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
// Instantiate the wrapper for this Gemm. The template definition presumably
// comes from the .cuinl included above (confirm). The signature takes the
// operand pointers with leading dimensions, a workspace pointer, the problem
// size, the epilogue params, the stream and the split-K slice count.
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::RowMajor; | using LayoutA = cutlass::layout::RowMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
// Explicit instantiation of cutlass_matrix_mul_wrapper for one float
// GemmSplitKParallel configuration: A RowMajor, B ColumnMajor,
// threadblock tile 256x64x8, warp tile 64x32x8.
// Everything below is compiled only when the CUDA compiler version is >= 9.2.
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// NOTE(review): hand edits are presumably lost when the generator is re-run.
// ignore warning of cutlass | |||||
// The push is paired with the pop right before #endif, so the four
// suppressions below are undone at the end of this unit.
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
// Layouts of the A and B operands.
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
// Tile sizes as GemmShape<M, N, K>: per threadblock, then per warp.
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
// 1x1x1 instruction shape; the Gemm below selects OpClassSimt.
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
// Linear-combination epilogue: float output, 1 element per access,
// float accumulator and float compute type.
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
// Device-level GemmSplitKParallel: float A/B, float RowMajor C, float
// accumulation, SIMT operator class, Sm50 arch tag.
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
// Instantiate the wrapper for this Gemm. The template definition presumably
// comes from the .cuinl included above (confirm). The signature takes the
// operand pointers with leading dimensions, a workspace pointer, the problem
// size, the epilogue params, the stream and the split-K slice count.
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::ColumnMajor; | using LayoutA = cutlass::layout::ColumnMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
// Explicit instantiation of cutlass_matrix_mul_wrapper for one float
// GemmSplitKParallel configuration: A ColumnMajor, B RowMajor,
// threadblock tile 256x64x8, warp tile 64x32x8.
// Everything below is compiled only when the CUDA compiler version is >= 9.2.
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// NOTE(review): hand edits are presumably lost when the generator is re-run.
// ignore warning of cutlass | |||||
// The push is paired with the pop right before #endif, so the four
// suppressions below are undone at the end of this unit.
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
// Layouts of the A and B operands.
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
// Tile sizes as GemmShape<M, N, K>: per threadblock, then per warp.
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
// 1x1x1 instruction shape; the Gemm below selects OpClassSimt.
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
// Linear-combination epilogue: float output, 1 element per access,
// float accumulator and float compute type.
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
// Device-level GemmSplitKParallel: float A/B, float RowMajor C, float
// accumulation, SIMT operator class, Sm50 arch tag.
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
// Instantiate the wrapper for this Gemm. The template definition presumably
// comes from the .cuinl included above (confirm). The signature takes the
// operand pointers with leading dimensions, a workspace pointer, the problem
// size, the epilogue params, the stream and the split-K slice count.
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::ColumnMajor; | using LayoutA = cutlass::layout::ColumnMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
// Explicit instantiation of cutlass_matrix_mul_wrapper for one float
// GemmSplitKParallel configuration: A ColumnMajor, B ColumnMajor,
// threadblock tile 256x64x8, warp tile 64x32x8.
// Everything below is compiled only when the CUDA compiler version is >= 9.2.
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// NOTE(review): hand edits are presumably lost when the generator is re-run.
// ignore warning of cutlass | |||||
// The push is paired with the pop right before #endif, so the four
// suppressions below are undone at the end of this unit.
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
// Layouts of the A and B operands.
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
// Tile sizes as GemmShape<M, N, K>: per threadblock, then per warp.
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
// 1x1x1 instruction shape; the Gemm below selects OpClassSimt.
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
// Linear-combination epilogue: float output, 1 element per access,
// float accumulator and float compute type.
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
// Device-level GemmSplitKParallel: float A/B, float RowMajor C, float
// accumulation, SIMT operator class, Sm50 arch tag.
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
// Instantiate the wrapper for this Gemm. The template definition presumably
// comes from the .cuinl included above (confirm). The signature takes the
// operand pointers with leading dimensions, a workspace pointer, the problem
// size, the epilogue params, the stream and the split-K slice count.
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::RowMajor; | using LayoutA = cutlass::layout::RowMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
// Explicit instantiation of cutlass_matrix_mul_wrapper for one float
// GemmSplitKParallel configuration: A RowMajor, B RowMajor,
// threadblock tile 32x128x8, warp tile 32x64x8.
// Everything below is compiled only when the CUDA compiler version is >= 9.2.
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// NOTE(review): hand edits are presumably lost when the generator is re-run.
// ignore warning of cutlass | |||||
// The push is paired with the pop right before #endif, so the four
// suppressions below are undone at the end of this unit.
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
// Layouts of the A and B operands.
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
// Tile sizes as GemmShape<M, N, K>: per threadblock, then per warp.
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
// 1x1x1 instruction shape; the Gemm below selects OpClassSimt.
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
// Linear-combination epilogue: float output, 1 element per access,
// float accumulator and float compute type.
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
// Device-level GemmSplitKParallel: float A/B, float RowMajor C, float
// accumulation, SIMT operator class, Sm50 arch tag.
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
// Instantiate the wrapper for this Gemm. The template definition presumably
// comes from the .cuinl included above (confirm). The signature takes the
// operand pointers with leading dimensions, a workspace pointer, the problem
// size, the epilogue params, the stream and the split-K slice count.
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::RowMajor; | using LayoutA = cutlass::layout::RowMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
// Explicit instantiation of cutlass_matrix_mul_wrapper for one float
// GemmSplitKParallel configuration: A RowMajor, B ColumnMajor,
// threadblock tile 32x128x8, warp tile 32x64x8.
// Everything below is compiled only when the CUDA compiler version is >= 9.2.
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// NOTE(review): hand edits are presumably lost when the generator is re-run.
// ignore warning of cutlass | |||||
// The push is paired with the pop right before #endif, so the four
// suppressions below are undone at the end of this unit.
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
// Layouts of the A and B operands.
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
// Tile sizes as GemmShape<M, N, K>: per threadblock, then per warp.
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
// 1x1x1 instruction shape; the Gemm below selects OpClassSimt.
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
// Linear-combination epilogue: float output, 1 element per access,
// float accumulator and float compute type.
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
// Device-level GemmSplitKParallel: float A/B, float RowMajor C, float
// accumulation, SIMT operator class, Sm50 arch tag.
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
// Instantiate the wrapper for this Gemm. The template definition presumably
// comes from the .cuinl included above (confirm). The signature takes the
// operand pointers with leading dimensions, a workspace pointer, the problem
// size, the epilogue params, the stream and the split-K slice count.
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::ColumnMajor; | using LayoutA = cutlass::layout::ColumnMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
// Explicit instantiation of cutlass_matrix_mul_wrapper for one float
// GemmSplitKParallel configuration: A ColumnMajor, B RowMajor,
// threadblock tile 32x128x8, warp tile 32x64x8.
// Everything below is compiled only when the CUDA compiler version is >= 9.2.
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// NOTE(review): hand edits are presumably lost when the generator is re-run.
// ignore warning of cutlass | |||||
// The push is paired with the pop right before #endif, so the four
// suppressions below are undone at the end of this unit.
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
// Layouts of the A and B operands.
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
// Tile sizes as GemmShape<M, N, K>: per threadblock, then per warp.
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
// 1x1x1 instruction shape; the Gemm below selects OpClassSimt.
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
// Linear-combination epilogue: float output, 1 element per access,
// float accumulator and float compute type.
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
// Device-level GemmSplitKParallel: float A/B, float RowMajor C, float
// accumulation, SIMT operator class, Sm50 arch tag.
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
// Instantiate the wrapper for this Gemm. The template definition presumably
// comes from the .cuinl included above (confirm). The signature takes the
// operand pointers with leading dimensions, a workspace pointer, the problem
// size, the epilogue params, the stream and the split-K slice count.
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::ColumnMajor; | using LayoutA = cutlass::layout::ColumnMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::RowMajor; | using LayoutA = cutlass::layout::RowMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::RowMajor; | using LayoutA = cutlass::layout::RowMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::ColumnMajor; | using LayoutA = cutlass::layout::ColumnMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::ColumnMajor; | using LayoutA = cutlass::layout::ColumnMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::RowMajor; | using LayoutA = cutlass::layout::RowMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::RowMajor; | using LayoutA = cutlass::layout::RowMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::ColumnMajor; | using LayoutA = cutlass::layout::ColumnMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::ColumnMajor; | using LayoutA = cutlass::layout::ColumnMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::RowMajor; | using LayoutA = cutlass::layout::RowMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::RowMajor; | using LayoutA = cutlass::layout::RowMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::RowMajor; | |||||
using LayoutB = cutlass::layout::ColumnMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -5,6 +5,7 @@ | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | #pragma GCC diagnostic ignored "-Wunused-parameter" | ||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | ||||
using LayoutA = cutlass::layout::ColumnMajor; | using LayoutA = cutlass::layout::ColumnMajor; | ||||
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
int* workspace, | int* workspace, | ||||
cutlass::gemm::GemmCoord const& problem_size, | cutlass::gemm::GemmCoord const& problem_size, | ||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | typename Gemm::EpilogueOutputOp::Params const& epilogue, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | #pragma GCC diagnostic pop | ||||
#endif | #endif |
@@ -0,0 +1,33 @@ | |||||
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
// generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#pragma GCC diagnostic ignored "-Wuninitialized" | |||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||||
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
using LayoutA = cutlass::layout::ColumnMajor; | |||||
using LayoutB = cutlass::layout::RowMajor; | |||||
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
using Gemm = cutlass::gemm::device::GemmSplitKParallel< | |||||
float, LayoutA, | |||||
float, LayoutB, | |||||
float, cutlass::layout::RowMajor, float, | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
const typename Gemm::ElementA* d_A, size_t lda, | |||||
const typename Gemm::ElementB* d_B, size_t ldb, | |||||
typename Gemm::ElementC* d_C, size_t ldc, | |||||
int* workspace, | |||||
cutlass::gemm::GemmCoord const& problem_size, | |||||
typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, int split_k_slices); | |||||
#pragma GCC diagnostic pop | |||||
#endif |