From 973d2a0ac27d736a376c474ea9c1830beb5d3cdc Mon Sep 17 00:00:00 2001
From: Megvii Engine Team <megengine@megvii.com>
Date: Wed, 13 Jan 2021 16:42:34 +0800
Subject: [PATCH] feat(dnn/cuda): add cutlass matmul using split k parallel

GitOrigin-RevId: 650209e35f813e8eb8373d2ddc1671d3abb1759e
---
 dnn/scripts/Makefile                               |   4 +-
 dnn/src/cuda/matrix_mul/algos.cpp                  |  20 +++
 dnn/src/cuda/matrix_mul/algos.h                    |  27 +++
 .../matrix_mul/cutlass_float32_simt_split_k.cpp    |  76 +++++++++
 .../cuda/matrix_mul/cutlass_matrix_mul_wrapper.cu  | 190 ++++++++++++++++-----
 .../cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh |   6 +-
 .../matrix_mul_fp32_simt_128x128x8_32x64x8_nn.cu   |   4 +-
 ...32_simt_128x128x8_32x64x8_nn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_128x128x8_32x64x8_nt.cu   |   4 +-
 ...32_simt_128x128x8_32x64x8_nt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_128x128x8_32x64x8_tn.cu   |   4 +-
 ...32_simt_128x128x8_32x64x8_tn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_128x128x8_32x64x8_tt.cu   |   4 +-
 ...32_simt_128x128x8_32x64x8_tt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_128x32x8_64x32x8_nn.cu    |   4 +-
 ...p32_simt_128x32x8_64x32x8_nn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_128x32x8_64x32x8_nt.cu    |   4 +-
 ...p32_simt_128x32x8_64x32x8_nt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_128x32x8_64x32x8_tn.cu    |   4 +-
 ...p32_simt_128x32x8_64x32x8_tn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_128x32x8_64x32x8_tt.cu    |   4 +-
 ...p32_simt_128x32x8_64x32x8_tt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_128x64x8_64x32x8_nn.cu    |   4 +-
 ...p32_simt_128x64x8_64x32x8_nn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_128x64x8_64x32x8_nt.cu    |   4 +-
 ...p32_simt_128x64x8_64x32x8_nt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_128x64x8_64x32x8_tn.cu    |   4 +-
 ...p32_simt_128x64x8_64x32x8_tn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_128x64x8_64x32x8_tt.cu    |   4 +-
 ...p32_simt_128x64x8_64x32x8_tt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_16x128x8_16x64x8_nn.cu    |   4 +-
 ...p32_simt_16x128x8_16x64x8_nn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_16x128x8_16x64x8_nt.cu    |   4 +-
 ...p32_simt_16x128x8_16x64x8_nt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_16x128x8_16x64x8_tn.cu    |   4 +-
 ...p32_simt_16x128x8_16x64x8_tn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_16x128x8_16x64x8_tt.cu    |   4 +-
 ...p32_simt_16x128x8_16x64x8_tt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_16x32x8_16x32x8_nn.cu     |   4 +-
 ...fp32_simt_16x32x8_16x32x8_nn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_16x32x8_16x32x8_nt.cu     |   4 +-
 ...fp32_simt_16x32x8_16x32x8_nt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_16x32x8_16x32x8_tn.cu     |   4 +-
 ...fp32_simt_16x32x8_16x32x8_tn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_16x32x8_16x32x8_tt.cu     |   4 +-
 ...fp32_simt_16x32x8_16x32x8_tt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_16x64x8_16x64x8_nn.cu     |   4 +-
 ...fp32_simt_16x64x8_16x64x8_nn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_16x64x8_16x64x8_nt.cu     |   4 +-
 ...fp32_simt_16x64x8_16x64x8_nt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_16x64x8_16x64x8_tn.cu     |   4 +-
 ...fp32_simt_16x64x8_16x64x8_tn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_16x64x8_16x64x8_tt.cu     |   4 +-
 ...fp32_simt_16x64x8_16x64x8_tt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_256x32x8_64x16x8_nn.cu    |   4 +-
 ...p32_simt_256x32x8_64x16x8_nn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_256x32x8_64x16x8_nt.cu    |   4 +-
 ...p32_simt_256x32x8_64x16x8_nt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_256x32x8_64x16x8_tn.cu    |   4 +-
 ...p32_simt_256x32x8_64x16x8_tn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_256x32x8_64x16x8_tt.cu    |   4 +-
 ...p32_simt_256x32x8_64x16x8_tt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_256x64x8_64x32x8_nn.cu    |   4 +-
 ...p32_simt_256x64x8_64x32x8_nn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_256x64x8_64x32x8_nt.cu    |   4 +-
 ...p32_simt_256x64x8_64x32x8_nt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_256x64x8_64x32x8_tn.cu    |   4 +-
 ...p32_simt_256x64x8_64x32x8_tn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_256x64x8_64x32x8_tt.cu    |   4 +-
 ...p32_simt_256x64x8_64x32x8_tt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_32x128x8_32x64x8_nn.cu    |   4 +-
 ...p32_simt_32x128x8_32x64x8_nn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_32x128x8_32x64x8_nt.cu    |   4 +-
 ...p32_simt_32x128x8_32x64x8_nt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_32x128x8_32x64x8_tn.cu    |   4 +-
 ...p32_simt_32x128x8_32x64x8_tn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_32x128x8_32x64x8_tt.cu    |   4 +-
 ...p32_simt_32x128x8_32x64x8_tt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_32x256x8_16x64x8_nn.cu    |   4 +-
 ...p32_simt_32x256x8_16x64x8_nn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_32x256x8_16x64x8_nt.cu    |   4 +-
 ...p32_simt_32x256x8_16x64x8_nt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_32x256x8_16x64x8_tn.cu    |   4 +-
 ...p32_simt_32x256x8_16x64x8_tn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_32x256x8_16x64x8_tt.cu    |   4 +-
 ...p32_simt_32x256x8_16x64x8_tt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_32x32x8_32x32x8_nn.cu     |   4 +-
 ...fp32_simt_32x32x8_32x32x8_nn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_32x32x8_32x32x8_nt.cu     |   4 +-
 ...fp32_simt_32x32x8_32x32x8_nt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_32x32x8_32x32x8_tn.cu     |   4 +-
 ...fp32_simt_32x32x8_32x32x8_tn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_32x32x8_32x32x8_tt.cu     |   4 +-
 ...fp32_simt_32x32x8_32x32x8_tt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_32x64x8_32x64x8_nn.cu     |   4 +-
 ...fp32_simt_32x64x8_32x64x8_nn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_32x64x8_32x64x8_nt.cu     |   4 +-
 ...fp32_simt_32x64x8_32x64x8_nt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_32x64x8_32x64x8_tn.cu     |   4 +-
 ...fp32_simt_32x64x8_32x64x8_tn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_32x64x8_32x64x8_tt.cu     |   4 +-
 ...fp32_simt_32x64x8_32x64x8_tt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_64x128x8_32x64x8_nn.cu    |   4 +-
 ...p32_simt_64x128x8_32x64x8_nn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_64x128x8_32x64x8_nt.cu    |   4 +-
 ...p32_simt_64x128x8_32x64x8_nt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_64x128x8_32x64x8_tn.cu    |   4 +-
 ...p32_simt_64x128x8_32x64x8_tn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_64x128x8_32x64x8_tt.cu    |   4 +-
 ...p32_simt_64x128x8_32x64x8_tt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_64x256x8_32x64x8_nn.cu    |   4 +-
 ...p32_simt_64x256x8_32x64x8_nn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_64x256x8_32x64x8_nt.cu    |   4 +-
 ...p32_simt_64x256x8_32x64x8_nt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_64x256x8_32x64x8_tn.cu    |   4 +-
 ...p32_simt_64x256x8_32x64x8_tn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_64x256x8_32x64x8_tt.cu    |   4 +-
 ...p32_simt_64x256x8_32x64x8_tt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_64x32x8_64x32x8_nn.cu     |   4 +-
 ...fp32_simt_64x32x8_64x32x8_nn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_64x32x8_64x32x8_nt.cu     |   4 +-
 ...fp32_simt_64x32x8_64x32x8_nt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_64x32x8_64x32x8_tn.cu     |   4 +-
 ...fp32_simt_64x32x8_64x32x8_tn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_64x32x8_64x32x8_tt.cu     |   4 +-
 ...fp32_simt_64x32x8_64x32x8_tt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_64x64x8_32x64x8_nn.cu     |   4 +-
 ...fp32_simt_64x64x8_32x64x8_nn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_64x64x8_32x64x8_nt.cu     |   4 +-
 ...fp32_simt_64x64x8_32x64x8_nt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_64x64x8_32x64x8_tn.cu     |   4 +-
 ...fp32_simt_64x64x8_32x64x8_tn_splitk_parallel.cu |  33 ++++
 .../matrix_mul_fp32_simt_64x64x8_32x64x8_tt.cu     |   4 +-
 ...fp32_simt_64x64x8_32x64x8_tt_splitk_parallel.cu |  33 ++++
 .../kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn.cu |   4 +-
 ...l_fp32_simt_8x32x8_8x32x8_nn_splitk_parallel.cu |  33 ++++
 .../kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt.cu |   4 +-
 ...l_fp32_simt_8x32x8_8x32x8_nt_splitk_parallel.cu |  33 ++++
 .../kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn.cu |   4 +-
 ...l_fp32_simt_8x32x8_8x32x8_tn_splitk_parallel.cu |  33 ++++
 .../kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt.cu |   4 +-
 ...l_fp32_simt_8x32x8_8x32x8_tt_splitk_parallel.cu |  33 ++++
 .../matrix_mul_float_simt_cutlass_wrapper.cuinl    |  29 ++--
 dnn/src/cuda/matrix_mul/opr_impl.h                 |   1 +
 dnn/test/common/matrix_mul.cpp                     |  12 ++
 dnn/test/common/matrix_mul.h                       |   1 +
 dnn/test/cuda/cutlass_matmul.cpp                   |  25 ++-
 147 files changed, 2776 insertions(+), 131 deletions(-)
 create mode 100644 dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn_splitk_parallel.cu
 create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt_splitk_parallel.cu

diff --git a/dnn/scripts/Makefile b/dnn/scripts/Makefile
index bc28b4ce..b093b4ae 100644
--- a/dnn/scripts/Makefile
+++ b/dnn/scripts/Makefile
@@ -9,9 +9,9 @@ ELEMWISE_IMPL := ../src/cuda/cond_take/kimpl \
 	../src/cuda/elemwise_multi_type/kimpl
 
 CUDA_CONV_IMPL := ../src/cuda/conv_bias/int8/kimpl ../src/cuda/conv_bias/int8_imma/kimpl ../src/cuda/batch_conv_bias/int8/kimpl 
-CUDA_MATMUL_KIMPL := ../src/cuda/matrix_mul/fp32_simt/kimpl
+CUDA_MATMUL_IMPL := ../src/cuda/matrix_mul/fp32_simt/kimpl
 
-all: ${PARAM_DEFS} ${ELEMWISE_IMPL} ${CUDA_CONV_IMPL} $(CUDA_MATMUL_KIMPL)
+all: ${PARAM_DEFS} ${ELEMWISE_IMPL} ${CUDA_CONV_IMPL} $(CUDA_MATMUL_IMPL)
 
 ../src/common/elemwise/each_mode.inl: gen_elemwise_each_mode.py
 	./$^ $@
diff --git a/dnn/src/cuda/matrix_mul/algos.cpp b/dnn/src/cuda/matrix_mul/algos.cpp
index d2c44734..fa190980 100644
--- a/dnn/src/cuda/matrix_mul/algos.cpp
+++ b/dnn/src/cuda/matrix_mul/algos.cpp
@@ -37,6 +37,9 @@ MatrixMulForwardImpl::AlgoPack::AlgoPack() {
     for (auto&& algo : simt_float32) {
         all_algos.push_back(&algo);
     }
+    for (auto&& algo : simt_float32_split_k) {
+        all_algos.push_back(&algo);
+    }
 
     for (auto&& algo : all_algos) {
         m_all_algos_map.emplace(algo->info().desc, algo);
@@ -62,6 +65,23 @@ void MatrixMulForwardImpl::AlgoPack::fill_cutlass_algos() {
     simt_float32.emplace_back(AlgoParam{16, 32, 8, 16, 32, 8});
     simt_float32.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8});
     simt_float32.emplace_back(AlgoParam{16, 128, 8, 16, 64, 8});
+    simt_float32_split_k.emplace_back(AlgoParam{64, 256, 8, 32, 64, 8});
+    simt_float32_split_k.emplace_back(AlgoParam{256, 64, 8, 64, 32, 8});
+    simt_float32_split_k.emplace_back(AlgoParam{32, 256, 8, 16, 64, 8});
+    simt_float32_split_k.emplace_back(AlgoParam{256, 32, 8, 64, 16, 8});
+    simt_float32_split_k.emplace_back(AlgoParam{128, 128, 8, 32, 64, 8});
+    simt_float32_split_k.emplace_back(AlgoParam{128, 64, 8, 64, 32, 8});
+    simt_float32_split_k.emplace_back(AlgoParam{64, 128, 8, 32, 64, 8});
+    simt_float32_split_k.emplace_back(AlgoParam{128, 32, 8, 64, 32, 8});
+    simt_float32_split_k.emplace_back(AlgoParam{32, 128, 8, 32, 64, 8});
+    simt_float32_split_k.emplace_back(AlgoParam{64, 64, 8, 32, 64, 8});
+    simt_float32_split_k.emplace_back(AlgoParam{32, 64, 8, 32, 64, 8});
+    simt_float32_split_k.emplace_back(AlgoParam{64, 32, 8, 64, 32, 8});
+    simt_float32_split_k.emplace_back(AlgoParam{32, 32, 8, 32, 32, 8});
+    simt_float32_split_k.emplace_back(AlgoParam{8, 32, 8, 8, 32, 8});
+    simt_float32_split_k.emplace_back(AlgoParam{16, 32, 8, 16, 32, 8});
+    simt_float32_split_k.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8});
+    simt_float32_split_k.emplace_back(AlgoParam{16, 128, 8, 16, 64, 8});
 }
 
 MatrixMulForwardImpl::AlgoPack MatrixMulForwardImpl::sm_algo_pack;
diff --git a/dnn/src/cuda/matrix_mul/algos.h b/dnn/src/cuda/matrix_mul/algos.h
index 65429370..d647c661 100644
--- a/dnn/src/cuda/matrix_mul/algos.h
+++ b/dnn/src/cuda/matrix_mul/algos.h
@@ -43,6 +43,7 @@ public:
         CUDA_NAIVE,
         CUDA_BFLOAT16, 
         CUDA_FLOAT32_SIMT, 
+        CUDA_FLOAT32_SIMT_SPLIT_K, 
     };
     using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;
 
@@ -198,6 +199,31 @@ private:
     std::string m_name;
 };
 
+class MatrixMulForwardImpl::AlgoFloat32SIMTSplitK final : public AlgoBase {
+public:
+    using AlgoParam = MatrixMulForwardImpl::AlgoFloat32SIMT::AlgoParam;
+    AlgoFloat32SIMTSplitK(AlgoParam algo_param)  // one tile config per object
+            : m_algo_param{algo_param},
+              m_name{ssprintf("CUTLASS_FLOAT32_SIMT_SPLIT_K_%s",
+                              m_algo_param.to_string().c_str())} {}
+    bool is_available(const SizeArgs& args) const override;
+    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
+    const char* name() const override { return m_name.c_str(); }
+    void exec(const ExecArgs& args) const override;
+    bool is_reproducible() const override { return true; }
+    MEGDNN_DECL_ALGO_TYPE(CUDA_FLOAT32_SIMT_SPLIT_K)
+
+    std::string param() const override {  // m_algo_param, POD-serialized
+        std::string ret;
+        serialize_write_pod(m_algo_param, ret);
+        return ret;
+    }
+
+private:
+    AlgoParam m_algo_param;  // tile shape; must precede m_name (init order)
+    std::string m_name;      // "CUTLASS_FLOAT32_SIMT_SPLIT_K_" + tile string
+};
+
 class MatrixMulForwardImpl::AlgoPack : NonCopyableObj {
 private:
     AlgoBase::Mapper m_all_algos_map;
@@ -216,6 +242,7 @@ public:
     AlgoBFloat16 bfloat16;
 #endif
     std::vector<AlgoFloat32SIMT> simt_float32;
+    std::vector<AlgoFloat32SIMTSplitK> simt_float32_split_k;
     std::vector<AlgoBase*> all_algos;
 
     const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; }
diff --git a/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp b/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp
new file mode 100644
index 00000000..50ccb67d
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp
@@ -0,0 +1,76 @@
+/**
+ * \file dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ */
+
+#include "src/cuda/handle.h"
+#include "src/cuda/matrix_mul/algos.h"
+#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh"
+#include "src/cuda/utils.h"
+
+using namespace megdnn;
+using namespace cuda;
+using namespace cutlass_wrapper;
+
+// Available for DEFAULT-format fp32 matmul whose k exceeds both m and n.
+bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available(
+        const SizeArgs& args) const {
+    auto&& param = args.opr->param();
+    const auto &a = args.layout_a, &b = args.layout_b, &c = args.layout_c;
+    int m = c.shape[0], n = c.shape[1], k = a.shape[param.transposeA ? 0 : 1];
+    return param.format == param::MatrixMul::Format::DEFAULT &&
+           a.dtype == dtype::Float32() && b.dtype == dtype::Float32() &&
+           c.dtype == dtype::Float32() && k > std::max(m, n);
+}
+
+// Workspace bytes the CUTLASS wrapper reports for this tile config. The slice
+// count k / max(m, n) is clamped to >= 1, so m == n == 0 cannot divide by 0.
+size_t MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::get_workspace_in_bytes(
+        const SizeArgs& args) const {
+    auto&& param = args.opr->param();
+    int m = args.layout_c.shape[0], n = args.layout_c.shape[1],
+        k = args.layout_a.shape[param.transposeA ? 0 : 1];
+    int split_k_slices = std::max(1, k / std::max(1, std::max(m, n)));
+    return cutlass_matrix_mul_float32_simt_get_workspace_size(
+            param.transposeA, args.layout_a.stride[0], param.transposeB,
+            args.layout_b.stride[0], args.layout_c.stride[0],
+            GemmCoord{m, n, k}, 1.f, 0.f,
+            GemmCoord{m_algo_param.threadblock_m, m_algo_param.threadblock_n,
+                      m_algo_param.threadblock_k},
+            GemmCoord{m_algo_param.warp_m, m_algo_param.warp_n,
+                      m_algo_param.warp_k},
+            split_k_slices);
+}
+
+// Launches the CUTLASS fp32 SIMT matmul wrapper (alpha = 1, beta = 0) on the
+// handle's stream. Uses the same k / max(m, n) slice rule as the workspace
+// query, here clamped to >= 1 so m == n == 0 cannot divide by zero.
+void MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::exec(
+        const ExecArgs& args) const {
+    auto&& param = args.opr->param();
+    const auto &la = args.tensor_a.layout, &lb = args.tensor_b.layout,
+               &lc = args.tensor_c.layout;
+    int m = lc.shape[0], n = lc.shape[1],
+        k = la.shape[param.transposeA ? 0 : 1];
+    int split_k_slices = std::max(1, k / std::max(1, std::max(m, n)));
+    return cutlass_matrix_mul_float32_simt(
+            args.tensor_a.ptr<dt_float32>(), param.transposeA, la.stride[0],
+            args.tensor_b.ptr<dt_float32>(), param.transposeB, lb.stride[0],
+            args.tensor_c.ptr<dt_float32>(), lc.stride[0],
+            reinterpret_cast<int*>(args.workspace.raw_ptr), GemmCoord{m, n, k},
+            1.f, 0.f,
+            GemmCoord{m_algo_param.threadblock_m, m_algo_param.threadblock_n,
+                      m_algo_param.threadblock_k},
+            GemmCoord{m_algo_param.warp_m, m_algo_param.warp_n,
+                      m_algo_param.warp_k},
+            cuda_stream(args.opr->handle()), split_k_slices);
+}
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cu b/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cu
index 6f6e079c..4907b4fa 100644
--- a/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cu
+++ b/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cu
@@ -18,6 +18,7 @@
 #if __CUDACC_VER_MAJOR__ > 9 || \
         (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
 #include "cutlass/gemm/device/gemm.h"
+#include "cutlass/gemm/device/gemm_splitk_parallel.h"
 #endif
 #include "src/common/opr_param_defs_enumv.cuh"
 #include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh"
@@ -62,14 +63,20 @@ void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_float32_simt(
         float* /* d_C */, size_t /* ldc */, int* /* workspace */,
         GemmCoord const& /* problem_size */, float /* alpha */,
         float /* beta */, const GemmCoord& /* threadblock_shape */,
-        const GemmCoord& /* warp_shape */, cudaStream_t /* stream */) {}
+        const GemmCoord& /* warp_shape */, cudaStream_t /* stream */,
+        int /* split_k_slices */) {}
 #else
 void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_float32_simt(
         const float* d_A, bool transpose_A, size_t lda, const float* d_B,
         bool transpose_B, size_t ldb, float* d_C, size_t ldc, int* workspace,
         GemmCoord const& problem_size, float alpha, float beta,
         const GemmCoord& threadblock_shape, const GemmCoord& warp_shape,
-        cudaStream_t stream) {
+        cudaStream_t stream, int split_k_slices) {
+    static constexpr int kEpilogueElementsPerAccess = 1;
+    using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
+            float, kEpilogueElementsPerAccess, float, float>;
+    typename EpilogueOp::Params epilogue{alpha, beta};
+    if (split_k_slices == 1) {
 #define cb(threadblock_m_, threadblock_n_, threadblock_k_, warp_m_, warp_n_,   \
            warp_k_)                                                            \
     if (threadblock_shape.m() == threadblock_m_ &&                             \
@@ -93,29 +100,67 @@ void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_float32_simt(
                                                 workspace, problem_size,       \
                                                 epilogue, stream);             \
     }
-    static constexpr int kEpilogueElementsPerAccess = 1;
-    using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
-            float, kEpilogueElementsPerAccess, float, float>;
-    typename EpilogueOp::Params epilogue{alpha, beta};
-    if (!transpose_A && !transpose_B) {
-        using LayoutA = cutlass::layout::RowMajor;
-        using LayoutB = cutlass::layout::RowMajor;
-        DISPATCH(cb)
-    } else if (!transpose_A && transpose_B) {
-        using LayoutA = cutlass::layout::RowMajor;
-        using LayoutB = cutlass::layout::ColumnMajor;
-        DISPATCH(cb)
-    } else if (transpose_A && !transpose_B) {
-        using LayoutA = cutlass::layout::ColumnMajor;
-        using LayoutB = cutlass::layout::RowMajor;
-        DISPATCH(cb)
+        if (!transpose_A && !transpose_B) {
+            using LayoutA = cutlass::layout::RowMajor;
+            using LayoutB = cutlass::layout::RowMajor;
+            DISPATCH(cb)
+        } else if (!transpose_A && transpose_B) {
+            using LayoutA = cutlass::layout::RowMajor;
+            using LayoutB = cutlass::layout::ColumnMajor;
+            DISPATCH(cb)
+        } else if (transpose_A && !transpose_B) {
+            using LayoutA = cutlass::layout::ColumnMajor;
+            using LayoutB = cutlass::layout::RowMajor;
+            DISPATCH(cb)
+        } else {
+            megdnn_assert(transpose_A && transpose_B);
+            using LayoutA = cutlass::layout::ColumnMajor;
+            using LayoutB = cutlass::layout::ColumnMajor;
+            DISPATCH(cb)
+        }
+#undef cb
     } else {
-        megdnn_assert(transpose_A && transpose_B);
-        using LayoutA = cutlass::layout::ColumnMajor;
-        using LayoutB = cutlass::layout::ColumnMajor;
-        DISPATCH(cb)
+#define cb(threadblock_m_, threadblock_n_, threadblock_k_, warp_m_, warp_n_,   \
+           warp_k_)                                                            \
+    if (threadblock_shape.m() == threadblock_m_ &&                             \
+        threadblock_shape.n() == threadblock_n_ &&                             \
+        threadblock_shape.k() == threadblock_k_ &&                             \
+        warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ &&              \
+        warp_shape.k() == warp_k_) {                                           \
+        using ThreadBlockShape =                                               \
+                cutlass::gemm::GemmShape<threadblock_m_, threadblock_n_,       \
+                                         threadblock_k_>;                      \
+        using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \
+        using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;            \
+        using Gemm = cutlass::gemm::device::GemmSplitKParallel<                \
+                float, LayoutA, float, LayoutB, float,                         \
+                cutlass::layout::RowMajor, float, cutlass::arch::OpClassSimt,  \
+                cutlass::arch::Sm50, ThreadBlockShape, WarpShape,              \
+                InstructionShape, EpilogueOp>;                                 \
+        return cutlass_matrix_mul_wrapper<Gemm>(                               \
+                d_A, lda, d_B, ldb, d_C, ldc, workspace, problem_size,         \
+                epilogue, stream, split_k_slices);                             \
     }
+        if (!transpose_A && !transpose_B) {
+            using LayoutA = cutlass::layout::RowMajor;
+            using LayoutB = cutlass::layout::RowMajor;
+            DISPATCH(cb)
+        } else if (!transpose_A && transpose_B) {
+            using LayoutA = cutlass::layout::RowMajor;
+            using LayoutB = cutlass::layout::ColumnMajor;
+            DISPATCH(cb)
+        } else if (transpose_A && !transpose_B) {
+            using LayoutA = cutlass::layout::ColumnMajor;
+            using LayoutB = cutlass::layout::RowMajor;
+            DISPATCH(cb)
+        } else {
+            megdnn_assert(transpose_A && transpose_B);
+            using LayoutA = cutlass::layout::ColumnMajor;
+            using LayoutB = cutlass::layout::ColumnMajor;
+            DISPATCH(cb)
+        }
 #undef cb
+    }
 }
 #endif
 
@@ -127,7 +172,7 @@ size_t megdnn::cuda::cutlass_wrapper::
                 bool /* transpose_B */, size_t /* ldb */, size_t /* ldc */,
                 GemmCoord const& /* problem_size */, float /* alpha */,
                 float /* beta */, const GemmCoord& /* threadblock_shape */,
-                const GemmCoord& /* warp_shape */) {
+                const GemmCoord& /* warp_shape */, int /* split_k_slices */) {
     return 0;
 }
 #else
@@ -136,7 +181,12 @@ size_t megdnn::cuda::cutlass_wrapper::
                 bool transpose_A, size_t lda, bool transpose_B, size_t ldb,
                 size_t ldc, GemmCoord const& problem_size, float alpha,
                 float beta, const GemmCoord& threadblock_shape,
-                const GemmCoord& warp_shape) {
+                const GemmCoord& warp_shape, int split_k_slices) {
+    static constexpr int kEpilogueElementsPerAccess = 1;
+    using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
+            float, kEpilogueElementsPerAccess, float, float>;
+    typename EpilogueOp::Params epilogue{alpha, beta};
+    if (split_k_slices == 1) {
 #define cb(threadblock_m_, threadblock_n_, threadblock_k_, warp_m_, warp_n_,   \
            warp_k_)                                                            \
     if (threadblock_shape.m() == threadblock_m_ &&                             \
@@ -169,30 +219,80 @@ size_t megdnn::cuda::cutlass_wrapper::
                                            split_k_slices};                    \
         return Gemm::get_workspace_size(arguments);                            \
     }
-    static constexpr int kEpilogueElementsPerAccess = 1;
-    static constexpr int split_k_slices = 1;
-    using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
-            float, kEpilogueElementsPerAccess, float, float>;
-    typename EpilogueOp::Params epilogue{alpha, beta};
-    if (!transpose_A && !transpose_B) {
-        using LayoutA = cutlass::layout::RowMajor;
-        using LayoutB = cutlass::layout::RowMajor;
-        DISPATCH(cb)
-    } else if (!transpose_A && transpose_B) {
-        using LayoutA = cutlass::layout::RowMajor;
-        using LayoutB = cutlass::layout::ColumnMajor;
-        DISPATCH(cb)
-    } else if (transpose_A && !transpose_B) {
-        using LayoutA = cutlass::layout::ColumnMajor;
-        using LayoutB = cutlass::layout::RowMajor;
-        DISPATCH(cb)
+        if (!transpose_A && !transpose_B) {
+            using LayoutA = cutlass::layout::RowMajor;
+            using LayoutB = cutlass::layout::RowMajor;
+            DISPATCH(cb)
+        } else if (!transpose_A && transpose_B) {
+            using LayoutA = cutlass::layout::RowMajor;
+            using LayoutB = cutlass::layout::ColumnMajor;
+            DISPATCH(cb)
+        } else if (transpose_A && !transpose_B) {
+            using LayoutA = cutlass::layout::ColumnMajor;
+            using LayoutB = cutlass::layout::RowMajor;
+            DISPATCH(cb)
+        } else {
+            megdnn_assert(transpose_A && transpose_B);
+            using LayoutA = cutlass::layout::ColumnMajor;
+            using LayoutB = cutlass::layout::ColumnMajor;
+            DISPATCH(cb)
+        }
+#undef cb
     } else {
-        megdnn_assert(transpose_A && transpose_B);
-        using LayoutA = cutlass::layout::ColumnMajor;
-        using LayoutB = cutlass::layout::ColumnMajor;
-        DISPATCH(cb)
+#define cb(threadblock_m_, threadblock_n_, threadblock_k_, warp_m_, warp_n_,   \
+           warp_k_)                                                            \
+    if (threadblock_shape.m() == threadblock_m_ &&                             \
+        threadblock_shape.n() == threadblock_n_ &&                             \
+        threadblock_shape.k() == threadblock_k_ &&                             \
+        warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ &&              \
+        warp_shape.k() == warp_k_) {                                           \
+        using ThreadBlockShape =                                               \
+                cutlass::gemm::GemmShape<threadblock_m_, threadblock_n_,       \
+                                         threadblock_k_>;                      \
+        using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \
+        using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;            \
+        using Gemm = cutlass::gemm::device::GemmSplitKParallel<                \
+                float, LayoutA, float, LayoutB, float,                         \
+                cutlass::layout::RowMajor, float, cutlass::arch::OpClassSimt,  \
+                cutlass::arch::Sm50, ThreadBlockShape, WarpShape,              \
+                InstructionShape, EpilogueOp>;                                 \
+        using TensorRefA = cutlass::TensorRef<typename Gemm::ElementA const,   \
+                                              typename Gemm::LayoutA>;         \
+        using TensorRefB = cutlass::TensorRef<typename Gemm::ElementB const,   \
+                                              typename Gemm::LayoutB>;         \
+        using TensorRefC = cutlass::TensorRef<typename Gemm::ElementC const,   \
+                                              typename Gemm::LayoutC>;         \
+        using TensorRefD = cutlass::TensorRef<typename Gemm::ElementC,         \
+                                              typename Gemm::LayoutC>;         \
+        TensorRefA tensor_A{nullptr, Gemm::LayoutA{static_cast<int>(lda)}};    \
+        TensorRefB tensor_B{nullptr, Gemm::LayoutB{static_cast<int>(ldb)}};    \
+        TensorRefC tensor_C{nullptr, Gemm::LayoutC{static_cast<int>(ldc)}};    \
+        TensorRefD tensor_D{nullptr, Gemm::LayoutC{static_cast<int>(ldc)}};    \
+        typename Gemm::Arguments arguments{problem_size,  tensor_A, tensor_B,  \
+                                           tensor_C,      tensor_D, epilogue,  \
+                                           split_k_slices};                    \
+        return Gemm::get_workspace_size(arguments);                            \
     }
+        if (!transpose_A && !transpose_B) {
+            using LayoutA = cutlass::layout::RowMajor;
+            using LayoutB = cutlass::layout::RowMajor;
+            DISPATCH(cb)
+        } else if (!transpose_A && transpose_B) {
+            using LayoutA = cutlass::layout::RowMajor;
+            using LayoutB = cutlass::layout::ColumnMajor;
+            DISPATCH(cb)
+        } else if (transpose_A && !transpose_B) {
+            using LayoutA = cutlass::layout::ColumnMajor;
+            using LayoutB = cutlass::layout::RowMajor;
+            DISPATCH(cb)
+        } else {
+            megdnn_assert(transpose_A && transpose_B);
+            using LayoutA = cutlass::layout::ColumnMajor;
+            using LayoutB = cutlass::layout::ColumnMajor;
+            DISPATCH(cb)
+        }
 #undef cb
+    }
 }
 #endif
 
diff --git a/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh b/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh
index 3446842c..1947f773 100644
--- a/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh
+++ b/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh
@@ -26,19 +26,19 @@ void cutlass_matrix_mul_wrapper(
         typename Gemm::ElementC* d_C, size_t ldc, int* workspace,
         GemmCoord const& problem_size,
         typename Gemm::EpilogueOutputOp::Params const& epilogue,
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices = 1);
 
 void cutlass_matrix_mul_float32_simt(
         const float* d_A, bool transpose_A, size_t lda, const float* d_B,
         bool transpose_B, size_t ldb, float* d_C, size_t ldc, int* workspace,
         GemmCoord const& problem_size, float alpha, float beta,
         const GemmCoord& threadblock_shape, const GemmCoord& warp_shape,
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices = 1);
 
 size_t cutlass_matrix_mul_float32_simt_get_workspace_size(
         bool transpose_A, size_t lda, bool transpose_B, size_t ldb, size_t ldc,
         GemmCoord const& problem_size, float alpha, float beta,
-        const GemmCoord& threadblock_shape, const GemmCoord& warp_shape);
+        const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, int split_k_slices = 1);
 
 }  // namespace cutlass_wrapper
 }  // namespace cuda
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn.cu
index bdf8a6e3..38284233 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn_splitk_parallel.cu
new file mode 100644
index 00000000..a4b4e0c4
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;
+using LayoutB = cutlass::layout::RowMajor;
+using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt.cu
index 842f673e..d69e3359 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt_splitk_parallel.cu
new file mode 100644
index 00000000..e78d64a9
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;
+using LayoutB = cutlass::layout::ColumnMajor;
+using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn.cu
index 6b4ec037..0fe5a161 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn_splitk_parallel.cu
new file mode 100644
index 00000000..964ef525
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;
+using LayoutB = cutlass::layout::RowMajor;
+using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt.cu
index a7b3fbea..374a8d73 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt_splitk_parallel.cu
new file mode 100644
index 00000000..d5795b04
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;
+using LayoutB = cutlass::layout::ColumnMajor;
+using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn.cu
index d4cfccda..87d80fd4 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn_splitk_parallel.cu
new file mode 100644
index 00000000..74b75ebd
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;
+using LayoutB = cutlass::layout::RowMajor;
+using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt.cu
index e4e097cc..230297e8 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt_splitk_parallel.cu
new file mode 100644
index 00000000..30dd6ad0
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;
+using LayoutB = cutlass::layout::ColumnMajor;
+using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn.cu
index 35c35bb9..04f80d33 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn_splitk_parallel.cu
new file mode 100644
index 00000000..d702c06b
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;
+using LayoutB = cutlass::layout::RowMajor;
+using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt.cu
index 9df59543..70fd338f 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt_splitk_parallel.cu
new file mode 100644
index 00000000..a54b66d8
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// The pragmas below silence four GCC warnings for the rest of this file (the cutlass wrapper include and the instantiation); the pop at the end restores them. The enclosing #if limits this file to nvcc 9.2 or newer.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;  // layout used for operand A
+using LayoutB = cutlass::layout::ColumnMajor;  // layout used for operand B
+using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // split-K parallel GEMM: float operands, SIMT op class, Sm50 arch tag
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation of the wrapper template for this Gemm type
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn.cu
index 474114d8..bed908a1 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn_splitk_parallel.cu
new file mode 100644
index 00000000..91c76b87
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// The pragmas below silence four GCC warnings for the rest of this file (the cutlass wrapper include and the instantiation); the pop at the end restores them. The enclosing #if limits this file to nvcc 9.2 or newer.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;  // layout used for operand A
+using LayoutB = cutlass::layout::RowMajor;  // layout used for operand B
+using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // split-K parallel GEMM: float operands, SIMT op class, Sm50 arch tag
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation of the wrapper template for this Gemm type
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt.cu
index 9b39434a..a063706f 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt_splitk_parallel.cu
new file mode 100644
index 00000000..161b9e55
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// The pragmas below silence four GCC warnings for the rest of this file (the cutlass wrapper include and the instantiation); the pop at the end restores them. The enclosing #if limits this file to nvcc 9.2 or newer.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;  // layout used for operand A
+using LayoutB = cutlass::layout::ColumnMajor;  // layout used for operand B
+using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // split-K parallel GEMM: float operands, SIMT op class, Sm50 arch tag
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation of the wrapper template for this Gemm type
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn.cu
index b55f2e2f..6eae3c18 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn_splitk_parallel.cu
new file mode 100644
index 00000000..c8968eb3
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// The pragmas below silence four GCC warnings for the rest of this file (the cutlass wrapper include and the instantiation); the pop at the end restores them. The enclosing #if limits this file to nvcc 9.2 or newer.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;  // layout used for operand A
+using LayoutB = cutlass::layout::RowMajor;  // layout used for operand B
+using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // split-K parallel GEMM: float operands, SIMT op class, Sm50 arch tag
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation of the wrapper template for this Gemm type
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt.cu
index 95d263d6..3107bc36 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt_splitk_parallel.cu
new file mode 100644
index 00000000..22f13797
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// The pragmas below silence four GCC warnings for the rest of this file (the cutlass wrapper include and the instantiation); the pop at the end restores them. The enclosing #if limits this file to nvcc 9.2 or newer.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;  // layout used for operand A
+using LayoutB = cutlass::layout::ColumnMajor;  // layout used for operand B
+using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // split-K parallel GEMM: float operands, SIMT op class, Sm50 arch tag
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation of the wrapper template for this Gemm type
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn.cu
index c3a64d63..ce92e149 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn_splitk_parallel.cu
new file mode 100644
index 00000000..5c51f781
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// The pragmas below silence four GCC warnings for the rest of this file (the cutlass wrapper include and the instantiation); the pop at the end restores them. The enclosing #if limits this file to nvcc 9.2 or newer.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;  // layout used for operand A
+using LayoutB = cutlass::layout::RowMajor;  // layout used for operand B
+using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // split-K parallel GEMM: float operands, SIMT op class, Sm50 arch tag
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation of the wrapper template for this Gemm type
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt.cu
index 3f151dce..50f5e49f 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt_splitk_parallel.cu
new file mode 100644
index 00000000..9ccf1190
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// The pragmas below silence four GCC warnings for the rest of this file (the cutlass wrapper include and the instantiation); the pop at the end restores them. The enclosing #if limits this file to nvcc 9.2 or newer.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;  // layout used for operand A
+using LayoutB = cutlass::layout::ColumnMajor;  // layout used for operand B
+using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // split-K parallel GEMM: float operands, SIMT op class, Sm50 arch tag
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation of the wrapper template for this Gemm type
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn.cu
index 526dfd3e..28b32c91 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn_splitk_parallel.cu
new file mode 100644
index 00000000..e25e44e3
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// The pragmas below silence four GCC warnings for the rest of this file (the cutlass wrapper include and the instantiation); the pop at the end restores them. The enclosing #if limits this file to nvcc 9.2 or newer.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;  // layout used for operand A
+using LayoutB = cutlass::layout::RowMajor;  // layout used for operand B
+using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // split-K parallel GEMM: float operands, SIMT op class, Sm50 arch tag
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation of the wrapper template for this Gemm type
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt.cu
index 5bc0ed7a..4e1a9f6c 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt_splitk_parallel.cu
new file mode 100644
index 00000000..f7f1fb69
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// The pragmas below silence four GCC warnings for the rest of this file (the cutlass wrapper include and the instantiation); the pop at the end restores them. The enclosing #if limits this file to nvcc 9.2 or newer.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;  // layout used for operand A
+using LayoutB = cutlass::layout::ColumnMajor;  // layout used for operand B
+using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // split-K parallel GEMM: float operands, SIMT op class, Sm50 arch tag
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation of the wrapper template for this Gemm type
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn.cu
index 816cff19..225cdf3b 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn_splitk_parallel.cu
new file mode 100644
index 00000000..0050f669
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// The pragmas below silence four GCC warnings for the rest of this file (the cutlass wrapper include and the instantiation); the pop at the end restores them. The enclosing #if limits this file to nvcc 9.2 or newer.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;  // layout used for operand A
+using LayoutB = cutlass::layout::RowMajor;  // layout used for operand B
+using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // split-K parallel GEMM: float operands, SIMT op class, Sm50 arch tag
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation of the wrapper template for this Gemm type
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt.cu
index e463f557..91c830c6 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt_splitk_parallel.cu
new file mode 100644
index 00000000..c2fe7bcb
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// Suppress the GCC diagnostics listed below (warnings attributed to cutlass) until the matching `diagnostic pop`.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;  // A: row-major
+using LayoutB = cutlass::layout::ColumnMajor;  // B: column-major
+using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // split-K parallel GEMM type built from the aliases above
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation for the Gemm alias above
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // nvcc version is 9.2 or newer
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn.cu
index db1b8796..e3ba197f 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn_splitk_parallel.cu
new file mode 100644
index 00000000..fff368ea
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// Suppress the GCC diagnostics listed below (warnings attributed to cutlass) until the matching `diagnostic pop`.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;  // A: column-major
+using LayoutB = cutlass::layout::RowMajor;  // B: row-major
+using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // split-K parallel GEMM type built from the aliases above
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation for the Gemm alias above
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // nvcc version is 9.2 or newer
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt.cu
index b31e1e82..9e41f582 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt_splitk_parallel.cu
new file mode 100644
index 00000000..511cd557
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// Suppress the GCC diagnostics listed below (warnings attributed to cutlass) until the matching `diagnostic pop`.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;  // A: column-major
+using LayoutB = cutlass::layout::ColumnMajor;  // B: column-major
+using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // split-K parallel GEMM type built from the aliases above
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation for the Gemm alias above
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // nvcc version is 9.2 or newer
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn.cu
index 32f6726b..49de5607 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn_splitk_parallel.cu
new file mode 100644
index 00000000..07296250
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// Suppress the GCC diagnostics listed below (warnings attributed to cutlass) until the matching `diagnostic pop`.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;  // A: row-major
+using LayoutB = cutlass::layout::RowMajor;  // B: row-major
+using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // split-K parallel GEMM type built from the aliases above
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation for the Gemm alias above
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // nvcc version is 9.2 or newer
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt.cu
index 34d88333..872b8ded 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt_splitk_parallel.cu
new file mode 100644
index 00000000..c7774d64
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// Suppress the GCC diagnostics listed below (warnings attributed to cutlass) until the matching `diagnostic pop`.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;  // A: row-major
+using LayoutB = cutlass::layout::ColumnMajor;  // B: column-major
+using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // split-K parallel GEMM type built from the aliases above
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation for the Gemm alias above
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // nvcc version is 9.2 or newer
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn.cu
index ba1e14b1..a6178562 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn_splitk_parallel.cu
new file mode 100644
index 00000000..fccd72b0
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// Suppress the GCC diagnostics listed below (warnings attributed to cutlass) until the matching `diagnostic pop`.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;  // A: column-major
+using LayoutB = cutlass::layout::RowMajor;  // B: row-major
+using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // split-K parallel GEMM type built from the aliases above
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation for the Gemm alias above
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // nvcc version is 9.2 or newer
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt.cu
index dc13d44b..e5c3e2d2 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt_splitk_parallel.cu
new file mode 100644
index 00000000..b2fa6309
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// Suppress the GCC diagnostics listed below (warnings attributed to cutlass) until the matching `diagnostic pop`.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;  // A: column-major
+using LayoutB = cutlass::layout::ColumnMajor;  // B: column-major
+using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // split-K parallel GEMM type built from the aliases above
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation for the Gemm alias above
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // nvcc version is 9.2 or newer
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn.cu
index fcd6e1a9..a85bae3d 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn_splitk_parallel.cu
new file mode 100644
index 00000000..cb855f55
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// Suppress the GCC diagnostics listed below (warnings attributed to cutlass) until the matching `diagnostic pop`.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;  // A: row-major
+using LayoutB = cutlass::layout::RowMajor;  // B: row-major
+using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // split-K parallel GEMM type built from the aliases above
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation for the Gemm alias above
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // nvcc version is 9.2 or newer
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt.cu
index d88a6bd2..4d8cddb5 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt_splitk_parallel.cu
new file mode 100644
index 00000000..ca5408f4
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// Suppress the GCC diagnostics listed below (warnings attributed to cutlass) until the matching `diagnostic pop`.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;  // A: row-major
+using LayoutB = cutlass::layout::ColumnMajor;  // B: column-major
+using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // split-K parallel GEMM type built from the aliases above
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation for the Gemm alias above
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // nvcc version is 9.2 or newer
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn.cu
index 6878509e..7880c3cb 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn_splitk_parallel.cu
new file mode 100644
index 00000000..6a77f8c7
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// Suppress the GCC diagnostics listed below (warnings attributed to cutlass) until the matching `diagnostic pop`.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;  // A: column-major
+using LayoutB = cutlass::layout::RowMajor;  // B: row-major
+using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // split-K parallel GEMM type built from the aliases above
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation for the Gemm alias above
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // nvcc version is 9.2 or newer
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt.cu
index 121ed9e0..6e396c45 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt_splitk_parallel.cu
new file mode 100644
index 00000000..a3a9ba6c
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py: explicitly instantiates cutlass_matrix_mul_wrapper for one GemmSplitKParallel configuration
+// silence GCC diagnostics triggered by cutlass; the matching "diagnostic pop" near the end of this file restores them
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;  // layout of operand A (first letter of the "tt" file suffix)
+using LayoutB = cutlass::layout::ColumnMajor;  // layout of operand B (second letter of the "tt" file suffix)
+using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>;  // threadblock-level tile shape, 256x32x8
+using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;  // warp-level tile shape, 64x16x8
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;  // instruction-level shape, 1x1x1
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;  // float linear-combination epilogue
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // device-level split-k-parallel GEMM type
+    float, LayoutA,  // A: element type and layout
+    float, LayoutB,  // B: element type and layout
+    float, cutlass::layout::RowMajor, float,  // C: float, row-major; trailing float is presumably the accumulator type
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50,  // SIMT operator class, Sm50 architecture tag
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation for the Gemm alias above
+        const typename Gemm::ElementA* d_A, size_t lda,  // A pointer and lda (presumably its leading dimension)
+        const typename Gemm::ElementB* d_B, size_t ldb,  // B pointer and ldb (presumably its leading dimension)
+        typename Gemm::ElementC* d_C, size_t ldc,  // C pointer (non-const) and ldc (presumably its leading dimension)
+        int* workspace,  // workspace pointer; its use is defined in the included wrapper, not here
+        cutlass::gemm::GemmCoord const& problem_size,  // problem size passed as a GemmCoord
+        typename Gemm::EpilogueOutputOp::Params const& epilogue,  // parameters for the epilogue op of this Gemm
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // CUDA compiler version >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn.cu
index 33ddd12c..7e4b278b 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn_splitk_parallel.cu
new file mode 100644
index 00000000..05437d7d
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py: explicitly instantiates cutlass_matrix_mul_wrapper for one GemmSplitKParallel configuration
+// silence GCC diagnostics triggered by cutlass; the matching "diagnostic pop" near the end of this file restores them
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;  // layout of operand A (first letter of the "nn" file suffix)
+using LayoutB = cutlass::layout::RowMajor;  // layout of operand B (second letter of the "nn" file suffix)
+using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>;  // threadblock-level tile shape, 256x64x8
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;  // warp-level tile shape, 64x32x8
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;  // instruction-level shape, 1x1x1
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;  // float linear-combination epilogue
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // device-level split-k-parallel GEMM type
+    float, LayoutA,  // A: element type and layout
+    float, LayoutB,  // B: element type and layout
+    float, cutlass::layout::RowMajor, float,  // C: float, row-major; trailing float is presumably the accumulator type
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50,  // SIMT operator class, Sm50 architecture tag
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation for the Gemm alias above
+        const typename Gemm::ElementA* d_A, size_t lda,  // A pointer and lda (presumably its leading dimension)
+        const typename Gemm::ElementB* d_B, size_t ldb,  // B pointer and ldb (presumably its leading dimension)
+        typename Gemm::ElementC* d_C, size_t ldc,  // C pointer (non-const) and ldc (presumably its leading dimension)
+        int* workspace,  // workspace pointer; its use is defined in the included wrapper, not here
+        cutlass::gemm::GemmCoord const& problem_size,  // problem size passed as a GemmCoord
+        typename Gemm::EpilogueOutputOp::Params const& epilogue,  // parameters for the epilogue op of this Gemm
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // CUDA compiler version >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt.cu
index d074d4f8..61f578ad 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt_splitk_parallel.cu
new file mode 100644
index 00000000..55eac3eb
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py: explicitly instantiates cutlass_matrix_mul_wrapper for one GemmSplitKParallel configuration
+// silence GCC diagnostics triggered by cutlass; the matching "diagnostic pop" near the end of this file restores them
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;  // layout of operand A (first letter of the "nt" file suffix)
+using LayoutB = cutlass::layout::ColumnMajor;  // layout of operand B (second letter of the "nt" file suffix)
+using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>;  // threadblock-level tile shape, 256x64x8
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;  // warp-level tile shape, 64x32x8
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;  // instruction-level shape, 1x1x1
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;  // float linear-combination epilogue
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // device-level split-k-parallel GEMM type
+    float, LayoutA,  // A: element type and layout
+    float, LayoutB,  // B: element type and layout
+    float, cutlass::layout::RowMajor, float,  // C: float, row-major; trailing float is presumably the accumulator type
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50,  // SIMT operator class, Sm50 architecture tag
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation for the Gemm alias above
+        const typename Gemm::ElementA* d_A, size_t lda,  // A pointer and lda (presumably its leading dimension)
+        const typename Gemm::ElementB* d_B, size_t ldb,  // B pointer and ldb (presumably its leading dimension)
+        typename Gemm::ElementC* d_C, size_t ldc,  // C pointer (non-const) and ldc (presumably its leading dimension)
+        int* workspace,  // workspace pointer; its use is defined in the included wrapper, not here
+        cutlass::gemm::GemmCoord const& problem_size,  // problem size passed as a GemmCoord
+        typename Gemm::EpilogueOutputOp::Params const& epilogue,  // parameters for the epilogue op of this Gemm
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // CUDA compiler version >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn.cu
index b4529df8..0227b521 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn_splitk_parallel.cu
new file mode 100644
index 00000000..5a777e87
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py: explicitly instantiates cutlass_matrix_mul_wrapper for one GemmSplitKParallel configuration
+// silence GCC diagnostics triggered by cutlass; the matching "diagnostic pop" near the end of this file restores them
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;  // layout of operand A (first letter of the "tn" file suffix)
+using LayoutB = cutlass::layout::RowMajor;  // layout of operand B (second letter of the "tn" file suffix)
+using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>;  // threadblock-level tile shape, 256x64x8
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;  // warp-level tile shape, 64x32x8
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;  // instruction-level shape, 1x1x1
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;  // float linear-combination epilogue
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // device-level split-k-parallel GEMM type
+    float, LayoutA,  // A: element type and layout
+    float, LayoutB,  // B: element type and layout
+    float, cutlass::layout::RowMajor, float,  // C: float, row-major; trailing float is presumably the accumulator type
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50,  // SIMT operator class, Sm50 architecture tag
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation for the Gemm alias above
+        const typename Gemm::ElementA* d_A, size_t lda,  // A pointer and lda (presumably its leading dimension)
+        const typename Gemm::ElementB* d_B, size_t ldb,  // B pointer and ldb (presumably its leading dimension)
+        typename Gemm::ElementC* d_C, size_t ldc,  // C pointer (non-const) and ldc (presumably its leading dimension)
+        int* workspace,  // workspace pointer; its use is defined in the included wrapper, not here
+        cutlass::gemm::GemmCoord const& problem_size,  // problem size passed as a GemmCoord
+        typename Gemm::EpilogueOutputOp::Params const& epilogue,  // parameters for the epilogue op of this Gemm
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // CUDA compiler version >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt.cu
index cac23b2d..90a24e98 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt_splitk_parallel.cu
new file mode 100644
index 00000000..3b268760
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py: explicitly instantiates cutlass_matrix_mul_wrapper for one GemmSplitKParallel configuration
+// silence GCC diagnostics triggered by cutlass; the matching "diagnostic pop" near the end of this file restores them
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;  // layout of operand A (first letter of the "tt" file suffix)
+using LayoutB = cutlass::layout::ColumnMajor;  // layout of operand B (second letter of the "tt" file suffix)
+using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>;  // threadblock-level tile shape, 256x64x8
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;  // warp-level tile shape, 64x32x8
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;  // instruction-level shape, 1x1x1
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;  // float linear-combination epilogue
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // device-level split-k-parallel GEMM type
+    float, LayoutA,  // A: element type and layout
+    float, LayoutB,  // B: element type and layout
+    float, cutlass::layout::RowMajor, float,  // C: float, row-major; trailing float is presumably the accumulator type
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50,  // SIMT operator class, Sm50 architecture tag
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation for the Gemm alias above
+        const typename Gemm::ElementA* d_A, size_t lda,  // A pointer and lda (presumably its leading dimension)
+        const typename Gemm::ElementB* d_B, size_t ldb,  // B pointer and ldb (presumably its leading dimension)
+        typename Gemm::ElementC* d_C, size_t ldc,  // C pointer (non-const) and ldc (presumably its leading dimension)
+        int* workspace,  // workspace pointer; its use is defined in the included wrapper, not here
+        cutlass::gemm::GemmCoord const& problem_size,  // problem size passed as a GemmCoord
+        typename Gemm::EpilogueOutputOp::Params const& epilogue,  // parameters for the epilogue op of this Gemm
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // CUDA compiler version >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn.cu
index 7539d009..ccb3a6fb 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn_splitk_parallel.cu
new file mode 100644
index 00000000..57f71457
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py: explicitly instantiates cutlass_matrix_mul_wrapper for one GemmSplitKParallel configuration
+// silence GCC diagnostics triggered by cutlass; the matching "diagnostic pop" near the end of this file restores them
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;  // layout of operand A (first letter of the "nn" file suffix)
+using LayoutB = cutlass::layout::RowMajor;  // layout of operand B (second letter of the "nn" file suffix)
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>;  // threadblock-level tile shape, 32x128x8
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;  // warp-level tile shape, 32x64x8
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;  // instruction-level shape, 1x1x1
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;  // float linear-combination epilogue
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // device-level split-k-parallel GEMM type
+    float, LayoutA,  // A: element type and layout
+    float, LayoutB,  // B: element type and layout
+    float, cutlass::layout::RowMajor, float,  // C: float, row-major; trailing float is presumably the accumulator type
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50,  // SIMT operator class, Sm50 architecture tag
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation for the Gemm alias above
+        const typename Gemm::ElementA* d_A, size_t lda,  // A pointer and lda (presumably its leading dimension)
+        const typename Gemm::ElementB* d_B, size_t ldb,  // B pointer and ldb (presumably its leading dimension)
+        typename Gemm::ElementC* d_C, size_t ldc,  // C pointer (non-const) and ldc (presumably its leading dimension)
+        int* workspace,  // workspace pointer; its use is defined in the included wrapper, not here
+        cutlass::gemm::GemmCoord const& problem_size,  // problem size passed as a GemmCoord
+        typename Gemm::EpilogueOutputOp::Params const& epilogue,  // parameters for the epilogue op of this Gemm
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // CUDA compiler version >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt.cu
index 420a2271..296e163d 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt_splitk_parallel.cu
new file mode 100644
index 00000000..7d9dae19
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py: explicitly instantiates cutlass_matrix_mul_wrapper for one GemmSplitKParallel configuration
+// silence GCC diagnostics triggered by cutlass; the matching "diagnostic pop" near the end of this file restores them
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;  // layout of operand A (first letter of the "nt" file suffix)
+using LayoutB = cutlass::layout::ColumnMajor;  // layout of operand B (second letter of the "nt" file suffix)
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>;  // threadblock-level tile shape, 32x128x8
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;  // warp-level tile shape, 32x64x8
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;  // instruction-level shape, 1x1x1
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;  // float linear-combination epilogue
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // device-level split-k-parallel GEMM type
+    float, LayoutA,  // A: element type and layout
+    float, LayoutB,  // B: element type and layout
+    float, cutlass::layout::RowMajor, float,  // C: float, row-major; trailing float is presumably the accumulator type
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50,  // SIMT operator class, Sm50 architecture tag
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation for the Gemm alias above
+        const typename Gemm::ElementA* d_A, size_t lda,  // A pointer and lda (presumably its leading dimension)
+        const typename Gemm::ElementB* d_B, size_t ldb,  // B pointer and ldb (presumably its leading dimension)
+        typename Gemm::ElementC* d_C, size_t ldc,  // C pointer (non-const) and ldc (presumably its leading dimension)
+        int* workspace,  // workspace pointer; its use is defined in the included wrapper, not here
+        cutlass::gemm::GemmCoord const& problem_size,  // problem size passed as a GemmCoord
+        typename Gemm::EpilogueOutputOp::Params const& epilogue,  // parameters for the epilogue op of this Gemm
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // CUDA compiler version >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn.cu
index 37d9265e..c964aaf8 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn_splitk_parallel.cu
new file mode 100644
index 00000000..c6be5d7b
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py: explicitly instantiates cutlass_matrix_mul_wrapper for one GemmSplitKParallel configuration
+// silence GCC diagnostics triggered by cutlass; the matching "diagnostic pop" near the end of this file restores them
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;  // layout of operand A (first letter of the "tn" file suffix)
+using LayoutB = cutlass::layout::RowMajor;  // layout of operand B (second letter of the "tn" file suffix)
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>;  // threadblock-level tile shape, 32x128x8
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;  // warp-level tile shape, 32x64x8
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;  // instruction-level shape, 1x1x1
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;  // float linear-combination epilogue
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // device-level split-k-parallel GEMM type
+    float, LayoutA,  // A: element type and layout
+    float, LayoutB,  // B: element type and layout
+    float, cutlass::layout::RowMajor, float,  // C: float, row-major; trailing float is presumably the accumulator type
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50,  // SIMT operator class, Sm50 architecture tag
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation for the Gemm alias above
+        const typename Gemm::ElementA* d_A, size_t lda,  // A pointer and lda (presumably its leading dimension)
+        const typename Gemm::ElementB* d_B, size_t ldb,  // B pointer and ldb (presumably its leading dimension)
+        typename Gemm::ElementC* d_C, size_t ldc,  // C pointer (non-const) and ldc (presumably its leading dimension)
+        int* workspace,  // workspace pointer; its use is defined in the included wrapper, not here
+        cutlass::gemm::GemmCoord const& problem_size,  // problem size passed as a GemmCoord
+        typename Gemm::EpilogueOutputOp::Params const& epilogue,  // parameters for the epilogue op of this Gemm
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // CUDA compiler version >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt.cu
index 4ff6119f..9bb8ea8a 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt_splitk_parallel.cu
new file mode 100644
index 00000000..d5f9afb3
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py: explicitly instantiates cutlass_matrix_mul_wrapper for one GemmSplitKParallel configuration
+// silence GCC diagnostics triggered by cutlass; the matching "diagnostic pop" near the end of this file restores them
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;  // layout of operand A (first letter of the "tt" file suffix)
+using LayoutB = cutlass::layout::ColumnMajor;  // layout of operand B (second letter of the "tt" file suffix)
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>;  // threadblock-level tile shape, 32x128x8
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;  // warp-level tile shape, 32x64x8
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;  // instruction-level shape, 1x1x1
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;  // float linear-combination epilogue
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // device-level split-k-parallel GEMM type
+    float, LayoutA,  // A: element type and layout
+    float, LayoutB,  // B: element type and layout
+    float, cutlass::layout::RowMajor, float,  // C: float, row-major; trailing float is presumably the accumulator type
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50,  // SIMT operator class, Sm50 architecture tag
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation for the Gemm alias above
+        const typename Gemm::ElementA* d_A, size_t lda,  // A pointer and lda (presumably its leading dimension)
+        const typename Gemm::ElementB* d_B, size_t ldb,  // B pointer and ldb (presumably its leading dimension)
+        typename Gemm::ElementC* d_C, size_t ldc,  // C pointer (non-const) and ldc (presumably its leading dimension)
+        int* workspace,  // workspace pointer; its use is defined in the included wrapper, not here
+        cutlass::gemm::GemmCoord const& problem_size,  // problem size passed as a GemmCoord
+        typename Gemm::EpilogueOutputOp::Params const& epilogue,  // parameters for the epilogue op of this Gemm
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // CUDA compiler version >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn.cu
index 52d10eb7..18047dc2 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn_splitk_parallel.cu
new file mode 100644
index 00000000..7a66c163
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py: explicitly instantiates cutlass_matrix_mul_wrapper for one GemmSplitKParallel configuration
+// silence GCC diagnostics triggered by cutlass; the matching "diagnostic pop" near the end of this file restores them
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;  // layout of operand A (first letter of the "nn" file suffix)
+using LayoutB = cutlass::layout::RowMajor;  // layout of operand B (second letter of the "nn" file suffix)
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>;  // threadblock-level tile shape, 32x256x8
+using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;  // warp-level tile shape, 16x64x8
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;  // instruction-level shape, 1x1x1
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;  // float linear-combination epilogue
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // device-level split-k-parallel GEMM type
+    float, LayoutA,  // A: element type and layout
+    float, LayoutB,  // B: element type and layout
+    float, cutlass::layout::RowMajor, float,  // C: float, row-major; trailing float is presumably the accumulator type
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50,  // SIMT operator class, Sm50 architecture tag
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation for the Gemm alias above
+        const typename Gemm::ElementA* d_A, size_t lda,  // A pointer and lda (presumably its leading dimension)
+        const typename Gemm::ElementB* d_B, size_t ldb,  // B pointer and ldb (presumably its leading dimension)
+        typename Gemm::ElementC* d_C, size_t ldc,  // C pointer (non-const) and ldc (presumably its leading dimension)
+        int* workspace,  // workspace pointer; its use is defined in the included wrapper, not here
+        cutlass::gemm::GemmCoord const& problem_size,  // problem size passed as a GemmCoord
+        typename Gemm::EpilogueOutputOp::Params const& epilogue,  // parameters for the epilogue op of this Gemm
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // CUDA compiler version >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt.cu
index 1067544a..86899145 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt_splitk_parallel.cu
new file mode 100644
index 00000000..78c0283e
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)  // body is compiled only when the CUDA compiler reports version 9.2 or newer
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// silence the warnings listed below while the cutlass code is compiled; the matching pop at the end restores them
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;  // layout tag for the A operand
+using LayoutB = cutlass::layout::ColumnMajor;  // layout tag for the B operand
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>;  // threadblock-level tile shape
+using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;  // warp-level tile shape
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;  // instruction-level shape
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;  // epilogue functor, passed as the last Gemm template argument
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // split-K parallel GEMM type assembled from the aliases above
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation of the wrapper template for this Gemm
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn.cu
index 8dc734f1..c65df06e 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn_splitk_parallel.cu
new file mode 100644
index 00000000..fcb716b7
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)  // body is compiled only when the CUDA compiler reports version 9.2 or newer
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// silence the warnings listed below while the cutlass code is compiled; the matching pop at the end restores them
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;  // layout tag for the A operand
+using LayoutB = cutlass::layout::RowMajor;  // layout tag for the B operand
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>;  // threadblock-level tile shape
+using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;  // warp-level tile shape
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;  // instruction-level shape
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;  // epilogue functor, passed as the last Gemm template argument
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // split-K parallel GEMM type assembled from the aliases above
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation of the wrapper template for this Gemm
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt.cu
index 965a98e2..ecd87aa3 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt_splitk_parallel.cu
new file mode 100644
index 00000000..0afda5f1
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)  // body is compiled only when the CUDA compiler reports version 9.2 or newer
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// silence the warnings listed below while the cutlass code is compiled; the matching pop at the end restores them
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;  // layout tag for the A operand
+using LayoutB = cutlass::layout::ColumnMajor;  // layout tag for the B operand
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>;  // threadblock-level tile shape
+using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;  // warp-level tile shape
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;  // instruction-level shape
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;  // epilogue functor, passed as the last Gemm template argument
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // split-K parallel GEMM type assembled from the aliases above
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation of the wrapper template for this Gemm
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn.cu
index 11829eaf..ebb5a2f3 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn_splitk_parallel.cu
new file mode 100644
index 00000000..a678b28c
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)  // body is compiled only when the CUDA compiler reports version 9.2 or newer
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// silence the warnings listed below while the cutlass code is compiled; the matching pop at the end restores them
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;  // layout tag for the A operand
+using LayoutB = cutlass::layout::RowMajor;  // layout tag for the B operand
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>;  // threadblock-level tile shape
+using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;  // warp-level tile shape
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;  // instruction-level shape
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;  // epilogue functor, passed as the last Gemm template argument
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // split-K parallel GEMM type assembled from the aliases above
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation of the wrapper template for this Gemm
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt.cu
index f302afbf..f330b6d7 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt_splitk_parallel.cu
new file mode 100644
index 00000000..44e8a1b1
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)  // body is compiled only when the CUDA compiler reports version 9.2 or newer
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// silence the warnings listed below while the cutlass code is compiled; the matching pop at the end restores them
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;  // layout tag for the A operand
+using LayoutB = cutlass::layout::ColumnMajor;  // layout tag for the B operand
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>;  // threadblock-level tile shape
+using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;  // warp-level tile shape
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;  // instruction-level shape
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;  // epilogue functor, passed as the last Gemm template argument
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // split-K parallel GEMM type assembled from the aliases above
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation of the wrapper template for this Gemm
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn.cu
index 04753040..db6e22b9 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn_splitk_parallel.cu
new file mode 100644
index 00000000..55cee82c
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)  // body is compiled only when the CUDA compiler reports version 9.2 or newer
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// silence the warnings listed below while the cutlass code is compiled; the matching pop at the end restores them
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;  // layout tag for the A operand
+using LayoutB = cutlass::layout::RowMajor;  // layout tag for the B operand
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>;  // threadblock-level tile shape
+using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;  // warp-level tile shape
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;  // instruction-level shape
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;  // epilogue functor, passed as the last Gemm template argument
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // split-K parallel GEMM type assembled from the aliases above
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation of the wrapper template for this Gemm
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt.cu
index 11733646..161e1337 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt_splitk_parallel.cu
new file mode 100644
index 00000000..9269ac05
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)  // body is compiled only when the CUDA compiler reports version 9.2 or newer
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// silence the warnings listed below while the cutlass code is compiled; the matching pop at the end restores them
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;  // layout tag for the A operand
+using LayoutB = cutlass::layout::ColumnMajor;  // layout tag for the B operand
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>;  // threadblock-level tile shape
+using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;  // warp-level tile shape
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;  // instruction-level shape
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;  // epilogue functor, passed as the last Gemm template argument
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // split-K parallel GEMM type assembled from the aliases above
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation of the wrapper template for this Gemm
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn.cu
index b2c77ea6..929bcdc6 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn_splitk_parallel.cu
new file mode 100644
index 00000000..71aa87a1
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)  // body is compiled only when the CUDA compiler reports version 9.2 or newer
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// silence the warnings listed below while the cutlass code is compiled; the matching pop at the end restores them
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;  // layout tag for the A operand
+using LayoutB = cutlass::layout::RowMajor;  // layout tag for the B operand
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 8>;  // threadblock-level tile shape
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;  // warp-level tile shape
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;  // instruction-level shape
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;  // epilogue functor, passed as the last Gemm template argument
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // split-K parallel GEMM type assembled from the aliases above
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation of the wrapper template for this Gemm
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt.cu
index 7aac31e0..cf467004 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt_splitk_parallel.cu
new file mode 100644
index 00000000..88f5c826
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)  // body is compiled only when the CUDA compiler reports version 9.2 or newer
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// silence the warnings listed below while the cutlass code is compiled; the matching pop at the end restores them
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;  // layout tag for the A operand
+using LayoutB = cutlass::layout::ColumnMajor;  // layout tag for the B operand
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 8>;  // threadblock-level tile shape
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;  // warp-level tile shape
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;  // instruction-level shape
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;  // epilogue functor, passed as the last Gemm template argument
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // split-K parallel GEMM type assembled from the aliases above
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation of the wrapper template for this Gemm
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn.cu
index 19ff9eb4..9fea5074 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn_splitk_parallel.cu
new file mode 100644
index 00000000..86c8a6e0
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)  // body is compiled only when the CUDA compiler reports version 9.2 or newer
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// silence the warnings listed below while the cutlass code is compiled; the matching pop at the end restores them
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;  // layout tag for the A operand
+using LayoutB = cutlass::layout::RowMajor;  // layout tag for the B operand
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 8>;  // threadblock-level tile shape
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;  // warp-level tile shape
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;  // instruction-level shape
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;  // epilogue functor, passed as the last Gemm template argument
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // split-K parallel GEMM type assembled from the aliases above
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation of the wrapper template for this Gemm
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt.cu
index 601b715b..9976be6b 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt_splitk_parallel.cu
new file mode 100644
index 00000000..b452d8ed
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// Generated by gen_cutlass_matrix_mul_kern_impls.py. Explicitly instantiates cutlass_matrix_mul_wrapper for a float GemmSplitKParallel: ThreadBlockShape <32, 64, 8>, WarpShape <32, 64, 8>, LayoutA ColumnMajor, LayoutB ColumnMajor.
+// Silence GCC warnings coming from cutlass for the rest of this file; the matching "diagnostic pop" before #endif restores the previous settings.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;
+using LayoutB = cutlass::layout::ColumnMajor;
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // __CUDACC_VER_MAJOR__.__CUDACC_VER_MINOR__ >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn.cu
index bd4dbe66..32175db5 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn_splitk_parallel.cu
new file mode 100644
index 00000000..a19eb570
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// Generated by gen_cutlass_matrix_mul_kern_impls.py. Explicitly instantiates cutlass_matrix_mul_wrapper for a float GemmSplitKParallel: ThreadBlockShape <64, 128, 8>, WarpShape <32, 64, 8>, LayoutA RowMajor, LayoutB RowMajor.
+// Silence GCC warnings coming from cutlass for the rest of this file; the matching "diagnostic pop" before #endif restores the previous settings.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;
+using LayoutB = cutlass::layout::RowMajor;
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // __CUDACC_VER_MAJOR__.__CUDACC_VER_MINOR__ >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt.cu
index b78e7892..1ead9917 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt_splitk_parallel.cu
new file mode 100644
index 00000000..71c9cab8
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// Generated by gen_cutlass_matrix_mul_kern_impls.py. Explicitly instantiates cutlass_matrix_mul_wrapper for a float GemmSplitKParallel: ThreadBlockShape <64, 128, 8>, WarpShape <32, 64, 8>, LayoutA RowMajor, LayoutB ColumnMajor.
+// Silence GCC warnings coming from cutlass for the rest of this file; the matching "diagnostic pop" before #endif restores the previous settings.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;
+using LayoutB = cutlass::layout::ColumnMajor;
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // __CUDACC_VER_MAJOR__.__CUDACC_VER_MINOR__ >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn.cu
index bffbaca9..919aad63 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn_splitk_parallel.cu
new file mode 100644
index 00000000..541af533
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// Generated by gen_cutlass_matrix_mul_kern_impls.py. Explicitly instantiates cutlass_matrix_mul_wrapper for a float GemmSplitKParallel: ThreadBlockShape <64, 128, 8>, WarpShape <32, 64, 8>, LayoutA ColumnMajor, LayoutB RowMajor.
+// Silence GCC warnings coming from cutlass for the rest of this file; the matching "diagnostic pop" before #endif restores the previous settings.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;
+using LayoutB = cutlass::layout::RowMajor;
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // __CUDACC_VER_MAJOR__.__CUDACC_VER_MINOR__ >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt.cu
index 001247ca..f4928b1e 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt_splitk_parallel.cu
new file mode 100644
index 00000000..593a73b6
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// Generated by gen_cutlass_matrix_mul_kern_impls.py. Explicitly instantiates cutlass_matrix_mul_wrapper for a float GemmSplitKParallel: ThreadBlockShape <64, 128, 8>, WarpShape <32, 64, 8>, LayoutA ColumnMajor, LayoutB ColumnMajor.
+// Silence GCC warnings coming from cutlass for the rest of this file; the matching "diagnostic pop" before #endif restores the previous settings.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;
+using LayoutB = cutlass::layout::ColumnMajor;
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // __CUDACC_VER_MAJOR__.__CUDACC_VER_MINOR__ >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn.cu
index 4b744197..2ff883c0 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn_splitk_parallel.cu
new file mode 100644
index 00000000..c081e366
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// Generated by gen_cutlass_matrix_mul_kern_impls.py. Explicitly instantiates cutlass_matrix_mul_wrapper for a float GemmSplitKParallel: ThreadBlockShape <64, 256, 8>, WarpShape <32, 64, 8>, LayoutA RowMajor, LayoutB RowMajor.
+// Silence GCC warnings coming from cutlass for the rest of this file; the matching "diagnostic pop" before #endif restores the previous settings.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;
+using LayoutB = cutlass::layout::RowMajor;
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // __CUDACC_VER_MAJOR__.__CUDACC_VER_MINOR__ >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt.cu
index 5c43eb1a..f6f214de 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt_splitk_parallel.cu
new file mode 100644
index 00000000..997a5b23
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// Generated by gen_cutlass_matrix_mul_kern_impls.py. Explicitly instantiates cutlass_matrix_mul_wrapper for a float GemmSplitKParallel: ThreadBlockShape <64, 256, 8>, WarpShape <32, 64, 8>, LayoutA RowMajor, LayoutB ColumnMajor.
+// Silence GCC warnings coming from cutlass for the rest of this file; the matching "diagnostic pop" before #endif restores the previous settings.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;
+using LayoutB = cutlass::layout::ColumnMajor;
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // __CUDACC_VER_MAJOR__.__CUDACC_VER_MINOR__ >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn.cu
index 203bd895..67a7c764 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn_splitk_parallel.cu
new file mode 100644
index 00000000..f918819a
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// Generated by gen_cutlass_matrix_mul_kern_impls.py. Explicitly instantiates cutlass_matrix_mul_wrapper for a float GemmSplitKParallel: ThreadBlockShape <64, 256, 8>, WarpShape <32, 64, 8>, LayoutA ColumnMajor, LayoutB RowMajor.
+// Silence GCC warnings coming from cutlass for the rest of this file; the matching "diagnostic pop" before #endif restores the previous settings.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;
+using LayoutB = cutlass::layout::RowMajor;
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // __CUDACC_VER_MAJOR__.__CUDACC_VER_MINOR__ >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt.cu
index c353c7a2..a27a779c 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt_splitk_parallel.cu
new file mode 100644
index 00000000..5d8c09c7
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// Generated by gen_cutlass_matrix_mul_kern_impls.py. Explicitly instantiates cutlass_matrix_mul_wrapper for a float GemmSplitKParallel: ThreadBlockShape <64, 256, 8>, WarpShape <32, 64, 8>, LayoutA ColumnMajor, LayoutB ColumnMajor.
+// Silence GCC warnings coming from cutlass for the rest of this file; the matching "diagnostic pop" before #endif restores the previous settings.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;
+using LayoutB = cutlass::layout::ColumnMajor;
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // __CUDACC_VER_MAJOR__.__CUDACC_VER_MINOR__ >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn.cu
index 83930042..48bd2b06 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn_splitk_parallel.cu
new file mode 100644
index 00000000..c81cadd1
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// Generated by gen_cutlass_matrix_mul_kern_impls.py. Explicitly instantiates cutlass_matrix_mul_wrapper for a float GemmSplitKParallel: ThreadBlockShape <64, 32, 8>, WarpShape <64, 32, 8>, LayoutA RowMajor, LayoutB RowMajor.
+// Silence GCC warnings coming from cutlass for the rest of this file; the matching "diagnostic pop" before #endif restores the previous settings.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;
+using LayoutB = cutlass::layout::RowMajor;
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // __CUDACC_VER_MAJOR__.__CUDACC_VER_MINOR__ >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt.cu
index 30e83753..465b736e 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt_splitk_parallel.cu
new file mode 100644
index 00000000..e31365d5
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)  // only built with nvcc 9.2 or newer
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// Silence GCC warnings triggered by the CUTLASS headers; the matching "diagnostic pop" at the end of the file restores them.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;  // layout of operand A
+using LayoutB = cutlass::layout::ColumnMajor;  // layout of operand B
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 8>;  // threadblock tile (M, N, K)
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;  // warp tile (M, N, K)
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;  // instruction shape used with the SIMT operator class below
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;  // linear-combination epilogue on float, 1 element per access
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // device-level GEMM with parallel split-K
+    float, LayoutA,  // A: element type, layout
+    float, LayoutB,  // B: element type, layout
+    float, cutlass::layout::RowMajor, float,  // C: element type, layout; then accumulator type
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50,  // SIMT operator class, Sm50 architecture tag
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation for the Gemm type above
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // nvcc >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn.cu
index 9dbb5d1b..023bd242 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn_splitk_parallel.cu
new file mode 100644
index 00000000..5b89b5f6
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)  // only built with nvcc 9.2 or newer
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// Silence GCC warnings triggered by the CUTLASS headers; the matching "diagnostic pop" at the end of the file restores them.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;  // layout of operand A
+using LayoutB = cutlass::layout::RowMajor;  // layout of operand B
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 8>;  // threadblock tile (M, N, K)
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;  // warp tile (M, N, K)
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;  // instruction shape used with the SIMT operator class below
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;  // linear-combination epilogue on float, 1 element per access
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // device-level GEMM with parallel split-K
+    float, LayoutA,  // A: element type, layout
+    float, LayoutB,  // B: element type, layout
+    float, cutlass::layout::RowMajor, float,  // C: element type, layout; then accumulator type
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50,  // SIMT operator class, Sm50 architecture tag
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation for the Gemm type above
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // nvcc >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt.cu
index fa6ac73b..927984eb 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt_splitk_parallel.cu
new file mode 100644
index 00000000..f4638ce1
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)  // only built with nvcc 9.2 or newer
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// Silence GCC warnings triggered by the CUTLASS headers; the matching "diagnostic pop" at the end of the file restores them.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;  // layout of operand A
+using LayoutB = cutlass::layout::ColumnMajor;  // layout of operand B
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 8>;  // threadblock tile (M, N, K)
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;  // warp tile (M, N, K)
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;  // instruction shape used with the SIMT operator class below
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;  // linear-combination epilogue on float, 1 element per access
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // device-level GEMM with parallel split-K
+    float, LayoutA,  // A: element type, layout
+    float, LayoutB,  // B: element type, layout
+    float, cutlass::layout::RowMajor, float,  // C: element type, layout; then accumulator type
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50,  // SIMT operator class, Sm50 architecture tag
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation for the Gemm type above
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // nvcc >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn.cu
index c2e26af4..2bef3b7d 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn_splitk_parallel.cu
new file mode 100644
index 00000000..1ed27985
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)  // only built with nvcc 9.2 or newer
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// Silence GCC warnings triggered by the CUTLASS headers; the matching "diagnostic pop" at the end of the file restores them.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;  // layout of operand A
+using LayoutB = cutlass::layout::RowMajor;  // layout of operand B
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 8>;  // threadblock tile (M, N, K)
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;  // warp tile (M, N, K)
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;  // instruction shape used with the SIMT operator class below
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;  // linear-combination epilogue on float, 1 element per access
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // device-level GEMM with parallel split-K
+    float, LayoutA,  // A: element type, layout
+    float, LayoutB,  // B: element type, layout
+    float, cutlass::layout::RowMajor, float,  // C: element type, layout; then accumulator type
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50,  // SIMT operator class, Sm50 architecture tag
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation for the Gemm type above
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // nvcc >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt.cu
index 75404041..576d6663 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt_splitk_parallel.cu
new file mode 100644
index 00000000..a600832e
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)  // only built with nvcc 9.2 or newer
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// Silence GCC warnings triggered by the CUTLASS headers; the matching "diagnostic pop" at the end of the file restores them.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;  // layout of operand A
+using LayoutB = cutlass::layout::ColumnMajor;  // layout of operand B
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 8>;  // threadblock tile (M, N, K)
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;  // warp tile (M, N, K)
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;  // instruction shape used with the SIMT operator class below
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;  // linear-combination epilogue on float, 1 element per access
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // device-level GEMM with parallel split-K
+    float, LayoutA,  // A: element type, layout
+    float, LayoutB,  // B: element type, layout
+    float, cutlass::layout::RowMajor, float,  // C: element type, layout; then accumulator type
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50,  // SIMT operator class, Sm50 architecture tag
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation for the Gemm type above
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // nvcc >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn.cu
index ab0a848d..c4414a1c 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn_splitk_parallel.cu
new file mode 100644
index 00000000..7f48f96f
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)  // only built with nvcc 9.2 or newer
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// Silence GCC warnings triggered by the CUTLASS headers; the matching "diagnostic pop" at the end of the file restores them.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;  // layout of operand A
+using LayoutB = cutlass::layout::RowMajor;  // layout of operand B
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 8>;  // threadblock tile (M, N, K)
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;  // warp tile (M, N, K)
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;  // instruction shape used with the SIMT operator class below
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;  // linear-combination epilogue on float, 1 element per access
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // device-level GEMM with parallel split-K
+    float, LayoutA,  // A: element type, layout
+    float, LayoutB,  // B: element type, layout
+    float, cutlass::layout::RowMajor, float,  // C: element type, layout; then accumulator type
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50,  // SIMT operator class, Sm50 architecture tag
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation for the Gemm type above
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // nvcc >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt.cu
index 261779d8..a4831f90 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt_splitk_parallel.cu
new file mode 100644
index 00000000..ac2dea60
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)  // only built with nvcc 9.2 or newer
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// Silence GCC warnings triggered by the CUTLASS headers; the matching "diagnostic pop" at the end of the file restores them.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;  // layout of operand A
+using LayoutB = cutlass::layout::ColumnMajor;  // layout of operand B
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 8>;  // threadblock tile (M, N, K)
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;  // warp tile (M, N, K)
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;  // instruction shape used with the SIMT operator class below
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;  // linear-combination epilogue on float, 1 element per access
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // device-level GEMM with parallel split-K
+    float, LayoutA,  // A: element type, layout
+    float, LayoutB,  // B: element type, layout
+    float, cutlass::layout::RowMajor, float,  // C: element type, layout; then accumulator type
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50,  // SIMT operator class, Sm50 architecture tag
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation for the Gemm type above
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // nvcc >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn.cu
index 8312b40c..ce7de93b 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn_splitk_parallel.cu
new file mode 100644
index 00000000..b8319cd7
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)  // only built with nvcc 9.2 or newer
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// Silence GCC warnings triggered by the CUTLASS headers; the matching "diagnostic pop" at the end of the file restores them.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;  // layout of operand A
+using LayoutB = cutlass::layout::RowMajor;  // layout of operand B
+using ThreadBlockShape = cutlass::gemm::GemmShape<8, 32, 8>;  // threadblock tile (M, N, K)
+using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;  // warp tile (M, N, K)
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;  // instruction shape used with the SIMT operator class below
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;  // linear-combination epilogue on float, 1 element per access
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // device-level GEMM with parallel split-K
+    float, LayoutA,  // A: element type, layout
+    float, LayoutB,  // B: element type, layout
+    float, cutlass::layout::RowMajor, float,  // C: element type, layout; then accumulator type
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50,  // SIMT operator class, Sm50 architecture tag
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation for the Gemm type above
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // nvcc >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt.cu
index 6ad1b86b..254272c0 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt_splitk_parallel.cu
new file mode 100644
index 00000000..0b3cdc14
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)  // only built with nvcc 9.2 or newer
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// Silence GCC warnings triggered by the CUTLASS headers; the matching "diagnostic pop" at the end of the file restores them.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::RowMajor;  // layout of operand A
+using LayoutB = cutlass::layout::ColumnMajor;  // layout of operand B
+using ThreadBlockShape = cutlass::gemm::GemmShape<8, 32, 8>;  // threadblock tile (M, N, K)
+using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;  // warp tile (M, N, K)
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;  // instruction shape used with the SIMT operator class below
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;  // linear-combination epilogue on float, 1 element per access
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // device-level GEMM with parallel split-K
+    float, LayoutA,  // A: element type, layout
+    float, LayoutB,  // B: element type, layout
+    float, cutlass::layout::RowMajor, float,  // C: element type, layout; then accumulator type
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50,  // SIMT operator class, Sm50 architecture tag
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation for the Gemm type above
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // nvcc >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn.cu
index 0f5579bd..ea150057 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn_splitk_parallel.cu
new file mode 100644
index 00000000..bd15a9c3
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)  // only built with nvcc 9.2 or newer
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// Silence GCC warnings triggered by the CUTLASS headers; the matching "diagnostic pop" at the end of the file restores them.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;  // layout of operand A
+using LayoutB = cutlass::layout::RowMajor;  // layout of operand B
+using ThreadBlockShape = cutlass::gemm::GemmShape<8, 32, 8>;  // threadblock tile (M, N, K)
+using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;  // warp tile (M, N, K)
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;  // instruction shape used with the SIMT operator class below
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;  // linear-combination epilogue on float, 1 element per access
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<  // device-level GEMM with parallel split-K
+    float, LayoutA,  // A: element type, layout
+    float, LayoutB,  // B: element type, layout
+    float, cutlass::layout::RowMajor, float,  // C: element type, layout; then accumulator type
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50,  // SIMT operator class, Sm50 architecture tag
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(  // explicit instantiation for the Gemm type above
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace, 
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);
+
+#pragma GCC diagnostic pop
+#endif  // nvcc >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt.cu
index ea2147c6..58b093d4 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt.cu
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt.cu
@@ -5,6 +5,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
 
 using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
         int* workspace, 
         cutlass::gemm::GemmCoord const& problem_size,   
         typename Gemm::EpilogueOutputOp::Params const& epilogue, 
-        cudaStream_t stream);
+        cudaStream_t stream, int split_k_slices);
+
 #pragma GCC diagnostic pop
 #endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt_splitk_parallel.cu
new file mode 100644
index 00000000..e4a2e18e
--- /dev/null
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt_splitk_parallel.cu
@@ -0,0 +1,33 @@
+#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+// generated by gen_cutlass_matrix_mul_kern_impls.py
+// suppress warnings triggered by the cutlass headers pulled in below
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"
+
+using LayoutA = cutlass::layout::ColumnMajor;
+using LayoutB = cutlass::layout::ColumnMajor;
+using ThreadBlockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
+using Gemm = cutlass::gemm::device::GemmSplitKParallel<
+    float, LayoutA, 
+    float, LayoutB, 
+    float, cutlass::layout::RowMajor, float, 
+    cutlass::arch::OpClassSimt, cutlass::arch::Sm50, 
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
+        const typename Gemm::ElementA* d_A, size_t lda, 
+        const typename Gemm::ElementB* d_B, size_t ldb,  
+        typename Gemm::ElementC* d_C, size_t ldc,  
+        int* workspace,  // handed to gemm_op.initialize() by the wrapper
+        cutlass::gemm::GemmCoord const& problem_size,   
+        typename Gemm::EpilogueOutputOp::Params const& epilogue, 
+        cudaStream_t stream, int split_k_slices);  // explicit instantiation
+
+#pragma GCC diagnostic pop
+#endif  // __CUDACC_VER >= 9.2
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl b/dnn/src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl
index d58c8048..4610a8f6 100644
--- a/dnn/src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl
+++ b/dnn/src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl
@@ -11,6 +11,7 @@
  * implied.
  */
 #include "cutlass/gemm/device/gemm.h"
+#include "cutlass/gemm/device/gemm_splitk_parallel.h"
 #include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh"
 
 using namespace megdnn;
@@ -24,17 +25,21 @@ void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper(
         typename Gemm::ElementC* d_C, size_t ldc, int* workspace,
         GemmCoord const& problem_size,
         typename Gemm::EpilogueOutputOp::Params const& epilogue,
-        cudaStream_t stream) {
-    typename Gemm::TensorRefA tensor_a{
-            const_cast<typename Gemm::ElementA*>(d_A),
-            typename Gemm::LayoutA{static_cast<int>(lda)}};
-    typename Gemm::TensorRefB tensor_b{
-            const_cast<typename Gemm::ElementB*>(d_B),
-            typename Gemm::LayoutB{static_cast<int>(ldb)}};
-    typename Gemm::TensorRefC tensor_c{
-            nullptr, typename Gemm::LayoutC{static_cast<int>(ldc)}};
-    typename Gemm::TensorRefD tensor_d{
-            d_C, typename Gemm::LayoutC{static_cast<int>(ldc)}};
+        cudaStream_t stream, int split_k_slices) {
+    using TensorRefA = cutlass::TensorRef<typename Gemm::ElementA const,
+                                          typename Gemm::LayoutA>;
+    using TensorRefB = cutlass::TensorRef<typename Gemm::ElementB const,
+                                          typename Gemm::LayoutB>;
+    using TensorRefC = cutlass::TensorRef<typename Gemm::ElementC const,
+                                          typename Gemm::LayoutC>;
+    using TensorRefD =
+            cutlass::TensorRef<typename Gemm::ElementC, typename Gemm::LayoutC>;
+    TensorRefA tensor_a{const_cast<typename Gemm::ElementA*>(d_A),
+                        typename Gemm::LayoutA{static_cast<int>(lda)}};
+    TensorRefB tensor_b{const_cast<typename Gemm::ElementB*>(d_B),
+                        typename Gemm::LayoutB{static_cast<int>(ldb)}};
+    TensorRefC tensor_c{nullptr, typename Gemm::LayoutC{static_cast<int>(ldc)}};
+    TensorRefD tensor_d{d_C, typename Gemm::LayoutC{static_cast<int>(ldc)}};
 
     typename Gemm::Arguments arguments{problem_size,
                                        tensor_a,
@@ -42,7 +47,7 @@ void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper(
                                        tensor_c,
                                        tensor_d.non_const_ref(),
                                        epilogue,
-                                       1};
+                                       split_k_slices};
     Gemm gemm_op;
     cutlass_check(gemm_op.initialize(arguments, workspace));
     cutlass_check(gemm_op(stream));
diff --git a/dnn/src/cuda/matrix_mul/opr_impl.h b/dnn/src/cuda/matrix_mul/opr_impl.h
index 56215411..b554a9ea 100644
--- a/dnn/src/cuda/matrix_mul/opr_impl.h
+++ b/dnn/src/cuda/matrix_mul/opr_impl.h
@@ -42,6 +42,7 @@ public:
     class AlgoBFloat16;
 #endif
     class AlgoFloat32SIMT;
+    class AlgoFloat32SIMTSplitK;
     class AlgoPack;
 
     static const AlgoPack& algo_pack() {
diff --git a/dnn/test/common/matrix_mul.cpp b/dnn/test/common/matrix_mul.cpp
index 4cbe5c77..00b63cdd 100644
--- a/dnn/test/common/matrix_mul.cpp
+++ b/dnn/test/common/matrix_mul.cpp
@@ -117,6 +117,18 @@ std::vector<matrix_mul::TestArg> matrix_mul::get_matmul_args() {
     return args;
 }
 
+// Test cases for the split-K kernels: the cases of get_matmul_args() that
+// have k > n (every case with k <= n is dropped).
+std::vector<matrix_mul::TestArg> matrix_mul::get_matmul_args_split_k() {
+    std::vector<TestArg> cases = get_matmul_args();
+    auto it = cases.begin();
+    while (it != cases.end()) {
+        // erase() hands back the iterator that follows the removed element
+        it = (it->k <= it->n) ? cases.erase(it) : it + 1;
+    }
+    return cases;
+}
+
 std::vector<matrix_mul::TestArg> matrix_mul::get_batched_matmul_args_mask(
         uint8_t mask) {
     std::vector<TestArg> args;
diff --git a/dnn/test/common/matrix_mul.h b/dnn/test/common/matrix_mul.h
index d52f1814..ab3057e0 100644
--- a/dnn/test/common/matrix_mul.h
+++ b/dnn/test/common/matrix_mul.h
@@ -53,6 +53,7 @@ struct TestArg {
 std::vector<TestArg> get_matmul_args_no_mask();
 std::vector<TestArg> get_matmul_args_mask(uint8_t mask);
 std::vector<TestArg> get_matmul_args();
+std::vector<TestArg> get_matmul_args_split_k();
 std::vector<TestArg> get_batched_matmul_args_mask(uint8_t mask);
 std::vector<TestArg> get_batched_matmul_args();
 std::vector<TestArg> get_batched_matmul_broadcast_args();
diff --git a/dnn/test/cuda/cutlass_matmul.cpp b/dnn/test/cuda/cutlass_matmul.cpp
index 55b13c28..ae04cd02 100644
--- a/dnn/test/cuda/cutlass_matmul.cpp
+++ b/dnn/test/cuda/cutlass_matmul.cpp
@@ -21,7 +21,6 @@
 #include "test/cuda/fixture.h"
 #include "test/cuda/utils.h"
 
-
 #if CUDA_VERSION >= 9020
 namespace megdnn {
 namespace test {
@@ -284,6 +283,15 @@ TEST_F(CUDA, CUTLASS_GEMM_MULTI_BATCHSIZE) {
                         param::MatrixMul::Format::DEFAULT);
 }
 
+TEST_F(CUDA, CUTLASS_GEMM_SPLIT_K_MULTI_BATCHSIZE) {  // split-K kernel variant
+    auto args = matrix_mul::get_matmul_args_no_mask();
+    test_multibatchsize(  // NOTE(review): confirm the lambda means "skip case"
+            handle_cuda(), dtype::Float32(), dtype::Float32(), dtype::Float32(),
+            "CUTLASS_FLOAT32_SIMT_SPLIT_K_128X128X8_32X64X8", args,
+            param::MatrixMul::Format::DEFAULT,
+            [](const matrix_mul::TestArg& arg) { return arg.k <= arg.n; });
+}
+
 #define MEGDNN_FOREACH_CUTLASS_KERNEL(cb) \
     cb(1, 64, 256, 8, 32, 64, 8);         \
     cb(2, 256, 64, 8, 64, 32, 8);         \
@@ -315,6 +323,21 @@ TEST_F(CUDA, CUTLASS_GEMM_MULTI_BATCHSIZE) {
 MEGDNN_FOREACH_CUTLASS_KERNEL(cb)
 
 #undef cb
+
+#define cb(name, tbm, tbn, tbk, wm, wn, wk)                                    \
+    TEST_F(CUDA, CUTLASS_GEMM_SPLIT_K_##name) { /* one test per kernel */      \
+        matrix_mul::check_matrix_mul<MatrixMulForward>(                        \
+                dtype::Float32(), dtype::Float32(), dtype::Float32(),          \
+                handle_cuda(),                                                 \
+                "CUTLASS_FLOAT32_SIMT_SPLIT_K_" #tbm "X" #tbn "X" #tbk "_" #wm \
+                "X" #wn "X" #wk,                                               \
+                param::MatrixMul::Format::DEFAULT, 8, 1e-3,                    \
+                matrix_mul::get_matmul_args_split_k()); /* k > n cases */      \
+    }
+
+MEGDNN_FOREACH_CUTLASS_KERNEL(cb)
+
+#undef cb
 #undef MEGDNN_FOREACH_CUTLASS_KERNEL
 
 #if MEGDNN_WITH_BENCHMARK