From 973d2a0ac27d736a376c474ea9c1830beb5d3cdc Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Wed, 13 Jan 2021 16:42:34 +0800 Subject: [PATCH] feat(dnn/cuda): add cutlass matmul using split k parallel GitOrigin-RevId: 650209e35f813e8eb8373d2ddc1671d3abb1759e --- dnn/scripts/Makefile | 4 +- dnn/src/cuda/matrix_mul/algos.cpp | 20 +++ dnn/src/cuda/matrix_mul/algos.h | 27 +++ .../matrix_mul/cutlass_float32_simt_split_k.cpp | 76 +++++++++ .../cuda/matrix_mul/cutlass_matrix_mul_wrapper.cu | 190 ++++++++++++++++----- .../cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh | 6 +- .../matrix_mul_fp32_simt_128x128x8_32x64x8_nn.cu | 4 +- ...32_simt_128x128x8_32x64x8_nn_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_128x128x8_32x64x8_nt.cu | 4 +- ...32_simt_128x128x8_32x64x8_nt_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_128x128x8_32x64x8_tn.cu | 4 +- ...32_simt_128x128x8_32x64x8_tn_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_128x128x8_32x64x8_tt.cu | 4 +- ...32_simt_128x128x8_32x64x8_tt_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_128x32x8_64x32x8_nn.cu | 4 +- ...p32_simt_128x32x8_64x32x8_nn_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_128x32x8_64x32x8_nt.cu | 4 +- ...p32_simt_128x32x8_64x32x8_nt_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_128x32x8_64x32x8_tn.cu | 4 +- ...p32_simt_128x32x8_64x32x8_tn_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_128x32x8_64x32x8_tt.cu | 4 +- ...p32_simt_128x32x8_64x32x8_tt_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_128x64x8_64x32x8_nn.cu | 4 +- ...p32_simt_128x64x8_64x32x8_nn_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_128x64x8_64x32x8_nt.cu | 4 +- ...p32_simt_128x64x8_64x32x8_nt_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_128x64x8_64x32x8_tn.cu | 4 +- ...p32_simt_128x64x8_64x32x8_tn_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_128x64x8_64x32x8_tt.cu | 4 +- ...p32_simt_128x64x8_64x32x8_tt_splitk_parallel.cu | 33 ++++ 
.../matrix_mul_fp32_simt_16x128x8_16x64x8_nn.cu | 4 +- ...p32_simt_16x128x8_16x64x8_nn_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_16x128x8_16x64x8_nt.cu | 4 +- ...p32_simt_16x128x8_16x64x8_nt_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_16x128x8_16x64x8_tn.cu | 4 +- ...p32_simt_16x128x8_16x64x8_tn_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_16x128x8_16x64x8_tt.cu | 4 +- ...p32_simt_16x128x8_16x64x8_tt_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_16x32x8_16x32x8_nn.cu | 4 +- ...fp32_simt_16x32x8_16x32x8_nn_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_16x32x8_16x32x8_nt.cu | 4 +- ...fp32_simt_16x32x8_16x32x8_nt_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_16x32x8_16x32x8_tn.cu | 4 +- ...fp32_simt_16x32x8_16x32x8_tn_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_16x32x8_16x32x8_tt.cu | 4 +- ...fp32_simt_16x32x8_16x32x8_tt_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_16x64x8_16x64x8_nn.cu | 4 +- ...fp32_simt_16x64x8_16x64x8_nn_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_16x64x8_16x64x8_nt.cu | 4 +- ...fp32_simt_16x64x8_16x64x8_nt_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_16x64x8_16x64x8_tn.cu | 4 +- ...fp32_simt_16x64x8_16x64x8_tn_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_16x64x8_16x64x8_tt.cu | 4 +- ...fp32_simt_16x64x8_16x64x8_tt_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_256x32x8_64x16x8_nn.cu | 4 +- ...p32_simt_256x32x8_64x16x8_nn_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_256x32x8_64x16x8_nt.cu | 4 +- ...p32_simt_256x32x8_64x16x8_nt_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_256x32x8_64x16x8_tn.cu | 4 +- ...p32_simt_256x32x8_64x16x8_tn_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_256x32x8_64x16x8_tt.cu | 4 +- ...p32_simt_256x32x8_64x16x8_tt_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_256x64x8_64x32x8_nn.cu | 4 +- ...p32_simt_256x64x8_64x32x8_nn_splitk_parallel.cu | 33 ++++ 
.../matrix_mul_fp32_simt_256x64x8_64x32x8_nt.cu | 4 +- ...p32_simt_256x64x8_64x32x8_nt_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_256x64x8_64x32x8_tn.cu | 4 +- ...p32_simt_256x64x8_64x32x8_tn_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_256x64x8_64x32x8_tt.cu | 4 +- ...p32_simt_256x64x8_64x32x8_tt_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_32x128x8_32x64x8_nn.cu | 4 +- ...p32_simt_32x128x8_32x64x8_nn_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_32x128x8_32x64x8_nt.cu | 4 +- ...p32_simt_32x128x8_32x64x8_nt_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_32x128x8_32x64x8_tn.cu | 4 +- ...p32_simt_32x128x8_32x64x8_tn_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_32x128x8_32x64x8_tt.cu | 4 +- ...p32_simt_32x128x8_32x64x8_tt_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_32x256x8_16x64x8_nn.cu | 4 +- ...p32_simt_32x256x8_16x64x8_nn_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_32x256x8_16x64x8_nt.cu | 4 +- ...p32_simt_32x256x8_16x64x8_nt_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_32x256x8_16x64x8_tn.cu | 4 +- ...p32_simt_32x256x8_16x64x8_tn_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_32x256x8_16x64x8_tt.cu | 4 +- ...p32_simt_32x256x8_16x64x8_tt_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_32x32x8_32x32x8_nn.cu | 4 +- ...fp32_simt_32x32x8_32x32x8_nn_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_32x32x8_32x32x8_nt.cu | 4 +- ...fp32_simt_32x32x8_32x32x8_nt_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_32x32x8_32x32x8_tn.cu | 4 +- ...fp32_simt_32x32x8_32x32x8_tn_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_32x32x8_32x32x8_tt.cu | 4 +- ...fp32_simt_32x32x8_32x32x8_tt_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_32x64x8_32x64x8_nn.cu | 4 +- ...fp32_simt_32x64x8_32x64x8_nn_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_32x64x8_32x64x8_nt.cu | 4 +- ...fp32_simt_32x64x8_32x64x8_nt_splitk_parallel.cu | 33 ++++ 
.../matrix_mul_fp32_simt_32x64x8_32x64x8_tn.cu | 4 +- ...fp32_simt_32x64x8_32x64x8_tn_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_32x64x8_32x64x8_tt.cu | 4 +- ...fp32_simt_32x64x8_32x64x8_tt_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_64x128x8_32x64x8_nn.cu | 4 +- ...p32_simt_64x128x8_32x64x8_nn_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_64x128x8_32x64x8_nt.cu | 4 +- ...p32_simt_64x128x8_32x64x8_nt_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_64x128x8_32x64x8_tn.cu | 4 +- ...p32_simt_64x128x8_32x64x8_tn_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_64x128x8_32x64x8_tt.cu | 4 +- ...p32_simt_64x128x8_32x64x8_tt_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_64x256x8_32x64x8_nn.cu | 4 +- ...p32_simt_64x256x8_32x64x8_nn_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_64x256x8_32x64x8_nt.cu | 4 +- ...p32_simt_64x256x8_32x64x8_nt_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_64x256x8_32x64x8_tn.cu | 4 +- ...p32_simt_64x256x8_32x64x8_tn_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_64x256x8_32x64x8_tt.cu | 4 +- ...p32_simt_64x256x8_32x64x8_tt_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_64x32x8_64x32x8_nn.cu | 4 +- ...fp32_simt_64x32x8_64x32x8_nn_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_64x32x8_64x32x8_nt.cu | 4 +- ...fp32_simt_64x32x8_64x32x8_nt_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_64x32x8_64x32x8_tn.cu | 4 +- ...fp32_simt_64x32x8_64x32x8_tn_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_64x32x8_64x32x8_tt.cu | 4 +- ...fp32_simt_64x32x8_64x32x8_tt_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_64x64x8_32x64x8_nn.cu | 4 +- ...fp32_simt_64x64x8_32x64x8_nn_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_64x64x8_32x64x8_nt.cu | 4 +- ...fp32_simt_64x64x8_32x64x8_nt_splitk_parallel.cu | 33 ++++ .../matrix_mul_fp32_simt_64x64x8_32x64x8_tn.cu | 4 +- ...fp32_simt_64x64x8_32x64x8_tn_splitk_parallel.cu | 33 ++++ 
.../matrix_mul_fp32_simt_64x64x8_32x64x8_tt.cu | 4 +- ...fp32_simt_64x64x8_32x64x8_tt_splitk_parallel.cu | 33 ++++ .../kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn.cu | 4 +- ...l_fp32_simt_8x32x8_8x32x8_nn_splitk_parallel.cu | 33 ++++ .../kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt.cu | 4 +- ...l_fp32_simt_8x32x8_8x32x8_nt_splitk_parallel.cu | 33 ++++ .../kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn.cu | 4 +- ...l_fp32_simt_8x32x8_8x32x8_tn_splitk_parallel.cu | 33 ++++ .../kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt.cu | 4 +- ...l_fp32_simt_8x32x8_8x32x8_tt_splitk_parallel.cu | 33 ++++ .../matrix_mul_float_simt_cutlass_wrapper.cuinl | 29 ++-- dnn/src/cuda/matrix_mul/opr_impl.h | 1 + dnn/test/common/matrix_mul.cpp | 12 ++ dnn/test/common/matrix_mul.h | 1 + dnn/test/cuda/cutlass_matmul.cpp | 25 ++- 147 files changed, 2776 insertions(+), 131 deletions(-) create mode 100644 dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt_splitk_parallel.cu create mode 100644 
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn_splitk_parallel.cu 
create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn_splitk_parallel.cu create mode 100644 
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt_splitk_parallel.cu 
create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn_splitk_parallel.cu create mode 100644 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt_splitk_parallel.cu diff --git a/dnn/scripts/Makefile b/dnn/scripts/Makefile index bc28b4ce..b093b4ae 100644 --- a/dnn/scripts/Makefile +++ b/dnn/scripts/Makefile @@ -9,9 +9,9 @@ ELEMWISE_IMPL := ../src/cuda/cond_take/kimpl \ ../src/cuda/elemwise_multi_type/kimpl CUDA_CONV_IMPL := ../src/cuda/conv_bias/int8/kimpl ../src/cuda/conv_bias/int8_imma/kimpl ../src/cuda/batch_conv_bias/int8/kimpl -CUDA_MATMUL_KIMPL := ../src/cuda/matrix_mul/fp32_simt/kimpl +CUDA_MATMUL_IMPL := ../src/cuda/matrix_mul/fp32_simt/kimpl -all: ${PARAM_DEFS} ${ELEMWISE_IMPL} ${CUDA_CONV_IMPL} $(CUDA_MATMUL_KIMPL) +all: ${PARAM_DEFS} ${ELEMWISE_IMPL} ${CUDA_CONV_IMPL} $(CUDA_MATMUL_IMPL) ../src/common/elemwise/each_mode.inl: gen_elemwise_each_mode.py ./$^ $@ diff --git a/dnn/src/cuda/matrix_mul/algos.cpp b/dnn/src/cuda/matrix_mul/algos.cpp index d2c44734..fa190980 
100644 --- a/dnn/src/cuda/matrix_mul/algos.cpp +++ b/dnn/src/cuda/matrix_mul/algos.cpp @@ -37,6 +37,9 @@ MatrixMulForwardImpl::AlgoPack::AlgoPack() { for (auto&& algo : simt_float32) { all_algos.push_back(&algo); } + for (auto&& algo : simt_float32_split_k) { + all_algos.push_back(&algo); + } for (auto&& algo : all_algos) { m_all_algos_map.emplace(algo->info().desc, algo); @@ -62,6 +65,23 @@ void MatrixMulForwardImpl::AlgoPack::fill_cutlass_algos() { simt_float32.emplace_back(AlgoParam{16, 32, 8, 16, 32, 8}); simt_float32.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8}); simt_float32.emplace_back(AlgoParam{16, 128, 8, 16, 64, 8}); + simt_float32_split_k.emplace_back(AlgoParam{64, 256, 8, 32, 64, 8}); + simt_float32_split_k.emplace_back(AlgoParam{256, 64, 8, 64, 32, 8}); + simt_float32_split_k.emplace_back(AlgoParam{32, 256, 8, 16, 64, 8}); + simt_float32_split_k.emplace_back(AlgoParam{256, 32, 8, 64, 16, 8}); + simt_float32_split_k.emplace_back(AlgoParam{128, 128, 8, 32, 64, 8}); + simt_float32_split_k.emplace_back(AlgoParam{128, 64, 8, 64, 32, 8}); + simt_float32_split_k.emplace_back(AlgoParam{64, 128, 8, 32, 64, 8}); + simt_float32_split_k.emplace_back(AlgoParam{128, 32, 8, 64, 32, 8}); + simt_float32_split_k.emplace_back(AlgoParam{32, 128, 8, 32, 64, 8}); + simt_float32_split_k.emplace_back(AlgoParam{64, 64, 8, 32, 64, 8}); + simt_float32_split_k.emplace_back(AlgoParam{32, 64, 8, 32, 64, 8}); + simt_float32_split_k.emplace_back(AlgoParam{64, 32, 8, 64, 32, 8}); + simt_float32_split_k.emplace_back(AlgoParam{32, 32, 8, 32, 32, 8}); + simt_float32_split_k.emplace_back(AlgoParam{8, 32, 8, 8, 32, 8}); + simt_float32_split_k.emplace_back(AlgoParam{16, 32, 8, 16, 32, 8}); + simt_float32_split_k.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8}); + simt_float32_split_k.emplace_back(AlgoParam{16, 128, 8, 16, 64, 8}); } MatrixMulForwardImpl::AlgoPack MatrixMulForwardImpl::sm_algo_pack; diff --git a/dnn/src/cuda/matrix_mul/algos.h b/dnn/src/cuda/matrix_mul/algos.h index 
65429370..d647c661 100644 --- a/dnn/src/cuda/matrix_mul/algos.h +++ b/dnn/src/cuda/matrix_mul/algos.h @@ -43,6 +43,7 @@ public: CUDA_NAIVE, CUDA_BFLOAT16, CUDA_FLOAT32_SIMT, + CUDA_FLOAT32_SIMT_SPLIT_K, }; using Mapper = std::unordered_map; @@ -198,6 +199,31 @@ private: std::string m_name; }; +class MatrixMulForwardImpl::AlgoFloat32SIMTSplitK final : public AlgoBase { +public: + using AlgoParam = MatrixMulForwardImpl::AlgoFloat32SIMT::AlgoParam; + AlgoFloat32SIMTSplitK(AlgoParam algo_param) + : m_algo_param{algo_param}, + m_name{ssprintf("CUTLASS_FLOAT32_SIMT_SPLIT_K_%s", + m_algo_param.to_string().c_str())} {} + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + const char* name() const override { return m_name.c_str(); } + void exec(const ExecArgs& args) const override; + bool is_reproducible() const override { return true; } + MEGDNN_DECL_ALGO_TYPE(CUDA_FLOAT32_SIMT_SPLIT_K) + + std::string param() const override { + std::string ret; + serialize_write_pod(m_algo_param, ret); + return ret; + } + +private: + AlgoParam m_algo_param; + std::string m_name; +}; + class MatrixMulForwardImpl::AlgoPack : NonCopyableObj { private: AlgoBase::Mapper m_all_algos_map; @@ -216,6 +242,7 @@ public: AlgoBFloat16 bfloat16; #endif std::vector simt_float32; + std::vector simt_float32_split_k; std::vector all_algos; const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; } diff --git a/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp b/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp new file mode 100644 index 00000000..50ccb67d --- /dev/null +++ b/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp @@ -0,0 +1,76 @@ +/** + * \file dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#include "src/cuda/handle.h" +#include "src/cuda/matrix_mul/algos.h" +#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace cutlass_wrapper; + +bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available( + const SizeArgs& args) const { + auto&& param = args.opr->param(); + int m = args.layout_c.shape[0], n = args.layout_c.shape[1], + k = args.layout_a.shape[param.transposeA ? 0 : 1]; + return args.opr->param().format == param::MatrixMul::Format::DEFAULT && + args.layout_a.dtype == dtype::Float32() && + args.layout_b.dtype == dtype::Float32() && + args.layout_c.dtype == dtype::Float32() && k > std::max(m, n); +} + +size_t MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::get_workspace_in_bytes( + const SizeArgs& args) const { + size_t lda = args.layout_a.stride[0], ldb = args.layout_b.stride[0], + ldc = args.layout_c.stride[0]; + auto&& param = args.opr->param(); + int m = args.layout_c.shape[0], n = args.layout_c.shape[1], + k = args.layout_a.shape[param.transposeA ? 
0 : 1]; + GemmCoord problem_size{m, n, k}; + int split_k_slices = k / std::max(m, n); + return cutlass_matrix_mul_float32_simt_get_workspace_size( + param.transposeA, lda, param.transposeB, ldb, ldc, problem_size, + 1.f, 0.f, + GemmCoord{m_algo_param.threadblock_m, m_algo_param.threadblock_n, + m_algo_param.threadblock_k}, + GemmCoord{m_algo_param.warp_m, m_algo_param.warp_n, + m_algo_param.warp_k}, + split_k_slices); +} + +void MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::exec( + const ExecArgs& args) const { + size_t lda = args.tensor_a.layout.stride[0], + ldb = args.tensor_b.layout.stride[0], + ldc = args.tensor_c.layout.stride[0]; + auto&& param = args.opr->param(); + int m = args.tensor_c.layout.shape[0], n = args.tensor_c.layout.shape[1], + k = args.tensor_a.layout.shape[param.transposeA ? 0 : 1]; + GemmCoord problem_size{m, n, k}; + int split_k_slices = k / std::max(m, n); + auto&& stream = cuda_stream(args.opr->handle()); + int* workspace = reinterpret_cast<int*>(args.workspace.raw_ptr); + return cutlass_matrix_mul_float32_simt( + args.tensor_a.ptr<dt_float32>(), param.transposeA, lda, + args.tensor_b.ptr<dt_float32>(), param.transposeB, ldb, + args.tensor_c.ptr<dt_float32>(), ldc, workspace, problem_size, 1.f, + 0.f, + GemmCoord{m_algo_param.threadblock_m, m_algo_param.threadblock_n, + m_algo_param.threadblock_k}, + GemmCoord{m_algo_param.warp_m, m_algo_param.warp_n, + m_algo_param.warp_k}, + stream, split_k_slices); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cu b/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cu index 6f6e079c..4907b4fa 100644 --- a/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cu +++ b/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cu @@ -18,6 +18,7 @@ #if __CUDACC_VER_MAJOR__ > 9 || \ (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) #include "cutlass/gemm/device/gemm.h" +#include "cutlass/gemm/device/gemm_splitk_parallel.h" #endif #include "src/common/opr_param_defs_enumv.cuh" #include 
"src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh" @@ -62,14 +63,20 @@ void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_float32_simt( float* /* d_C */, size_t /* ldc */, int* /* workspace */, GemmCoord const& /* problem_size */, float /* alpha */, float /* beta */, const GemmCoord& /* threadblock_shape */, - const GemmCoord& /* warp_shape */, cudaStream_t /* stream */) {} + const GemmCoord& /* warp_shape */, cudaStream_t /* stream */, + int /* split_k_slices */) {} #else void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_float32_simt( const float* d_A, bool transpose_A, size_t lda, const float* d_B, bool transpose_B, size_t ldb, float* d_C, size_t ldc, int* workspace, GemmCoord const& problem_size, float alpha, float beta, const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, - cudaStream_t stream) { + cudaStream_t stream, int split_k_slices) { + static constexpr int kEpilogueElementsPerAccess = 1; + using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + float, kEpilogueElementsPerAccess, float, float>; + typename EpilogueOp::Params epilogue{alpha, beta}; + if (split_k_slices == 1) { #define cb(threadblock_m_, threadblock_n_, threadblock_k_, warp_m_, warp_n_, \ warp_k_) \ if (threadblock_shape.m() == threadblock_m_ && \ @@ -93,29 +100,67 @@ void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_float32_simt( workspace, problem_size, \ epilogue, stream); \ } - static constexpr int kEpilogueElementsPerAccess = 1; - using EpilogueOp = cutlass::epilogue::thread::LinearCombination< - float, kEpilogueElementsPerAccess, float, float>; - typename EpilogueOp::Params epilogue{alpha, beta}; - if (!transpose_A && !transpose_B) { - using LayoutA = cutlass::layout::RowMajor; - using LayoutB = cutlass::layout::RowMajor; - DISPATCH(cb) - } else if (!transpose_A && transpose_B) { - using LayoutA = cutlass::layout::RowMajor; - using LayoutB = cutlass::layout::ColumnMajor; - DISPATCH(cb) - } else if (transpose_A && !transpose_B) { - using LayoutA = 
cutlass::layout::ColumnMajor; - using LayoutB = cutlass::layout::RowMajor; - DISPATCH(cb) + if (!transpose_A && !transpose_B) { + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::RowMajor; + DISPATCH(cb) + } else if (!transpose_A && transpose_B) { + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::ColumnMajor; + DISPATCH(cb) + } else if (transpose_A && !transpose_B) { + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::RowMajor; + DISPATCH(cb) + } else { + megdnn_assert(transpose_A && transpose_B); + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::ColumnMajor; + DISPATCH(cb) + } +#undef cb } else { - megdnn_assert(transpose_A && transpose_B); - using LayoutA = cutlass::layout::ColumnMajor; - using LayoutB = cutlass::layout::ColumnMajor; - DISPATCH(cb) +#define cb(threadblock_m_, threadblock_n_, threadblock_k_, warp_m_, warp_n_, \ + warp_k_) \ + if (threadblock_shape.m() == threadblock_m_ && \ + threadblock_shape.n() == threadblock_n_ && \ + threadblock_shape.k() == threadblock_k_ && \ + warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \ + warp_shape.k() == warp_k_) { \ + using ThreadBlockShape = \ + cutlass::gemm::GemmShape; \ + using WarpShape = cutlass::gemm::GemmShape; \ + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; \ + using Gemm = cutlass::gemm::device::GemmSplitKParallel< \ + float, LayoutA, float, LayoutB, float, \ + cutlass::layout::RowMajor, float, cutlass::arch::OpClassSimt, \ + cutlass::arch::Sm50, ThreadBlockShape, WarpShape, \ + InstructionShape, EpilogueOp>; \ + return cutlass_matrix_mul_wrapper( \ + d_A, lda, d_B, ldb, d_C, ldc, workspace, problem_size, \ + epilogue, stream, split_k_slices); \ } + if (!transpose_A && !transpose_B) { + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::RowMajor; + DISPATCH(cb) + } else if (!transpose_A && transpose_B) { + using LayoutA = 
cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::ColumnMajor; + DISPATCH(cb) + } else if (transpose_A && !transpose_B) { + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::RowMajor; + DISPATCH(cb) + } else { + megdnn_assert(transpose_A && transpose_B); + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::ColumnMajor; + DISPATCH(cb) + } #undef cb + } } #endif @@ -127,7 +172,7 @@ size_t megdnn::cuda::cutlass_wrapper:: bool /* transpose_B */, size_t /* ldb */, size_t /* ldc */, GemmCoord const& /* problem_size */, float /* alpha */, float /* beta */, const GemmCoord& /* threadblock_shape */, - const GemmCoord& /* warp_shape */) { + const GemmCoord& /* warp_shape */, int /* split_k_slices */) { return 0; } #else @@ -136,7 +181,12 @@ size_t megdnn::cuda::cutlass_wrapper:: bool transpose_A, size_t lda, bool transpose_B, size_t ldb, size_t ldc, GemmCoord const& problem_size, float alpha, float beta, const GemmCoord& threadblock_shape, - const GemmCoord& warp_shape) { + const GemmCoord& warp_shape, int split_k_slices) { + static constexpr int kEpilogueElementsPerAccess = 1; + using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + float, kEpilogueElementsPerAccess, float, float>; + typename EpilogueOp::Params epilogue{alpha, beta}; + if (split_k_slices == 1) { #define cb(threadblock_m_, threadblock_n_, threadblock_k_, warp_m_, warp_n_, \ warp_k_) \ if (threadblock_shape.m() == threadblock_m_ && \ @@ -169,30 +219,80 @@ size_t megdnn::cuda::cutlass_wrapper:: split_k_slices}; \ return Gemm::get_workspace_size(arguments); \ } - static constexpr int kEpilogueElementsPerAccess = 1; - static constexpr int split_k_slices = 1; - using EpilogueOp = cutlass::epilogue::thread::LinearCombination< - float, kEpilogueElementsPerAccess, float, float>; - typename EpilogueOp::Params epilogue{alpha, beta}; - if (!transpose_A && !transpose_B) { - using LayoutA = cutlass::layout::RowMajor; - using LayoutB = 
cutlass::layout::RowMajor; - DISPATCH(cb) - } else if (!transpose_A && transpose_B) { - using LayoutA = cutlass::layout::RowMajor; - using LayoutB = cutlass::layout::ColumnMajor; - DISPATCH(cb) - } else if (transpose_A && !transpose_B) { - using LayoutA = cutlass::layout::ColumnMajor; - using LayoutB = cutlass::layout::RowMajor; - DISPATCH(cb) + if (!transpose_A && !transpose_B) { + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::RowMajor; + DISPATCH(cb) + } else if (!transpose_A && transpose_B) { + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::ColumnMajor; + DISPATCH(cb) + } else if (transpose_A && !transpose_B) { + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::RowMajor; + DISPATCH(cb) + } else { + megdnn_assert(transpose_A && transpose_B); + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::ColumnMajor; + DISPATCH(cb) + } +#undef cb } else { - megdnn_assert(transpose_A && transpose_B); - using LayoutA = cutlass::layout::ColumnMajor; - using LayoutB = cutlass::layout::ColumnMajor; - DISPATCH(cb) +#define cb(threadblock_m_, threadblock_n_, threadblock_k_, warp_m_, warp_n_, \ + warp_k_) \ + if (threadblock_shape.m() == threadblock_m_ && \ + threadblock_shape.n() == threadblock_n_ && \ + threadblock_shape.k() == threadblock_k_ && \ + warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \ + warp_shape.k() == warp_k_) { \ + using ThreadBlockShape = \ + cutlass::gemm::GemmShape; \ + using WarpShape = cutlass::gemm::GemmShape; \ + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; \ + using Gemm = cutlass::gemm::device::GemmSplitKParallel< \ + float, LayoutA, float, LayoutB, float, \ + cutlass::layout::RowMajor, float, cutlass::arch::OpClassSimt, \ + cutlass::arch::Sm50, ThreadBlockShape, WarpShape, \ + InstructionShape, EpilogueOp>; \ + using TensorRefA = cutlass::TensorRef; \ + using TensorRefB = cutlass::TensorRef; \ + using 
TensorRefC = cutlass::TensorRef; \ + using TensorRefD = cutlass::TensorRef; \ + TensorRefA tensor_A{nullptr, Gemm::LayoutA{static_cast(lda)}}; \ + TensorRefB tensor_B{nullptr, Gemm::LayoutB{static_cast(ldb)}}; \ + TensorRefC tensor_C{nullptr, Gemm::LayoutC{static_cast(ldc)}}; \ + TensorRefD tensor_D{nullptr, Gemm::LayoutC{static_cast(ldc)}}; \ + typename Gemm::Arguments arguments{problem_size, tensor_A, tensor_B, \ + tensor_C, tensor_D, epilogue, \ + split_k_slices}; \ + return Gemm::get_workspace_size(arguments); \ } + if (!transpose_A && !transpose_B) { + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::RowMajor; + DISPATCH(cb) + } else if (!transpose_A && transpose_B) { + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::ColumnMajor; + DISPATCH(cb) + } else if (transpose_A && !transpose_B) { + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::RowMajor; + DISPATCH(cb) + } else { + megdnn_assert(transpose_A && transpose_B); + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::ColumnMajor; + DISPATCH(cb) + } #undef cb + } } #endif diff --git a/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh b/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh index 3446842c..1947f773 100644 --- a/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh +++ b/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh @@ -26,19 +26,19 @@ void cutlass_matrix_mul_wrapper( typename Gemm::ElementC* d_C, size_t ldc, int* workspace, GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices = 1); void cutlass_matrix_mul_float32_simt( const float* d_A, bool transpose_A, size_t lda, const float* d_B, bool transpose_B, size_t ldb, float* d_C, size_t ldc, int* workspace, GemmCoord const& problem_size, float alpha, float beta, const GemmCoord& threadblock_shape, const GemmCoord& 
warp_shape, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices = 1); size_t cutlass_matrix_mul_float32_simt_get_workspace_size( bool transpose_A, size_t lda, bool transpose_B, size_t ldb, size_t ldc, GemmCoord const& problem_size, float alpha, float beta, - const GemmCoord& threadblock_shape, const GemmCoord& warp_shape); + const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, int split_k_slices = 1); } // namespace cutlass_wrapper } // namespace cuda diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn.cu index bdf8a6e3..38284233 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn_splitk_parallel.cu new file mode 100644 index 00000000..a4b4e0c4 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || 
(__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt.cu index 842f673e..d69e3359 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" 
#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt_splitk_parallel.cu new file mode 100644 index 00000000..e78d64a9 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, 
WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn.cu index 6b4ec037..0fe5a161 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn_splitk_parallel.cu new file mode 100644 index 00000000..964ef525 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && 
__CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt.cu index a7b3fbea..374a8d73 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic 
ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt_splitk_parallel.cu new file mode 100644 index 00000000..d5795b04 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, 
InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn.cu index d4cfccda..87d80fd4 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn_splitk_parallel.cu new file mode 100644 index 00000000..74b75ebd --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ 
>= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt.cu index e4e097cc..230297e8 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma 
GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt_splitk_parallel.cu new file mode 100644 index 00000000..30dd6ad0 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn.cu index 35c35bb9..04f80d33 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn_splitk_parallel.cu new file mode 100644 index 00000000..d702c06b --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by 
gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt.cu index 9df59543..70fd338f 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic 
ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt_splitk_parallel.cu new file mode 100644 index 00000000..a54b66d8 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn.cu index 474114d8..bed908a1 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn_splitk_parallel.cu new file mode 100644 index 00000000..91c76b87 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by 
gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt.cu index 9b39434a..a063706f 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored 
"-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt_splitk_parallel.cu new file mode 100644 index 00000000..161b9e55 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn.cu index b55f2e2f..6eae3c18 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn_splitk_parallel.cu new file mode 100644 index 00000000..c8968eb3 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by 
gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt.cu index 95d263d6..3107bc36 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic 
ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt_splitk_parallel.cu new file mode 100644 index 00000000..22f13797 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn.cu index c3a64d63..ce92e149 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn_splitk_parallel.cu new file mode 100644 index 00000000..5c51f781 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by 
gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>; +using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt.cu index 3f151dce..50f5e49f 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored 
"-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt_splitk_parallel.cu new file mode 100644 index 00000000..9ccf1190 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>; +using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn.cu index 526dfd3e..28b32c91 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn_splitk_parallel.cu new file mode 100644 index 00000000..e25e44e3 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by 
gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>; +using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt.cu index 5bc0ed7a..4e1a9f6c 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic 
ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt_splitk_parallel.cu new file mode 100644 index 00000000..f7f1fb69 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>; +using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn.cu index 816cff19..225cdf3b 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn_splitk_parallel.cu new file mode 100644 index 00000000..0050f669 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by 
gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>; +using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt.cu index e463f557..91c830c6 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored 
"-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt_splitk_parallel.cu new file mode 100644 index 00000000..c2fe7bcb --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>; +using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn.cu index db1b8796..e3ba197f 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn_splitk_parallel.cu new file mode 100644 index 00000000..fff368ea --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by 
gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>; +using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt.cu index b31e1e82..9e41f582 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored 
"-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt_splitk_parallel.cu new file mode 100644 index 00000000..511cd557 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>; +using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn.cu index 32f6726b..49de5607 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn_splitk_parallel.cu new file mode 100644 index 00000000..07296250 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by 
gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; +using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt.cu index 34d88333..872b8ded 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored 
"-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt_splitk_parallel.cu new file mode 100644 index 00000000..c7774d64 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; +using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn.cu index ba1e14b1..a6178562 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn_splitk_parallel.cu new file mode 100644 index 00000000..fccd72b0 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by 
gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; +using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt.cu index dc13d44b..e5c3e2d2 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored 
"-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt_splitk_parallel.cu new file mode 100644 index 00000000..b2fa6309 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; +using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn.cu index fcd6e1a9..a85bae3d 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn_splitk_parallel.cu new file mode 100644 index 00000000..cb855f55 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by 
gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>; +using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt.cu index d88a6bd2..4d8cddb5 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored 
"-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt_splitk_parallel.cu new file mode 100644 index 00000000..ca5408f4 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>; +using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn.cu index 6878509e..7880c3cb 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn_splitk_parallel.cu new file mode 100644 index 00000000..6a77f8c7 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by 
gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>; +using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt.cu index 121ed9e0..6e396c45 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic 
ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt_splitk_parallel.cu new file mode 100644 index 00000000..a3a9ba6c --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>; +using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn.cu index 33ddd12c..7e4b278b 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn_splitk_parallel.cu new file mode 100644 index 00000000..05437d7d --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by 
gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt.cu index d074d4f8..61f578ad 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored 
"-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt_splitk_parallel.cu new file mode 100644 index 00000000..55eac3eb --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn.cu index b4529df8..0227b521 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn_splitk_parallel.cu new file mode 100644 index 00000000..5a777e87 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by 
gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt.cu index cac23b2d..90a24e98 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic 
ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt_splitk_parallel.cu new file mode 100644 index 00000000..3b268760 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn.cu index 7539d009..ccb3a6fb 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn_splitk_parallel.cu new file mode 100644 index 00000000..57f71457 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by 
gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt.cu index 420a2271..296e163d 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored 
"-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt_splitk_parallel.cu new file mode 100644 index 00000000..7d9dae19 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn.cu index 37d9265e..c964aaf8 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn_splitk_parallel.cu new file mode 100644 index 00000000..c6be5d7b --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by 
gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt.cu index 4ff6119f..9bb8ea8a 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic 
ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt_splitk_parallel.cu new file mode 100644 index 00000000..d5f9afb3 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn.cu index 52d10eb7..18047dc2 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn_splitk_parallel.cu new file mode 100644 index 00000000..7a66c163 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by 
gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>; +using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt.cu index 1067544a..86899145 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored 
"-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt_splitk_parallel.cu new file mode 100644 index 00000000..78c0283e --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>; +using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn.cu index 8dc734f1..c65df06e 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn_splitk_parallel.cu new file mode 100644 index 00000000..fcb716b7 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by 
gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>; +using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt.cu index 965a98e2..ecd87aa3 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic 
ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt_splitk_parallel.cu new file mode 100644 index 00000000..0afda5f1 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>; +using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn.cu index 11829eaf..ebb5a2f3 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn_splitk_parallel.cu new file mode 100644 index 00000000..a678b28c --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by 
gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>; +using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt.cu index f302afbf..f330b6d7 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored 
"-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt_splitk_parallel.cu new file mode 100644 index 00000000..44e8a1b1 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>; +using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn.cu index 04753040..db6e22b9 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn_splitk_parallel.cu new file mode 100644 index 00000000..55cee82c --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by 
gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>; +using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt.cu index 11733646..161e1337 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored 
"-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt_splitk_parallel.cu new file mode 100644 index 00000000..9269ac05 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>; +using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn.cu index b2c77ea6..929bcdc6 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn_splitk_parallel.cu new file mode 100644 index 00000000..71aa87a1 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by 
gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 8>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt.cu index 7aac31e0..cf467004 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored 
"-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt_splitk_parallel.cu new file mode 100644 index 00000000..88f5c826 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 8>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn.cu index 19ff9eb4..9fea5074 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn_splitk_parallel.cu new file mode 100644 index 00000000..86c8a6e0 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by 
gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 8>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt.cu index 601b715b..9976be6b 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored 
"-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt_splitk_parallel.cu new file mode 100644 index 00000000..b452d8ed --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 8>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn.cu index bd4dbe66..32175db5 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn_splitk_parallel.cu new file mode 100644 index 00000000..a19eb570 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by 
gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 8>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt.cu index b78e7892..1ead9917 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored 
"-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt_splitk_parallel.cu new file mode 100644 index 00000000..71c9cab8 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_nt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 8>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn.cu index bffbaca9..919aad63 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn_splitk_parallel.cu new file mode 100644 index 00000000..541af533 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by 
gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 8>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt.cu index 001247ca..f4928b1e 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic 
ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt_splitk_parallel.cu new file mode 100644 index 00000000..593a73b6 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x128x8_32x64x8_tt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 8>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn.cu index 4b744197..2ff883c0 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn_splitk_parallel.cu new file mode 100644 index 00000000..c081e366 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by 
gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 256, 8>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt.cu index 5c43eb1a..f6f214de 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored 
"-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt_splitk_parallel.cu new file mode 100644 index 00000000..997a5b23 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_nt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 256, 8>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn.cu index 203bd895..67a7c764 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn_splitk_parallel.cu new file mode 100644 index 00000000..f918819a --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by 
gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 256, 8>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt.cu index c353c7a2..a27a779c 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic 
ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt_splitk_parallel.cu new file mode 100644 index 00000000..5d8c09c7 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x256x8_32x64x8_tt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 256, 8>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn.cu index 83930042..48bd2b06 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn_splitk_parallel.cu new file mode 100644 index 00000000..c81cadd1 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by 
gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 8>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt.cu index 30e83753..465b736e 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored 
"-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt_splitk_parallel.cu new file mode 100644 index 00000000..e31365d5 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_nt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 8>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn.cu index 9dbb5d1b..023bd242 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn_splitk_parallel.cu new file mode 100644 index 00000000..5b89b5f6 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by 
gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 8>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt.cu index fa6ac73b..927984eb 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored 
"-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt_splitk_parallel.cu new file mode 100644 index 00000000..f4638ce1 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x32x8_64x32x8_tt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 8>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn.cu index c2e26af4..2bef3b7d 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn_splitk_parallel.cu new file mode 100644 index 00000000..1ed27985 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by 
gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 8>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt.cu index 75404041..576d6663 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored 
"-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt_splitk_parallel.cu new file mode 100644 index 00000000..a600832e --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_nt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 8>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn.cu index ab0a848d..c4414a1c 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn_splitk_parallel.cu new file mode 100644 index 00000000..7f48f96f --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by 
gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 8>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt.cu index 261779d8..a4831f90 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored 
"-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt_splitk_parallel.cu new file mode 100644 index 00000000..ac2dea60 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_64x64x8_32x64x8_tt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 8>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn.cu index 8312b40c..ce7de93b 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn_splitk_parallel.cu new file mode 100644 index 00000000..b8319cd7 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// 
ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<8, 32, 8>; +using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt.cu index 6ad1b86b..254272c0 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include 
"src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::RowMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt_splitk_parallel.cu new file mode 100644 index 00000000..0b3cdc14 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_nt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<8, 32, 8>; +using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( + const typename 
Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn.cu index 0f5579bd..ea150057 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn_splitk_parallel.cu new file mode 100644 index 00000000..bd15a9c3 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tn_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC 
diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::RowMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<8, 32, 8>; +using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt.cu index ea2147c6..58b093d4 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt.cu +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt.cu @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" using 
LayoutA = cutlass::layout::ColumnMajor; @@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( int* workspace, cutlass::gemm::GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); + cudaStream_t stream, int split_k_slices); + #pragma GCC diagnostic pop #endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt_splitk_parallel.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt_splitk_parallel.cu new file mode 100644 index 00000000..e4a2e18e --- /dev/null +++ b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_8x32x8_8x32x8_tt_splitk_parallel.cu @@ -0,0 +1,33 @@ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// generated by gen_cutlass_matrix_mul_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" + +using LayoutA = cutlass::layout::ColumnMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using ThreadBlockShape = cutlass::gemm::GemmShape<8, 32, 8>; +using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +using EpilogueOp = cutlass::epilogue::thread::LinearCombination; +using Gemm = cutlass::gemm::device::GemmSplitKParallel< + float, LayoutA, + float, LayoutB, + float, cutlass::layout::RowMajor, float, + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>; +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Gemm::ElementA* d_A, size_t lda, + const typename Gemm::ElementB* d_B, 
size_t ldb, + typename Gemm::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename Gemm::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); + +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl b/dnn/src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl index d58c8048..4610a8f6 100644 --- a/dnn/src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl +++ b/dnn/src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl @@ -11,6 +11,7 @@ * implied. */ #include "cutlass/gemm/device/gemm.h" +#include "cutlass/gemm/device/gemm_splitk_parallel.h" #include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh" using namespace megdnn; @@ -24,17 +25,21 @@ void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( typename Gemm::ElementC* d_C, size_t ldc, int* workspace, GemmCoord const& problem_size, typename Gemm::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream) { - typename Gemm::TensorRefA tensor_a{ - const_cast(d_A), - typename Gemm::LayoutA{static_cast(lda)}}; - typename Gemm::TensorRefB tensor_b{ - const_cast(d_B), - typename Gemm::LayoutB{static_cast(ldb)}}; - typename Gemm::TensorRefC tensor_c{ - nullptr, typename Gemm::LayoutC{static_cast(ldc)}}; - typename Gemm::TensorRefD tensor_d{ - d_C, typename Gemm::LayoutC{static_cast(ldc)}}; + cudaStream_t stream, int split_k_slices) { + using TensorRefA = cutlass::TensorRef; + using TensorRefB = cutlass::TensorRef; + using TensorRefC = cutlass::TensorRef; + using TensorRefD = + cutlass::TensorRef; + TensorRefA tensor_a{const_cast(d_A), + typename Gemm::LayoutA{static_cast(lda)}}; + TensorRefB tensor_b{const_cast(d_B), + typename Gemm::LayoutB{static_cast(ldb)}}; + TensorRefC tensor_c{nullptr, typename Gemm::LayoutC{static_cast(ldc)}}; + TensorRefD tensor_d{d_C, typename 
Gemm::LayoutC{static_cast(ldc)}}; typename Gemm::Arguments arguments{problem_size, tensor_a, @@ -42,7 +47,7 @@ void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( tensor_c, tensor_d.non_const_ref(), epilogue, - 1}; + split_k_slices}; Gemm gemm_op; cutlass_check(gemm_op.initialize(arguments, workspace)); cutlass_check(gemm_op(stream)); diff --git a/dnn/src/cuda/matrix_mul/opr_impl.h b/dnn/src/cuda/matrix_mul/opr_impl.h index 56215411..b554a9ea 100644 --- a/dnn/src/cuda/matrix_mul/opr_impl.h +++ b/dnn/src/cuda/matrix_mul/opr_impl.h @@ -42,6 +42,7 @@ public: class AlgoBFloat16; #endif class AlgoFloat32SIMT; + class AlgoFloat32SIMTSplitK; class AlgoPack; static const AlgoPack& algo_pack() { diff --git a/dnn/test/common/matrix_mul.cpp b/dnn/test/common/matrix_mul.cpp index 4cbe5c77..00b63cdd 100644 --- a/dnn/test/common/matrix_mul.cpp +++ b/dnn/test/common/matrix_mul.cpp @@ -117,6 +117,18 @@ std::vector matrix_mul::get_matmul_args() { return args; } +std::vector matrix_mul::get_matmul_args_split_k() { + std::vector args = get_matmul_args(); + for (auto iter = args.begin(); iter < args.end();) { + if (iter->k <= iter->n) { + iter = args.erase(iter); + } else { + iter++; + } + } + return args; +} + std::vector matrix_mul::get_batched_matmul_args_mask( uint8_t mask) { std::vector args; diff --git a/dnn/test/common/matrix_mul.h b/dnn/test/common/matrix_mul.h index d52f1814..ab3057e0 100644 --- a/dnn/test/common/matrix_mul.h +++ b/dnn/test/common/matrix_mul.h @@ -53,6 +53,7 @@ struct TestArg { std::vector get_matmul_args_no_mask(); std::vector get_matmul_args_mask(uint8_t mask); std::vector get_matmul_args(); +std::vector get_matmul_args_split_k(); std::vector get_batched_matmul_args_mask(uint8_t mask); std::vector get_batched_matmul_args(); std::vector get_batched_matmul_broadcast_args(); diff --git a/dnn/test/cuda/cutlass_matmul.cpp b/dnn/test/cuda/cutlass_matmul.cpp index 55b13c28..ae04cd02 100644 --- a/dnn/test/cuda/cutlass_matmul.cpp +++ 
b/dnn/test/cuda/cutlass_matmul.cpp @@ -21,7 +21,6 @@ #include "test/cuda/fixture.h" #include "test/cuda/utils.h" - #if CUDA_VERSION >= 9020 namespace megdnn { namespace test { @@ -284,6 +283,15 @@ TEST_F(CUDA, CUTLASS_GEMM_MULTI_BATCHSIZE) { param::MatrixMul::Format::DEFAULT); } +TEST_F(CUDA, CUTLASS_GEMM_SPLIT_K_MULTI_BATCHSIZE) { + auto args = matrix_mul::get_matmul_args_no_mask(); + test_multibatchsize( + handle_cuda(), dtype::Float32(), dtype::Float32(), dtype::Float32(), + "CUTLASS_FLOAT32_SIMT_SPLIT_K_128X128X8_32X64X8", args, + param::MatrixMul::Format::DEFAULT, + [](const matrix_mul::TestArg& arg) { return arg.k <= arg.n; }); +} + #define MEGDNN_FOREACH_CUTLASS_KERNEL(cb) \ cb(1, 64, 256, 8, 32, 64, 8); \ cb(2, 256, 64, 8, 64, 32, 8); \ @@ -315,6 +323,21 @@ TEST_F(CUDA, CUTLASS_GEMM_MULTI_BATCHSIZE) { MEGDNN_FOREACH_CUTLASS_KERNEL(cb) #undef cb + +#define cb(name, tbm, tbn, tbk, wm, wn, wk) \ + TEST_F(CUDA, CUTLASS_GEMM_SPLIT_K_##name) { \ + matrix_mul::check_matrix_mul( \ + dtype::Float32(), dtype::Float32(), dtype::Float32(), \ + handle_cuda(), \ + "CUTLASS_FLOAT32_SIMT_SPLIT_K_" #tbm "X" #tbn "X" #tbk "_" #wm \ + "X" #wn "X" #wk, \ + param::MatrixMul::Format::DEFAULT, 8, 1e-3, \ + matrix_mul::get_matmul_args_split_k()); \ + } + +MEGDNN_FOREACH_CUTLASS_KERNEL(cb) + +#undef cb #undef MEGDNN_FOREACH_CUTLASS_KERNEL #if MEGDNN_WITH_BENCHMARK