From ff0e6be7b97f721e85be13996ceee0b8aec87cbb Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Mon, 16 Aug 2021 21:43:11 +0800
Subject: [PATCH] fix(dnn/cuda): fix cutlass tensorop kernels

do not compile cutlass tensorop kernels when the CUDA version is less than 10.2

GitOrigin-RevId: d4c37d5f41ff6e20cf712149067594559529e6cf
---
 dnn/scripts/cutlass_generator/conv2d_operation.py  | 22 +++++++++----
 dnn/scripts/cutlass_generator/gemm_operation.py    | 37 +++++++++++++++-------
 dnn/scripts/cutlass_generator/generator.py         | 28 ++++++++++++----
 dnn/scripts/cutlass_generator/manifest.py          | 12 +++++--
 dnn/src/cuda/cutlass/initialize_all.cu             | 16 +++++++---
 dnn/src/cuda/matrix_mul/algos.cpp                  |  4 +++
 dnn/src/cuda/matrix_mul/algos.h                    | 34 +++++++++++++++++++-
 .../cuda/matrix_mul/cutlass_float16_tensorop.cpp   |  4 +--
 .../cutlass_float16_tensorop_split_k.cpp           | 10 +++---
 dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp   |  2 +-
 .../cutlass_float32_simt_gemv_batched_strided.cpp  |  2 +-
 .../matrix_mul/cutlass_float32_simt_split_k.cpp    |  8 ++---
 .../cuda/matrix_mul/cutlass_matrix_mul_base.cpp    |  2 +-
 dnn/test/cuda/cutlass_matmul.cpp                   |  5 ++-
 14 files changed, 139 insertions(+), 47 deletions(-)

diff --git a/dnn/scripts/cutlass_generator/conv2d_operation.py b/dnn/scripts/cutlass_generator/conv2d_operation.py
index a931a8fb..735f4d94 100644
--- a/dnn/scripts/cutlass_generator/conv2d_operation.py
+++ b/dnn/scripts/cutlass_generator/conv2d_operation.py
@@ -19,7 +19,8 @@ class Conv2dOperation:
   #
   def __init__(self, conv_kind, conv_type, arch, tile_description, src, flt, bias, dst, element_epilogue, \
     epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4, \
-    need_load_from_const = True, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False):
+    need_load_from_const = True, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False, \
+    required_cuda_ver_major = 9, required_cuda_ver_minor = 2):
 
     self.operation_kind = OperationKind.Conv2d
     self.conv_kind = conv_kind
@@ -36,6 +37,9 @@ class Conv2dOperation:
     self.need_load_from_const = need_load_from_const
     self.implicit_gemm_mode = implicit_gemm_mode
     self.without_shared_load = without_shared_load
+    self.required_cuda_ver_major = required_cuda_ver_major
+    self.required_cuda_ver_minor = required_cuda_ver_minor
+
   #
   def accumulator_type(self):
     accum = self.tile_description.math_instruction.element_accumulator
@@ -320,7 +324,8 @@ using Deconvolution =
 #
 def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_layout, dst_type, min_cc, src_align = 32, flt_align = 32, dst_align = 128, \
-                  skip_unity_kernel = False, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False):
+                  skip_unity_kernel = False, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False, required_cuda_ver_major = 9, \
+                  required_cuda_ver_minor = 2):
   operations = []
 
   element_epilogue = DataType.f32
@@ -407,10 +412,10 @@ def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_lay
       bias = TensorDescription(bias_type, dst_layout, max(1, int(32 / DataTypeSize[bias_type])))
       dst = TensorDescription(dst_type, dst_layout, int(dst_align / DataTypeSize[dst_type]))
 
-      new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, True, implicit_gemm_mode, without_shared_load)
+      new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue,
epilogue, swizzling_functor, True, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor) operations.append(new_operation) if not skip_unity_kernel: - new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, False, implicit_gemm_mode, without_shared_load) + new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, False, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor) operations.append(new_operation) return operations @@ -545,7 +550,7 @@ class EmitConvSingleKernelWrapper(): self.convolution_name = "Deconvolution" self.header_template = """ -#if !MEGDNN_TEGRA_X1 +#if __CUDACC_VER_MAJOR__ > ${required_cuda_ver_major} || (__CUDACC_VER_MAJOR__ == ${required_cuda_ver_major} && __CUDACC_VER_MINOR__ >= ${required_cuda_ver_minor}) // ignore warning of cutlass #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-parameter" @@ -589,14 +594,17 @@ void initialize_${operation_name}(Manifest &manifest) { else: self.kernel_path = os.path.join(self.kernel_path, "%s.cu" % self.operation.procedural_name()) self.kernel_file = open(self.kernel_path, "w") - self.kernel_file.write(self.header_template) + self.kernel_file.write(SubstituteTemplate(self.header_template, { + 'required_cuda_ver_major': str(self.operation.required_cuda_ver_major), + 'required_cuda_ver_minor': str(self.operation.required_cuda_ver_minor), + })) return self # def emit(self): self.kernel_file.write(SubstituteTemplate(self.instance_template, { 'operation_instance': self.instance_emitter.emit(self.operation), - })) + })) # emit manifest helper manifest = SubstituteTemplate(self.manifest_template, { diff --git a/dnn/scripts/cutlass_generator/gemm_operation.py b/dnn/scripts/cutlass_generator/gemm_operation.py index a6f33b18..382d6aaa 100644 --- a/dnn/scripts/cutlass_generator/gemm_operation.py +++ b/dnn/scripts/cutlass_generator/gemm_operation.py @@ -23,7 +23,8 @@ from library import * class GemmOperation: # def __init__(self, gemm_kind, arch, tile_description, A, B, C, element_epilogue, \ - epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity8): + epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity8, \ + required_cuda_ver_major = 9, required_cuda_ver_minor = 2): self.operation_kind = OperationKind.Gemm self.arch = arch @@ -35,6 +36,9 @@ class GemmOperation: self.element_epilogue = element_epilogue self.epilogue_functor = epilogue_functor self.swizzling_functor = swizzling_functor + self.required_cuda_ver_major = required_cuda_ver_major + self.required_cuda_ver_minor = required_cuda_ver_minor + # def is_complex(self): @@ -161,7 +165,8 @@ class GemmOperation: # class GemvBatchedStridedOperation: # - def __init__(self, gemm_kind, arch, math_inst, threadblock_shape, thread_shape, A, B, C): + def __init__(self, gemm_kind, arch, math_inst, threadblock_shape, thread_shape, A, B, C, \ + required_cuda_ver_major = 9, required_cuda_ver_minor = 2): self.operation_kind = OperationKind.Gemm self.arch = arch @@ -172,6 +177,8 @@ class GemvBatchedStridedOperation: self.A = A self.B = B self.C = C + self.required_cuda_ver_major = required_cuda_ver_major + self.required_cuda_ver_minor = required_cuda_ver_minor # def accumulator_type(self): @@ -243,7 +250,7 @@ class GemvBatchedStridedOperation: 
return self.procedural_name() # -def GeneratesGemm(tile, data_type, layout_a, layout_b, layout_c, min_cc, align_a = 32, align_b = 32, align_c = 32): +def GeneratesGemm(tile, data_type, layout_a, layout_b, layout_c, min_cc, align_a = 32, align_b = 32, align_c = 32, required_cuda_ver_major = 9, required_cuda_ver_minor = 2): operations = [] swizzling_functor = SwizzlingFunctor.Identity1 @@ -261,20 +268,23 @@ def GeneratesGemm(tile, data_type, layout_a, layout_b, layout_c, min_cc, align_a B = TensorDescription(element_b, layout_b, int(align_b//DataTypeSize[element_b])) C = TensorDescription(element_c, layout_c, int(align_c//DataTypeSize[element_c])) operations.append(GemmOperation(GemmKind.Gemm, min_cc, tile, A, B, C, \ - element_epilogue, epilogue, swizzling_functor)) + element_epilogue, epilogue, swizzling_functor, \ + required_cuda_ver_major, required_cuda_ver_minor)) operations.append(GemmOperation(GemmKind.SplitKParallel, min_cc, tile, A, B, C, \ - element_epilogue, epilogue, swizzling_functor)) + element_epilogue, epilogue, swizzling_functor, \ + required_cuda_ver_major, required_cuda_ver_minor)) return operations def GeneratesGemv(math_inst, threadblock_shape, thread_shape, data_type, layout_a, layout_b, layout_c, min_cc, \ - align_a = 32, align_b = 32, align_c = 32): + align_a = 32, align_b = 32, align_c = 32, \ + required_cuda_ver_major = 9, required_cuda_ver_minor = 2): element_a, element_b, element_c, element_epilogue = data_type A = TensorDescription(element_a, layout_a, int(align_a//DataTypeSize[element_a])) B = TensorDescription(element_b, layout_b, int(align_b//DataTypeSize[element_b])) C = TensorDescription(element_c, layout_c, int(align_c//DataTypeSize[element_c])) return GemvBatchedStridedOperation(GemmKind.GemvBatchedStrided, min_cc, math_inst, threadblock_shape, thread_shape, \ - A, B, C) + A, B, C, required_cuda_ver_major, required_cuda_ver_minor) ################################################################################################### # @@ -1025,7 +1035,7 @@ class EmitGemmSingleKernelWrapper: self.instance_emitter = instance_emitters[self.operation.gemm_kind] self.header_template = """ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +#if __CUDACC_VER_MAJOR__ > ${required_cuda_ver_major} || (__CUDACC_VER_MAJOR__ == ${required_cuda_ver_major} && __CUDACC_VER_MINOR__ >= ${required_cuda_ver_minor}) // ignore warning of cutlass #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-parameter" @@ -1065,7 +1075,10 @@ void initialize_${operation_name}(Manifest &manifest) { def __enter__(self): self.kernel_path = os.path.join(self.kernel_path, "%s.cu" % self.operation.procedural_name()) self.kernel_file = open(self.kernel_path, "w") - self.kernel_file.write(self.header_template) + self.kernel_file.write(SubstituteTemplate(self.header_template, { + 'required_cuda_ver_major': str(self.operation.required_cuda_ver_major), + 'required_cuda_ver_minor': str(self.operation.required_cuda_ver_minor), + })) return self # @@ -1109,7 +1122,7 @@ template void megdnn::cuda::cutlass_wrapper:: self.instance_emitter = EmitGemvBatchedStridedInstance() self.header_template = """ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +#if __CUDACC_VER_MAJOR__ > ${required_cuda_ver_major} || (__CUDACC_VER_MAJOR__ == ${required_cuda_ver_major} && __CUDACC_VER_MINOR__ >= ${required_cuda_ver_minor}) // ignore warning of cutlass #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-parameter" 
@@ -1136,7 +1149,9 @@ ${operation_instance} self.kernel_file = open(self.kernel_path, "w") self.kernel_file.write(SubstituteTemplate(self.header_template, { 'wrapper_path': self.wrapper_path, - })) + 'required_cuda_ver_major': str(self.operation.required_cuda_ver_major), + 'required_cuda_ver_minor': str(self.operation.required_cuda_ver_minor), + })) return self # diff --git a/dnn/scripts/cutlass_generator/generator.py b/dnn/scripts/cutlass_generator/generator.py index cd7f810d..5a4d3d35 100644 --- a/dnn/scripts/cutlass_generator/generator.py +++ b/dnn/scripts/cutlass_generator/generator.py @@ -217,6 +217,9 @@ def GenerateConv2d_TensorOp_8816(args): min_cc = 75 max_cc = 1024 + cuda_major = 10 + cuda_minor = 2 + for math_inst in math_instructions: for layout in layouts: for dst_type, dst_layout in zip(dst_types, dst_layouts): @@ -234,7 +237,7 @@ def GenerateConv2d_TensorOp_8816(args): ] operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1], dst_layout, dst_type, min_cc, 128, 128, 64, - False, ImplicitGemmMode.GemmTN, True) + False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor) else: assert dst_layout == LayoutType.TensorNC4HW4 tile_descriptions = [ @@ -250,7 +253,7 @@ def GenerateConv2d_TensorOp_8816(args): ] operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1], dst_layout, dst_type, min_cc, 128, 128, 64, - False) + False, ImplicitGemmMode.GemmNT, False, cuda_major, cuda_minor) return operations @@ -281,6 +284,9 @@ def GenerateConv2d_TensorOp_8832(args): min_cc = 75 max_cc = 1024 + cuda_major = 10 + cuda_minor = 2 + for math_inst in math_instructions: for layout in layouts: for dst_layout in dst_layouts: @@ -293,7 +299,7 @@ def GenerateConv2d_TensorOp_8832(args): ] operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1], dst_layout, dst_type, min_cc, 128, 128, 64, - False, ImplicitGemmMode.GemmTN, True) + False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor) layouts_nhwc = [ (LayoutType.TensorNHWC, LayoutType.TensorNC8HW8, 32), @@ -316,12 +322,12 @@ def GenerateConv2d_TensorOp_8832(args): for tile in tile_descriptions: operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout, dst_type, min_cc, layout[2], layout[2], 32, - False, ImplicitGemmMode.GemmTN, False) + False, ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor) if tile.threadblock_shape[1] == 32 or tile.threadblock_shape[1] == 64: dst_align = 32 if tile.threadblock_shape[1] == 32 else 64 operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout, dst_type, min_cc, layout[2], layout[2], dst_align, - False, ImplicitGemmMode.GemmTN, True) + False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor) return operations @@ -624,6 +630,8 @@ def GeneratesGemm_TensorOp_1688(args): alignment_constraints = [8, 4, 2, #1 ] + cuda_major = 10 + cuda_minor = 2 operations = [] for math_inst in math_instructions: @@ -655,7 +663,9 @@ def GeneratesGemm_TensorOp_1688(args): min_cc, \ align * 16, \ align * 16, \ - align * 16) + align * 16, \ + cuda_major, \ + cuda_minor) return operations # @@ -686,6 +696,8 @@ def GeneratesGemm_TensorOp_884(args): alignment_constraints = [8, 4, 2, # 1 ] + cuda_major = 10 + cuda_minor = 2 operations = [] for math_inst in math_instructions: @@ -717,7 +729,9 @@ def GeneratesGemm_TensorOp_884(args): min_cc, \ align * 16, \ align * 16, \ - align * 16) + align * 16, \ + cuda_major, \ + cuda_minor) return operations diff --git 
a/dnn/scripts/cutlass_generator/manifest.py b/dnn/scripts/cutlass_generator/manifest.py index 88c9fc6d..33aafb8a 100644 --- a/dnn/scripts/cutlass_generator/manifest.py +++ b/dnn/scripts/cutlass_generator/manifest.py @@ -351,6 +351,13 @@ void initialize_all(Manifest &manifest) { ################################################################################################### def GenerateManifest(args, operations, output_dir): + assert isinstance(operations, list) + if len(operations) == 0: + return + op = operations[0] + required_cuda_ver_major = op.required_cuda_ver_major + required_cuda_ver_minor = op.required_cuda_ver_minor + manifest_path = os.path.join(output_dir, "all_%s_%s_operations.cu" % (args.operations, args.type)) f = open(manifest_path, "w") f.write(""" @@ -358,7 +365,7 @@ def GenerateManifest(args, operations, output_dir): Generated by generator.py - Do not edit. */ -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +#if __CUDACC_VER_MAJOR__ > %s || (__CUDACC_VER_MAJOR__ == %s && __CUDACC_VER_MINOR__ >= %s) #include "cutlass/cutlass.h" #include "src/cuda/cutlass/library.h" @@ -367,7 +374,8 @@ def GenerateManifest(args, operations, output_dir): namespace cutlass { namespace library { -""") +""" % (str(required_cuda_ver_major), str(required_cuda_ver_major), str(required_cuda_ver_minor))) + for op in operations: f.write("void initialize_%s(Manifest &manifest);\n" % op.procedural_name()) diff --git a/dnn/src/cuda/cutlass/initialize_all.cu b/dnn/src/cuda/cutlass/initialize_all.cu index e836dd76..89d32c5d 100644 --- a/dnn/src/cuda/cutlass/initialize_all.cu +++ b/dnn/src/cuda/cutlass/initialize_all.cu @@ -44,26 +44,34 @@ namespace cutlass { namespace library { ///////////////////////////////////////////////////////////////////////////////////////////////// +#if ((__CUDACC_VER_MAJOR__ > 10) || \ + (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2)) +#define CUTLASS_ARCH_MMA_SM75_SUPPORTED 1 +#endif #if __CUDACC_VER_MAJOR__ > 9 || \ (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) void initialize_all_gemm_simt_operations(Manifest& manifest); +void initialize_all_conv2d_simt_operations(Manifest& manifest); +void initialize_all_deconv_simt_operations(Manifest& manifest); +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED void initialize_all_gemm_tensorop884_operations(Manifest& manifest); void initialize_all_gemm_tensorop1688_operations(Manifest& manifest); -void initialize_all_conv2d_simt_operations(Manifest& manifest); void initialize_all_conv2d_tensorop8816_operations(Manifest& manifest); void initialize_all_conv2d_tensorop8832_operations(Manifest& manifest); -void initialize_all_deconv_simt_operations(Manifest& manifest); +#endif void initialize_all(Manifest& manifest) { initialize_all_gemm_simt_operations(manifest); + initialize_all_conv2d_simt_operations(manifest); + initialize_all_deconv_simt_operations(manifest); +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED initialize_all_gemm_tensorop884_operations(manifest); initialize_all_gemm_tensorop1688_operations(manifest); - initialize_all_conv2d_simt_operations(manifest); initialize_all_conv2d_tensorop8816_operations(manifest); initialize_all_conv2d_tensorop8832_operations(manifest); - initialize_all_deconv_simt_operations(manifest); +#endif } #else diff --git a/dnn/src/cuda/matrix_mul/algos.cpp b/dnn/src/cuda/matrix_mul/algos.cpp index e3bb328d..ada31b74 100644 --- a/dnn/src/cuda/matrix_mul/algos.cpp +++ 
b/dnn/src/cuda/matrix_mul/algos.cpp @@ -43,6 +43,7 @@ MatrixMulForwardImpl::AlgoPack::AlgoPack() { for (auto&& algo : simt_float32_gemv_batched_strided) { all_algos.push_back(&algo); } +#if CUDA_VERSION >= 10020 for (auto&& algo : tensorop_float16) { all_algos.push_back(&algo); } @@ -50,6 +51,7 @@ MatrixMulForwardImpl::AlgoPack::AlgoPack() { all_algos.push_back(&algo); } #endif +#endif all_algos.push_back(&naive); for (auto&& algo : all_algos) { @@ -107,7 +109,9 @@ void MatrixMulForwardImpl::AlgoPack::fill_cutlass_algos() { #define cb(...) \ tensorop_float16.emplace_back(AlgoParam{__VA_ARGS__}); \ tensorop_float16_split_k.emplace_back(AlgoParam{__VA_ARGS__}); +#if CUDA_VERSION >= 10020 FOREACH_CUTLASS_MATMUL_F16_SHAPES(cb) +#endif #undef cb #undef FOREACH_CUTLASS_MATMUL_F16_SHAPES } diff --git a/dnn/src/cuda/matrix_mul/algos.h b/dnn/src/cuda/matrix_mul/algos.h index 34c7cbf3..aa2807eb 100644 --- a/dnn/src/cuda/matrix_mul/algos.h +++ b/dnn/src/cuda/matrix_mul/algos.h @@ -241,6 +241,20 @@ public: return AlgoAttribute::REPRODUCIBLE; } MEGDNN_DECL_ALGO_TYPE(CUDA_FLOAT32_SIMT) + std::string param() const override { + std::string ret; + // FIXME: algo param compatible with old version, to avoid fastrun cache error + struct AlgoParam_ { + int threadblock_m, threadblock_n, threadblock_k; + int warp_m, warp_n, warp_k; + }; + AlgoParam_ algo_param{ + m_algo_param.threadblock_m, m_algo_param.threadblock_n, + m_algo_param.threadblock_k, m_algo_param.warp_m, + m_algo_param.warp_n, m_algo_param.warp_k}; + serialize_write_pod(algo_param, ret); + return ret; + } private: void do_exec(const ExecArgs& args) const override; @@ -263,6 +277,21 @@ public: AlgoAttribute::USABLE_DEPEND_ON_SHAPE; } MEGDNN_DECL_ALGO_TYPE(CUDA_FLOAT32_SIMT_SPLIT_K) + std::string param() const override { + std::string ret; + // FIXME: algo param compatible with old version, to avoid fastrun cache + // error + struct AlgoParam_ { + int threadblock_m, threadblock_n, threadblock_k; + int warp_m, warp_n, warp_k; + }; + AlgoParam_ algo_param{ + m_algo_param.threadblock_m, m_algo_param.threadblock_n, + m_algo_param.threadblock_k, m_algo_param.warp_m, + m_algo_param.warp_n, m_algo_param.warp_k}; + serialize_write_pod(algo_param, ret); + return ret; + } private: void do_exec(const ExecArgs& args) const override; @@ -297,6 +326,7 @@ private: std::string m_name; }; +#if CUDA_VERSION >= 10020 class MatrixMulForwardImpl::AlgoFloat16TensorOp final : public AlgoCutlassMatrixMulBase { public: @@ -345,7 +375,7 @@ private: int min_alignment_requirement() const override { return 2; } std::string m_name; }; - +#endif #endif class MatrixMulForwardImpl::AlgoPack : NonCopyableObj { @@ -370,9 +400,11 @@ public: std::vector simt_float32_split_k; std::vector simt_float32_gemv_batched_strided; +#if CUDA_VERSION >= 10020 std::vector tensorop_float16; std::vector tensorop_float16_split_k; #endif +#endif std::vector all_algos; const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; } diff --git a/dnn/src/cuda/matrix_mul/cutlass_float16_tensorop.cpp b/dnn/src/cuda/matrix_mul/cutlass_float16_tensorop.cpp index ea2c05e9..1dcf6d84 100644 --- a/dnn/src/cuda/matrix_mul/cutlass_float16_tensorop.cpp +++ b/dnn/src/cuda/matrix_mul/cutlass_float16_tensorop.cpp @@ -2,7 +2,7 @@ * \file dnn/src/cuda/matrix_mul/cutlass_float16_tensorop.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * - * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
* * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an @@ -15,7 +15,7 @@ #include "src/cuda/matrix_mul/algos.h" #include "src/cuda/utils.h" -#if CUDA_VERSION >= 9020 +#if CUDA_VERSION >= 10020 using namespace megdnn; using namespace cuda; diff --git a/dnn/src/cuda/matrix_mul/cutlass_float16_tensorop_split_k.cpp b/dnn/src/cuda/matrix_mul/cutlass_float16_tensorop_split_k.cpp index 18211251..c9b9adf5 100644 --- a/dnn/src/cuda/matrix_mul/cutlass_float16_tensorop_split_k.cpp +++ b/dnn/src/cuda/matrix_mul/cutlass_float16_tensorop_split_k.cpp @@ -2,7 +2,7 @@ * \file dnn/src/cuda/matrix_mul/cutlass_float16_tensorop_split_k.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * - * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an @@ -15,14 +15,14 @@ #include "src/cuda/matrix_mul/algos.h" #include "src/cuda/utils.h" -#if CUDA_VERSION >= 9020 +#if CUDA_VERSION >= 10020 using namespace megdnn; using namespace cuda; bool MatrixMulForwardImpl::AlgoFloat16TensorOpSplitK::is_available( const SizeArgs& args) const { auto&& param = args.opr->param(); - int n = args.layout_c.shape[1], + int m = args.layout_c.shape[0], n = args.layout_c.shape[1], k = args.layout_a.shape[param.transposeA ? 0 : 1]; bool available = args.opr->param().format == param::MatrixMul::Format::DEFAULT && @@ -32,8 +32,8 @@ bool MatrixMulForwardImpl::AlgoFloat16TensorOpSplitK::is_available( auto&& device_prop = cuda::current_device_prop(); int y_grid_limit = device_prop.maxGridSize[1]; // limit y grid - available &= ((n + m_algo_param.threadblock_n - 1) / - m_algo_param.threadblock_n <= + available &= ((m + m_algo_param.threadblock_m - 1) / + m_algo_param.threadblock_m <= y_grid_limit); if (m_algo_param.instruction_m == 8 && m_algo_param.instruction_n == 8 && m_algo_param.instruction_k == 4) { diff --git a/dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp b/dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp index 63d9faef..7278202d 100644 --- a/dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp +++ b/dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp @@ -2,7 +2,7 @@ * \file dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * - * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an diff --git a/dnn/src/cuda/matrix_mul/cutlass_float32_simt_gemv_batched_strided.cpp b/dnn/src/cuda/matrix_mul/cutlass_float32_simt_gemv_batched_strided.cpp index 6d581a8e..d0b94e35 100644 --- a/dnn/src/cuda/matrix_mul/cutlass_float32_simt_gemv_batched_strided.cpp +++ b/dnn/src/cuda/matrix_mul/cutlass_float32_simt_gemv_batched_strided.cpp @@ -2,7 +2,7 @@ * \file dnn/src/cuda/matrix_mul/cutlass_float32_simt_gemv_batched_strided.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * - * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
* * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an diff --git a/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp b/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp index 10ef7f42..dfbc3a06 100644 --- a/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp +++ b/dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp @@ -2,7 +2,7 @@ * \file dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * - * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an @@ -22,7 +22,7 @@ using namespace cuda; bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available( const SizeArgs& args) const { auto&& param = args.opr->param(); - int n = args.layout_c.shape[1], + int m = args.layout_c.shape[0], n = args.layout_c.shape[1], k = args.layout_a.shape[param.transposeA ? 0 : 1]; bool available = args.opr->param().format == param::MatrixMul::Format::DEFAULT && @@ -32,8 +32,8 @@ bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available( auto&& device_prop = cuda::current_device_prop(); int y_grid_limit = device_prop.maxGridSize[1]; // limit y grid - available &= ((n + m_algo_param.threadblock_n - 1) / - m_algo_param.threadblock_n <= + available &= ((m + m_algo_param.threadblock_m - 1) / + m_algo_param.threadblock_m <= y_grid_limit); return available; } diff --git a/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_base.cpp b/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_base.cpp index 55a95d1f..5169e498 100644 --- a/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_base.cpp +++ b/dnn/src/cuda/matrix_mul/cutlass_matrix_mul_base.cpp @@ -2,7 +2,7 @@ * \file dnn/src/cuda/matrix_mul/cutlass_matrix_mul_base.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * - * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an diff --git a/dnn/test/cuda/cutlass_matmul.cpp b/dnn/test/cuda/cutlass_matmul.cpp index b406fcf8..50e42e96 100644 --- a/dnn/test/cuda/cutlass_matmul.cpp +++ b/dnn/test/cuda/cutlass_matmul.cpp @@ -21,7 +21,6 @@ #include "test/cuda/fixture.h" #include "test/cuda/utils.h" -#define MEGDNN_WITH_BENCHMARK 1 #if CUDA_VERSION >= 9020 namespace megdnn { namespace test { @@ -373,6 +372,7 @@ MEGDNN_FOREACH_CUTLASS_KERNEL(cb) #undef cb #undef MEGDNN_FOREACH_CUTLASS_KERNEL +#if CUDA_VERSION >= 10020 #define MEGDNN_FOREACH_CUTLASS_KERNEL(cb) \ cb(1, 256, 128, 32, 64, 64, 32, 8, 8, 4); \ cb(2, 128, 256, 32, 64, 64, 32, 8, 8, 4); \ @@ -448,6 +448,7 @@ MEGDNN_FOREACH_CUTLASS_KERNEL(cb) #undef cb #undef MEGDNN_FOREACH_CUTLASS_KERNEL +#endif #if MEGDNN_WITH_BENCHMARK TEST_F(CUDA, BENCHMARK_CUTLASS_MATMUL) { @@ -462,12 +463,14 @@ TEST_F(CUDA, BENCHMARK_CUTLASS_MATMUL_FEAT) { "CUTLASS_FLOAT32_SIMT"); } +#if CUDA_VERSION >= 10020 TEST_F(CUDA, BENCHMARK_CUTLASS_F16_MATMUL_FEAT) { benchmark_matrix_mul(handle_cuda(), get_f16_feat_model_args(), dtype::Float16(), dtype::Float16(), dtype::Float16(), "CUTLASS_FLOAT16_TENSOR_OP"); } #endif +#endif } // namespace test } // namespace megdnn #endif
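
---

For reference, after this patch every generated kernel source is wrapped in a guard derived from the operation's `required_cuda_ver_major` / `required_cuda_ver_minor` fields rather than the previously hard-coded 9.2 check. A minimal sketch of what a generated tensorop kernel stub looks like for a kernel that requires CUDA 10.2 follows; the operation name `initialize_example_tensorop_gemm` is a hypothetical stand-in (real names come from `procedural_name()`), and the registration body plus closing pragma placement are illustrative assumptions, not copied generator output:

    // Guard emitted from the parameterized header template: this file is a
    // no-op for nvcc older than 10.2, so the tensorop kernel never compiles
    // under an unsupported toolchain.
    #if __CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2)
    // ignore warning of cutlass
    #pragma GCC diagnostic push
    #pragma GCC diagnostic ignored "-Wunused-parameter"

    #include "cutlass/cutlass.h"
    #include "src/cuda/cutlass/library.h"

    namespace cutlass {
    namespace library {

    // Hypothetical operation name; the generator derives the real one from
    // Operation.procedural_name().
    void initialize_example_tensorop_gemm(Manifest &manifest) {
      // manifest.append(...) registrations emitted from the instance template
      // would go here.
    }

    }  // namespace library
    }  // namespace cutlass

    #pragma GCC diagnostic pop
    #endif

With an older nvcc the whole file compiles to an empty translation unit, which is why `initialize_all.cu` must also skip the matching `initialize_*` declarations and calls behind `CUTLASS_ARCH_MMA_SM75_SUPPORTED`, and why the float16 tensorop algorithm lists in `algos.h` / `algos.cpp` are fenced with `CUDA_VERSION >= 10020`.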