From e0d505e6bd25712961ba84a65a64bf139bd40e36 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Tue, 12 Jul 2022 16:53:12 +0800
Subject: [PATCH] fix(mgb/dnn): fix bug that some cutlass files compile very slowly on SM86

GitOrigin-RevId: 91d7ac1927b93cc1dc68c7b97284b7f41ecf0b80
---
 dnn/scripts/cutlass_generator/gen_list.py  |  36 ++--
 dnn/scripts/cutlass_generator/generator.py | 297 ++++++++++++++++++++---------
 dnn/scripts/cutlass_generator/list.bzl     |  60 +++++-
 3 files changed, 275 insertions(+), 118 deletions(-)

diff --git a/dnn/scripts/cutlass_generator/gen_list.py b/dnn/scripts/cutlass_generator/gen_list.py
index 7bf472aa..3681a1e2 100644
--- a/dnn/scripts/cutlass_generator/gen_list.py
+++ b/dnn/scripts/cutlass_generator/gen_list.py
@@ -36,9 +36,9 @@ def write_op_list(f, gen_op, gen_type):
     f.write('    "all_%s_%s_operations.cu",\n' % (gen_op, gen_type))
 
 # Write down a list of merged filenames
-def write_merge_file_name(f, gen_op, gen_type):
-    f.write('    "{}_{}_1.cu",\n'.format(gen_op, gen_type))
-    f.write('    "{}_{}_2.cu",\n'.format(gen_op, gen_type))
+def write_merge_file_name(f, gen_op, gen_type, split_number):
+    for i in range(0, split_number):
+        f.write('    "{}_{}_{}.cu",\n'.format(gen_op, gen_type, i))
     if gen_op != "gemv":
         f.write('    "all_{}_{}_operations.cu",\n'.format(gen_op, gen_type))
 
@@ -47,19 +47,19 @@ if __name__ == "__main__":
     f.write("# Generated by dnn/scripts/cutlass_generator/gen_list.py\n\n")
     f.write("cutlass_gen_list = [\n")
 
-    write_merge_file_name(f, "gemm", "simt")
-    write_merge_file_name(f, "gemm", "tensorop1688")
-    write_merge_file_name(f, "gemm", "tensorop884")
-    write_merge_file_name(f, "gemv", "simt")
-    write_merge_file_name(f, "deconv", "simt")
-    write_merge_file_name(f, "deconv", "tensorop8816")
-    write_merge_file_name(f, "conv2d", "simt")
-    write_merge_file_name(f, "conv2d", "tensorop8816")
-    write_merge_file_name(f, "conv2d", "tensorop8832")
-    write_merge_file_name(f, "dwconv2d_fprop", "simt")
-    write_merge_file_name(f, "dwconv2d_fprop", "tensorop884")
-    write_merge_file_name(f, "dwconv2d_dgrad", "simt")
-    write_merge_file_name(f, "dwconv2d_dgrad", "tensorop884")
-    write_merge_file_name(f, "dwconv2d_wgrad", "simt")
-    write_merge_file_name(f, "dwconv2d_wgrad", "tensorop884")
+    write_merge_file_name(f, "gemm", "simt", 2)
+    write_merge_file_name(f, "gemm", "tensorop884", 30)
+    write_merge_file_name(f, "gemm", "tensorop1688", 2)
+    write_merge_file_name(f, "gemv", "simt", 2)
+    write_merge_file_name(f, "deconv", "simt", 2)
+    write_merge_file_name(f, "deconv", "tensorop8816", 4)
+    write_merge_file_name(f, "conv2d", "simt", 2)
+    write_merge_file_name(f, "conv2d", "tensorop8816", 4)
+    write_merge_file_name(f, "conv2d", "tensorop8832", 4)
+    write_merge_file_name(f, "dwconv2d_fprop", "simt", 2)
+    write_merge_file_name(f, "dwconv2d_fprop", "tensorop884", 4)
+    write_merge_file_name(f, "dwconv2d_dgrad", "simt", 2)
+    write_merge_file_name(f, "dwconv2d_dgrad", "tensorop884", 4)
+    write_merge_file_name(f, "dwconv2d_wgrad", "simt", 2)
+    write_merge_file_name(f, "dwconv2d_wgrad", "tensorop884", 4)
 
     f.write("]")
diff --git a/dnn/scripts/cutlass_generator/generator.py b/dnn/scripts/cutlass_generator/generator.py
index 72884e72..1d1116e9 100644
--- a/dnn/scripts/cutlass_generator/generator.py
+++ b/dnn/scripts/cutlass_generator/generator.py
@@ -1656,108 +1656,219 @@ def GenerateGemvOperations(args):
     )
     return GenerateGemv_Simt(args)
 
-
-def concat_file(file_path:str, file_name_first:str, file_name_last:str, head:str, required_cuda_ver_major:str, required_cuda_ver_minor:str, epilogue:str, wrapper_path = None):
+################################################################################
+# Parameters
+# split_number - the concatenated output is divided into split_number parts
+# file_path - the directory containing the generated files to concatenate
+# operations - args.operations
+# type - args.type
+# head - the header template written at the top of each merged file
+# required_cuda_ver_major - required CUDA major version
+# required_cuda_ver_minor - required CUDA minor version
+# epilogue - the epilogue template written at the end of each merged file
+# wrapper_path - path of the kernel wrapper (only passed for gemv)
+################################################################################
+def ConcatFile(split_number: int, file_path: str, operations: str, type: str, head: str, required_cuda_ver_major: str, required_cuda_ver_minor: str, epilogue: str, wrapper_path=None):
     import os
     meragefiledir = file_path
-    filenames=os.listdir(meragefiledir)
-    file1=open(file_path + '/{}_{}_1.cu'.format(file_name_first,file_name_last),'w')
-    file2=open(file_path + '/{}_{}_2.cu'.format(file_name_first,file_name_last),'w')
-    if wrapper_path is None:
-        file1.write(
-            SubstituteTemplate(
-                head,
-                {
-                    "required_cuda_ver_major": str(
-                        required_cuda_ver_major
-                    ),
-                    "required_cuda_ver_minor": str(
-                        required_cuda_ver_minor
-                    ),
-                },
-            )
-        )
-        file2.write(
-            SubstituteTemplate(
-                head,
-                {
-                    "required_cuda_ver_major": str(
-                        required_cuda_ver_major
-                    ),
-                    "required_cuda_ver_minor": str(
-                        required_cuda_ver_minor
-                    ),
-                },
-            )
-        )
-    else:
-        file1.write(
-            SubstituteTemplate(
-                head,
-                {
-                    "wrapper_path": wrapper_path,
-                    "required_cuda_ver_major": str(
-                        required_cuda_ver_major
-                    ),
-                    "required_cuda_ver_minor": str(
-                        required_cuda_ver_minor
-                    ),
-                },
-            )
-        )
-        file2.write(
-            SubstituteTemplate(
-                head,
-                {
-                    "wrapper_path": wrapper_path,
-                    "required_cuda_ver_major": str(
-                        required_cuda_ver_major
-                    ),
-                    "required_cuda_ver_minor": str(
-                        required_cuda_ver_minor
-                    ),
-                },
-            )
-        )
-    flag = 0
-    if "tensorop" in file_name_last:
+    filenames = os.listdir(meragefiledir)
+    # pick the substrings that identify the kernel files of this operation/type
+    if "tensorop" in type:
         sub_string_1 = "tensorop"
-        sub_string_2 = file_name_last[8:]
+        sub_string_2 = type[8:]
     else:
         sub_string_1 = sub_string_2 = "simt"
-    if "dwconv2d_" in file_name_first:
-        file_name_first = file_name_first[:2]+file_name_first[9:]
-    elif ("conv2d" in file_name_first) or ("deconv" in file_name_first):
-        file_name_first = "cutlass"
+    if "dwconv2d_" in operations:
+        filtered_operations = operations[:2] + operations[9:]
+    elif ("conv2d" in operations) or ("deconv" in operations):
+        filtered_operations = "cutlass"
+    else:
+        filtered_operations = operations
+    # count the kernel files that belong to this operation/type
+    file_list = {}
+    file_list[operations + type] = 0
+    for filename in filenames:
+        if (filtered_operations in filename) and (sub_string_1 in filename) and (sub_string_2 in filename) and ("all_" not in filename):
+            file_list[operations + type] += 1
+    # concatenate the files generated with long names (Linux)
+    flag_1 = 0
+    flag_2 = 0
     for filename in filenames:
-        if (file_name_first in filename) and (sub_string_1 in filename) and (sub_string_2 in filename) and ("all_" not in filename):
-            flag += 1
+        if (filtered_operations in filename) and (sub_string_1 in filename) and (sub_string_2 in filename) and ("all_" not in filename):
+            flag_1 += 1
             filepath=meragefiledir+'/'+filename
-            if flag <= len(filenames)/2:
+            if (flag_1 >= flag_2 * (file_list[operations + type] / split_number)) and (flag_1 <= (flag_2 + 1) * (file_list[operations + type] / split_number)):
+                file = open(file_path + '/{}_{}_{}.cu'.format(operations, type, flag_2), 'a')
+                # write the header template at the top of the merged file
+                if wrapper_path is None:
+                    file.write(
+                        SubstituteTemplate(
+                            head,
+                            {
+                                "required_cuda_ver_major": str(
+                                    required_cuda_ver_major
+                                ),
+                                "required_cuda_ver_minor": str(
+                                    required_cuda_ver_minor
+                                ),
+                            },
+                        )
+                    )
+                else:
+                    file.write(
+                        SubstituteTemplate(
+                            head,
+                            {
+                                "wrapper_path": wrapper_path,
+                                "required_cuda_ver_major": str(
+                                    required_cuda_ver_major
+                                ),
+                                "required_cuda_ver_minor": str(
+                                    required_cuda_ver_minor
+                                ),
+                            },
+                        )
+                    )
+                # the last part takes all remaining files
+                if flag_2 == (split_number - 1):
+                    for line in open(filepath):
+                        file.writelines(line)
+                    os.remove(filepath)
+                    file.write('\n')
+                    file.write(epilogue)
+                    continue
                 for line in open(filepath):
-                    file1.writelines(line)
+                    file.writelines(line)
+                os.remove(filepath)
+                file.write('\n')
+                file.write(epilogue)
             else:
+                # write the header template at the top of the merged file
+                if wrapper_path is None:
+                    file.write(
+                        SubstituteTemplate(
+                            head,
+                            {
+                                "required_cuda_ver_major": str(
+                                    required_cuda_ver_major
+                                ),
+                                "required_cuda_ver_minor": str(
+                                    required_cuda_ver_minor
+                                ),
+                            },
+                        )
+                    )
+                else:
+                    file.write(
+                        SubstituteTemplate(
+                            head,
+                            {
+                                "wrapper_path": wrapper_path,
+                                "required_cuda_ver_major": str(
+                                    required_cuda_ver_major
+                                ),
+                                "required_cuda_ver_minor": str(
+                                    required_cuda_ver_minor
+                                ),
+                            },
+                        )
+                    )
                 for line in open(filepath):
-                    file2.writelines(line)
-            os.remove(filepath)
-            file1.write('\n')
-            file2.write('\n')
+                    file.writelines(line)
+                os.remove(filepath)
+                file.write('\n')
+                file.write(epilogue)
+                file.close()
+                flag_2 += 1
+
+
+        # concatenate the files generated with short, digit-prefixed names (Windows)
         elif filename[0].isdigit() and ("all_" not in filename):
-            flag += 1
+            flag_1 += 1
             filepath=meragefiledir+'/'+filename
-            if flag <= len(filenames)/2:
+            if (flag_1 >= flag_2 * (len(filenames) / split_number)) and (flag_1 <= (flag_2 + 1) * (len(filenames) / split_number)):
+                file = open(file_path + '/{}_{}_{}.cu'.format(operations, type, flag_2), 'a')
+                # write the header template at the top of the merged file
+                if wrapper_path is None:
+                    file.write(
+                        SubstituteTemplate(
+                            head,
+                            {
+                                "required_cuda_ver_major": str(
+                                    required_cuda_ver_major
+                                ),
+                                "required_cuda_ver_minor": str(
+                                    required_cuda_ver_minor
+                                ),
+                            },
+                        )
+                    )
+                else:
+                    file.write(
+                        SubstituteTemplate(
+                            head,
+                            {
+                                "wrapper_path": wrapper_path,
+                                "required_cuda_ver_major": str(
+                                    required_cuda_ver_major
+                                ),
+                                "required_cuda_ver_minor": str(
+                                    required_cuda_ver_minor
+                                ),
+                            },
+                        )
+                    )
+                # the last part takes all remaining files
+                if flag_2 == (split_number - 1):
+                    for line in open(filepath):
+                        file.writelines(line)
+                    os.remove(filepath)
+                    file.write('\n')
+                    file.write(epilogue)
+                    continue
                 for line in open(filepath):
-                    file1.writelines(line)
+                    file.writelines(line)
+                os.remove(filepath)
+                file.write('\n')
+                file.write(epilogue)
             else:
+                # write the header template at the top of the merged file
+                if wrapper_path is None:
+                    file.write(
+                        SubstituteTemplate(
+                            head,
+                            {
+                                "required_cuda_ver_major": str(
+                                    required_cuda_ver_major
+                                ),
+                                "required_cuda_ver_minor": str(
+                                    required_cuda_ver_minor
+                                ),
+                            },
+                        )
+                    )
+                else:
+                    file.write(
+                        SubstituteTemplate(
+                            head,
+                            {
+                                "wrapper_path": wrapper_path,
+                                "required_cuda_ver_major": str(
+                                    required_cuda_ver_major
+                                ),
+                                "required_cuda_ver_minor": str(
+                                    required_cuda_ver_minor
+                                ),
+                            },
+                        )
+                    )
                 for line in open(filepath):
-                    file2.writelines(line)
-            os.remove(filepath)
-            file1.write('\n')
-            file2.write('\n')
-    file1.write(epilogue)
-    file2.write(epilogue)
-    file1.close()
-    file2.close()
+                    file.writelines(line)
+                os.remove(filepath)
+                file.write('\n')
+                file.write(epilogue)
+                file.close()
+                flag_2 += 1
 
 ###################################################################################################
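[Editor's aside, not part of the patch.] For readers untangling the flag_1/flag_2 arithmetic above: the intent of ConcatFile is to deal the N generated kernel files of one (operations, type) pair into split_number contiguous, roughly equal chunks, and to emit chunk k, wrapped in the shared header and epilogue, as {operations}_{type}_{k}.cu. A minimal sketch of that partitioning intent, assuming contiguous equal-sized chunks (illustrative only, not the patch's code):

    import math

    def split_into_chunks(filenames, split_number):
        # Deal the kernel files into split_number contiguous chunks; chunk k
        # then becomes "<operations>_<type>_<k>.cu" once the header template
        # and the epilogue are wrapped around its contents.
        chunk_size = max(1, math.ceil(len(filenames) / split_number))
        return [filenames[i:i + chunk_size]
                for i in range(0, len(filenames), chunk_size)]

Splitting the gemm tensorop884 kernels into 30 parts (versus 2 before) keeps each merged translation unit small, which is what stops nvcc from spending extreme time on a single .cu when compiling for SM86.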
###################################################################################################

@@ -1833,7 +1944,10 @@ if __name__ == "__main__":
         required_cuda_ver_major = operations[0].required_cuda_ver_major
         required_cuda_ver_minor = operations[0].required_cuda_ver_minor
         epilogue = EmitConvSingleKernelWrapper(args.output, operations[0], short_path).epilogue_template
-        concat_file(args.output,args.operations, args.type, head,required_cuda_ver_major, required_cuda_ver_minor, epilogue)
+        if "tensorop" in args.type:
+            ConcatFile(4, args.output, args.operations, args.type, head, required_cuda_ver_major, required_cuda_ver_minor, epilogue)
+        else:
+            ConcatFile(2, args.output, args.operations, args.type, head, required_cuda_ver_major, required_cuda_ver_minor, epilogue)
     elif args.operations == "gemm":
         for operation in operations:
             with EmitGemmSingleKernelWrapper(
@@ -1844,7 +1958,10 @@ if __name__ == "__main__":
         required_cuda_ver_major = operations[0].required_cuda_ver_major
         required_cuda_ver_minor = operations[0].required_cuda_ver_minor
         epilogue = EmitGemmSingleKernelWrapper(args.output, operations[0], short_path).epilogue_template
-        concat_file(args.output, args.operations, args.type, head,required_cuda_ver_major, required_cuda_ver_minor, epilogue)
+        if args.type == "tensorop884":
+            ConcatFile(30, args.output, args.operations, args.type, head, required_cuda_ver_major, required_cuda_ver_minor, epilogue)
+        else:
+            ConcatFile(2, args.output, args.operations, args.type, head, required_cuda_ver_major, required_cuda_ver_minor, epilogue)
     elif args.operations == "gemv":
         for operation in operations:
             with EmitGemvSingleKernelWrapper(
@@ -1855,7 +1972,7 @@ if __name__ == "__main__":
         required_cuda_ver_major = operations[0].required_cuda_ver_major
         required_cuda_ver_minor = operations[0].required_cuda_ver_minor
         epilogue = EmitGemvSingleKernelWrapper(args.output, operations[0], gemv_wrapper_path, short_path).epilogue_template
-        concat_file(args.output,args.operations, args.type, head,required_cuda_ver_major, required_cuda_ver_minor, epilogue, wrapper_path = gemv_wrapper_path)
+        ConcatFile(2, args.output, args.operations, args.type, head, required_cuda_ver_major, required_cuda_ver_minor, epilogue, wrapper_path=gemv_wrapper_path)
 
     if args.operations != "gemv":
         GenerateManifest(args, operations, args.output)
diff --git a/dnn/scripts/cutlass_generator/list.bzl b/dnn/scripts/cutlass_generator/list.bzl
index d05f8d1d..04e92260 100644
--- a/dnn/scripts/cutlass_generator/list.bzl
+++ b/dnn/scripts/cutlass_generator/list.bzl
@@ -1,48 +1,88 @@
 # Generated by dnn/scripts/cutlass_generator/gen_list.py
 
 cutlass_gen_list = [
+    "gemm_simt_0.cu",
     "gemm_simt_1.cu",
-    "gemm_simt_2.cu",
     "all_gemm_simt_operations.cu",
-    "gemm_tensorop1688_1.cu",
-    "gemm_tensorop1688_2.cu",
-    "all_gemm_tensorop1688_operations.cu",
+    "gemm_tensorop884_0.cu",
     "gemm_tensorop884_1.cu",
     "gemm_tensorop884_2.cu",
+    "gemm_tensorop884_3.cu",
+    "gemm_tensorop884_4.cu",
+    "gemm_tensorop884_5.cu",
+    "gemm_tensorop884_6.cu",
+    "gemm_tensorop884_7.cu",
+    "gemm_tensorop884_8.cu",
+    "gemm_tensorop884_9.cu",
+    "gemm_tensorop884_10.cu",
+    "gemm_tensorop884_11.cu",
+    "gemm_tensorop884_12.cu",
+    "gemm_tensorop884_13.cu",
+    "gemm_tensorop884_14.cu",
+    "gemm_tensorop884_15.cu",
+    "gemm_tensorop884_16.cu",
+    "gemm_tensorop884_17.cu",
+    "gemm_tensorop884_18.cu",
+    "gemm_tensorop884_19.cu",
+    "gemm_tensorop884_20.cu",
+    "gemm_tensorop884_21.cu",
+    "gemm_tensorop884_22.cu",
+    "gemm_tensorop884_23.cu",
+    "gemm_tensorop884_24.cu",
+    "gemm_tensorop884_25.cu",
"gemm_tensorop884_26.cu", + "gemm_tensorop884_27.cu", + "gemm_tensorop884_28.cu", + "gemm_tensorop884_29.cu", "all_gemm_tensorop884_operations.cu", + "gemm_tensorop1688_0.cu", + "gemm_tensorop1688_1.cu", + "all_gemm_tensorop1688_operations.cu", + "gemv_simt_0.cu", "gemv_simt_1.cu", - "gemv_simt_2.cu", + "deconv_simt_0.cu", "deconv_simt_1.cu", - "deconv_simt_2.cu", "all_deconv_simt_operations.cu", + "deconv_tensorop8816_0.cu", "deconv_tensorop8816_1.cu", "deconv_tensorop8816_2.cu", + "deconv_tensorop8816_3.cu", "all_deconv_tensorop8816_operations.cu", + "conv2d_simt_0.cu", "conv2d_simt_1.cu", - "conv2d_simt_2.cu", "all_conv2d_simt_operations.cu", + "conv2d_tensorop8816_0.cu", "conv2d_tensorop8816_1.cu", "conv2d_tensorop8816_2.cu", + "conv2d_tensorop8816_3.cu", "all_conv2d_tensorop8816_operations.cu", + "conv2d_tensorop8832_0.cu", "conv2d_tensorop8832_1.cu", "conv2d_tensorop8832_2.cu", + "conv2d_tensorop8832_3.cu", "all_conv2d_tensorop8832_operations.cu", + "dwconv2d_fprop_simt_0.cu", "dwconv2d_fprop_simt_1.cu", - "dwconv2d_fprop_simt_2.cu", "all_dwconv2d_fprop_simt_operations.cu", + "dwconv2d_fprop_tensorop884_0.cu", "dwconv2d_fprop_tensorop884_1.cu", "dwconv2d_fprop_tensorop884_2.cu", + "dwconv2d_fprop_tensorop884_3.cu", "all_dwconv2d_fprop_tensorop884_operations.cu", + "dwconv2d_dgrad_simt_0.cu", "dwconv2d_dgrad_simt_1.cu", - "dwconv2d_dgrad_simt_2.cu", "all_dwconv2d_dgrad_simt_operations.cu", + "dwconv2d_dgrad_tensorop884_0.cu", "dwconv2d_dgrad_tensorop884_1.cu", "dwconv2d_dgrad_tensorop884_2.cu", + "dwconv2d_dgrad_tensorop884_3.cu", "all_dwconv2d_dgrad_tensorop884_operations.cu", + "dwconv2d_wgrad_simt_0.cu", "dwconv2d_wgrad_simt_1.cu", - "dwconv2d_wgrad_simt_2.cu", "all_dwconv2d_wgrad_simt_operations.cu", + "dwconv2d_wgrad_tensorop884_0.cu", "dwconv2d_wgrad_tensorop884_1.cu", "dwconv2d_wgrad_tensorop884_2.cu", + "dwconv2d_wgrad_tensorop884_3.cu", "all_dwconv2d_wgrad_tensorop884_operations.cu", ] \ No newline at end of file