From e0d505e6bd25712961ba84a65a64bf139bd40e36 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Tue, 12 Jul 2022 16:53:12 +0800
Subject: [PATCH] fix(mgb/dnn): fix bug that some cutlass files compile very slowly on SM86

GitOrigin-RevId: 91d7ac1927b93cc1dc68c7b97284b7f41ecf0b80
---
 dnn/scripts/cutlass_generator/gen_list.py  |  36 ++--
 dnn/scripts/cutlass_generator/generator.py | 297 ++++++++++++++++++++---------
 dnn/scripts/cutlass_generator/list.bzl     |  60 +++++-
 3 files changed, 275 insertions(+), 118 deletions(-)

diff --git a/dnn/scripts/cutlass_generator/gen_list.py b/dnn/scripts/cutlass_generator/gen_list.py
index 7bf472aa..3681a1e2 100644
--- a/dnn/scripts/cutlass_generator/gen_list.py
+++ b/dnn/scripts/cutlass_generator/gen_list.py
@@ -36,9 +36,9 @@ def write_op_list(f, gen_op, gen_type):
     f.write('    "all_%s_%s_operations.cu",\n' % (gen_op, gen_type))
 
 # Write down a list of merged filenames
-def write_merge_file_name(f, gen_op, gen_type):
-    f.write('    "{}_{}_1.cu",\n'.format(gen_op, gen_type))
-    f.write('    "{}_{}_2.cu",\n'.format(gen_op, gen_type))
+def write_merge_file_name(f, gen_op, gen_type, split_number):
+    for i in range(0, split_number):
+        f.write('    "{}_{}_{}.cu",\n'.format(gen_op, gen_type, i))
     if gen_op != "gemv":
         f.write('    "all_{}_{}_operations.cu",\n'.format(gen_op, gen_type))
 
@@ -47,19 +47,19 @@ if __name__ == "__main__":
     f.write("# Generated by dnn/scripts/cutlass_generator/gen_list.py\n\n")
     f.write("cutlass_gen_list = [\n")
 
-    write_merge_file_name(f, "gemm", "simt")
-    write_merge_file_name(f, "gemm", "tensorop1688")
-    write_merge_file_name(f, "gemm", "tensorop884")
-    write_merge_file_name(f, "gemv", "simt")
-    write_merge_file_name(f, "deconv", "simt")
-    write_merge_file_name(f, "deconv", "tensorop8816")
-    write_merge_file_name(f, "conv2d", "simt")
-    write_merge_file_name(f, "conv2d", "tensorop8816")
-    write_merge_file_name(f, "conv2d", "tensorop8832")
-    write_merge_file_name(f, "dwconv2d_fprop", "simt")
-    write_merge_file_name(f, "dwconv2d_fprop", "tensorop884")
-    write_merge_file_name(f, "dwconv2d_dgrad", "simt")
-    write_merge_file_name(f, "dwconv2d_dgrad", "tensorop884")
-    write_merge_file_name(f, "dwconv2d_wgrad", "simt")
-    write_merge_file_name(f, "dwconv2d_wgrad", "tensorop884")
+    write_merge_file_name(f, "gemm", "simt", 2)
+    write_merge_file_name(f, "gemm", "tensorop884", 30)
+    write_merge_file_name(f, "gemm", "tensorop1688", 2)
+    write_merge_file_name(f, "gemv", "simt", 2)
+    write_merge_file_name(f, "deconv", "simt", 2)
+    write_merge_file_name(f, "deconv", "tensorop8816", 4)
+    write_merge_file_name(f, "conv2d", "simt", 2)
+    write_merge_file_name(f, "conv2d", "tensorop8816", 4)
+    write_merge_file_name(f, "conv2d", "tensorop8832", 4)
+    write_merge_file_name(f, "dwconv2d_fprop", "simt", 2)
+    write_merge_file_name(f, "dwconv2d_fprop", "tensorop884", 4)
+    write_merge_file_name(f, "dwconv2d_dgrad", "simt", 2)
+    write_merge_file_name(f, "dwconv2d_dgrad", "tensorop884", 4)
+    write_merge_file_name(f, "dwconv2d_wgrad", "simt", 2)
+    write_merge_file_name(f, "dwconv2d_wgrad", "tensorop884", 4)
 
     f.write("]")
diff --git a/dnn/scripts/cutlass_generator/generator.py b/dnn/scripts/cutlass_generator/generator.py
index 72884e72..1d1116e9 100644
--- a/dnn/scripts/cutlass_generator/generator.py
+++ b/dnn/scripts/cutlass_generator/generator.py
@@ -1656,108 +1656,219 @@ def GenerateGemvOperations(args):
     )
     return GenerateGemv_Simt(args)
 
-
-def concat_file(file_path:str, file_name_first:str, file_name_last:str, head:str, required_cuda_ver_major:str, required_cuda_ver_minor:str, epilogue:str, wrapper_path = None):
+################################################################################
+# Parameters
+# split_number - the concatenated output is divided into split_number parts
+# file_path - the directory containing the generated files to concatenate
+# operations - args.operations
+# type - args.type
+# head - the header template written at the top of each merged file
+# required_cuda_ver_major - required CUDA major version
+# required_cuda_ver_minor - required CUDA minor version
+# epilogue - the epilogue template written at the end of each merged file
+# wrapper_path - path of the kernel wrapper (only passed for gemv)
+################################################################################
+def ConcatFile(split_number: int, file_path: str, operations: str, type: str, head: str, required_cuda_ver_major: str, required_cuda_ver_minor: str, epilogue: str, wrapper_path=None):
     import os
     meragefiledir = file_path
-    filenames=os.listdir(meragefiledir)
-    file1=open(file_path + '/{}_{}_1.cu'.format(file_name_first,file_name_last),'w')
-    file2=open(file_path + '/{}_{}_2.cu'.format(file_name_first,file_name_last),'w')
-    if wrapper_path is None:
-        file1.write(
-            SubstituteTemplate(
-                head,
-                {
-                    "required_cuda_ver_major": str(
-                        required_cuda_ver_major
-                    ),
-                    "required_cuda_ver_minor": str(
-                        required_cuda_ver_minor
-                    ),
-                },
-            )
-        )
-        file2.write(
-            SubstituteTemplate(
-                head,
-                {
-                    "required_cuda_ver_major": str(
-                        required_cuda_ver_major
-                    ),
-                    "required_cuda_ver_minor": str(
-                        required_cuda_ver_minor
-                    ),
-                },
-            )
-        )
-    else:
-        file1.write(
-            SubstituteTemplate(
-                head,
-                {
-                    "wrapper_path": wrapper_path,
-                    "required_cuda_ver_major": str(
-                        required_cuda_ver_major
-                    ),
-                    "required_cuda_ver_minor": str(
-                        required_cuda_ver_minor
-                    ),
-                },
-            )
-        )
-        file2.write(
-            SubstituteTemplate(
-                head,
-                {
-                    "wrapper_path": wrapper_path,
-                    "required_cuda_ver_major": str(
-                        required_cuda_ver_major
-                    ),
-                    "required_cuda_ver_minor": str(
-                        required_cuda_ver_minor
-                    ),
-                },
-            )
-        )
-    flag = 0
-    if "tensorop" in file_name_last:
+    filenames = os.listdir(meragefiledir)
+    # pick the substrings that identify the kernel files of this operation/type
+    if "tensorop" in type:
         sub_string_1 = "tensorop"
-        sub_string_2 = file_name_last[8:]
+        sub_string_2 = type[8:]
     else:
         sub_string_1 = sub_string_2 = "simt"
-    if "dwconv2d_" in file_name_first:
-        file_name_first = file_name_first[:2]+file_name_first[9:]
-    elif ("conv2d" in file_name_first) or ("deconv" in file_name_first):
-        file_name_first = "cutlass"
+    if "dwconv2d_" in operations:
+        filtered_operations = operations[:2] + operations[9:]
+    elif ("conv2d" in operations) or ("deconv" in operations):
+        filtered_operations = "cutlass"
+    else:
+        filtered_operations = operations
+    # count the kernel files that belong to this operation/type
+    file_list = {}
+    file_list[operations + type] = 0
+    for filename in filenames:
+        if (filtered_operations in filename) and (sub_string_1 in filename) and (sub_string_2 in filename) and ("all_" not in filename):
+            file_list[operations + type] += 1
+    # concatenate the files generated with long names (Linux)
+    flag_1 = 0
+    flag_2 = 0
     for filename in filenames:
-        if (file_name_first in filename) and (sub_string_1 in filename) and (sub_string_2 in filename) and ("all_" not in filename):
-            flag += 1
+        if (filtered_operations in filename) and (sub_string_1 in filename) and (sub_string_2 in filename) and ("all_" not in filename):
+            flag_1 += 1
             filepath=meragefiledir+'/'+filename
-            if flag <= len(filenames)/2:
+            if (flag_1 >= flag_2 * (file_list[operations + type] / split_number)) and (flag_1 <= (flag_2 + 1) * (file_list[operations + type] / split_number)):
+                file = open(file_path + '/{}_{}_{}.cu'.format(operations, type, flag_2), 'a')
+                # write the header template at the top of the merged file
+                if wrapper_path is None:
+                    file.write(
+                        SubstituteTemplate(
+                            head,
+                            {
+                                "required_cuda_ver_major": str(
+                                    required_cuda_ver_major
+                                ),
+                                "required_cuda_ver_minor": str(
+                                    required_cuda_ver_minor
+                                ),
+                            },
+                        )
+                    )
+                else:
+                    file.write(
+                        SubstituteTemplate(
+                            head,
+                            {
+                                "wrapper_path": wrapper_path,
+                                "required_cuda_ver_major": str(
+                                    required_cuda_ver_major
+                                ),
+                                "required_cuda_ver_minor": str(
+                                    required_cuda_ver_minor
+                                ),
+                            },
+                        )
+                    )
+                # the last part takes all remaining files
+                if flag_2 == (split_number - 1):
+                    for line in open(filepath):
+                        file.writelines(line)
+                    os.remove(filepath)
+                    file.write('\n')
+                    file.write(epilogue)
+                    continue
                 for line in open(filepath):
-                    file1.writelines(line)
+                    file.writelines(line)
+                os.remove(filepath)
+                file.write('\n')
+                file.write(epilogue)
             else:
+                # write the header template at the top of the merged file
+                if wrapper_path is None:
+                    file.write(
+                        SubstituteTemplate(
+                            head,
+                            {
+                                "required_cuda_ver_major": str(
+                                    required_cuda_ver_major
+                                ),
+                                "required_cuda_ver_minor": str(
+                                    required_cuda_ver_minor
+                                ),
+                            },
+                        )
+                    )
+                else:
+                    file.write(
+                        SubstituteTemplate(
+                            head,
+                            {
+                                "wrapper_path": wrapper_path,
+                                "required_cuda_ver_major": str(
+                                    required_cuda_ver_major
+                                ),
+                                "required_cuda_ver_minor": str(
+                                    required_cuda_ver_minor
+                                ),
+                            },
+                        )
+                    )
                 for line in open(filepath):
-                    file2.writelines(line)
-            os.remove(filepath)
-            file1.write('\n')
-            file2.write('\n')
+                    file.writelines(line)
+                os.remove(filepath)
+                file.write('\n')
+                file.write(epilogue)
+                file.close()
+                flag_2 += 1
+
+
+        # concatenate the files generated with short, digit-prefixed names (Windows)
         elif filename[0].isdigit() and ("all_" not in filename):
-            flag += 1
+            flag_1 += 1
             filepath=meragefiledir+'/'+filename
-            if flag <= len(filenames)/2:
+            if (flag_1 >= flag_2 * (len(filenames) / split_number)) and (flag_1 <= (flag_2 + 1) * (len(filenames) / split_number)):
+                file = open(file_path + '/{}_{}_{}.cu'.format(operations, type, flag_2), 'a')
+                # write the header template at the top of the merged file
+                if wrapper_path is None:
+                    file.write(
+                        SubstituteTemplate(
+                            head,
+                            {
+                                "required_cuda_ver_major": str(
+                                    required_cuda_ver_major
+                                ),
+                                "required_cuda_ver_minor": str(
+                                    required_cuda_ver_minor
+                                ),
+                            },
+                        )
+                    )
+                else:
+                    file.write(
+                        SubstituteTemplate(
+                            head,
+                            {
+                                "wrapper_path": wrapper_path,
+                                "required_cuda_ver_major": str(
+                                    required_cuda_ver_major
+                                ),
+                                "required_cuda_ver_minor": str(
+                                    required_cuda_ver_minor
+                                ),
+                            },
+                        )
+                    )
+                # the last part takes all remaining files
+                if flag_2 == (split_number - 1):
+                    for line in open(filepath):
+                        file.writelines(line)
+                    os.remove(filepath)
+                    file.write('\n')
+                    file.write(epilogue)
+                    continue
                 for line in open(filepath):
-                    file1.writelines(line)
+                    file.writelines(line)
+                os.remove(filepath)
+                file.write('\n')
+                file.write(epilogue)
             else:
+                # write the header template at the top of the merged file
+                if wrapper_path is None:
+                    file.write(
+                        SubstituteTemplate(
+                            head,
+                            {
+                                "required_cuda_ver_major": str(
+                                    required_cuda_ver_major
+                                ),
+                                "required_cuda_ver_minor": str(
+                                    required_cuda_ver_minor
+                                ),
+                            },
+                        )
+                    )
+                else:
+                    file.write(
+                        SubstituteTemplate(
+                            head,
+                            {
+                                "wrapper_path": wrapper_path,
+                                "required_cuda_ver_major": str(
+                                    required_cuda_ver_major
+                                ),
+                                "required_cuda_ver_minor": str(
+                                    required_cuda_ver_minor
+                                ),
+                            },
+                        )
+                    )
                 for line in open(filepath):
-                    file2.writelines(line)
-            os.remove(filepath)
-            file1.write('\n')
-            file2.write('\n')
-    file1.write(epilogue)
-    file2.write(epilogue)
-    file1.close()
-    file2.close()
+                    file.writelines(line)
+                os.remove(filepath)
+                file.write('\n')
+                file.write(epilogue)
+                file.close()
+                flag_2 += 1
 
 ###################################################################################################
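[Editor's aside, not part of the patch.] For readers untangling the flag_1/flag_2 arithmetic above: the intent of ConcatFile is to deal the N generated kernel files of one (operations, type) pair into split_number contiguous, roughly equal chunks, and to emit chunk k, wrapped in the shared header and epilogue, as {operations}_{type}_{k}.cu. A minimal sketch of that partitioning intent, assuming contiguous equal-sized chunks (illustrative only, not the patch's code):

    import math

    def split_into_chunks(filenames, split_number):
        # Deal the kernel files into split_number contiguous chunks; chunk k
        # then becomes "<operations>_<type>_<k>.cu" once the header template
        # and the epilogue are wrapped around its contents.
        chunk_size = max(1, math.ceil(len(filenames) / split_number))
        return [filenames[i:i + chunk_size]
                for i in range(0, len(filenames), chunk_size)]

Splitting the gemm tensorop884 kernels into 30 parts (versus 2 before) keeps each merged translation unit small, which is what stops nvcc from spending extreme time on a single .cu when compiling for SM86.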
###################################################################################################

@@ -1833,7 +1944,10 @@ if __name__ == "__main__":
         required_cuda_ver_major = operations[0].required_cuda_ver_major
         required_cuda_ver_minor = operations[0].required_cuda_ver_minor
         epilogue = EmitConvSingleKernelWrapper(args.output, operations[0], short_path).epilogue_template
-        concat_file(args.output,args.operations, args.type, head,required_cuda_ver_major, required_cuda_ver_minor, epilogue)
+        if "tensorop" in args.type:
+            ConcatFile(4, args.output, args.operations, args.type, head, required_cuda_ver_major, required_cuda_ver_minor, epilogue)
+        else:
+            ConcatFile(2, args.output, args.operations, args.type, head, required_cuda_ver_major, required_cuda_ver_minor, epilogue)
     elif args.operations == "gemm":
         for operation in operations:
             with EmitGemmSingleKernelWrapper(
@@ -1844,7 +1958,10 @@ if __name__ == "__main__":
         required_cuda_ver_major = operations[0].required_cuda_ver_major
         required_cuda_ver_minor = operations[0].required_cuda_ver_minor
         epilogue = EmitGemmSingleKernelWrapper(args.output, operations[0], short_path).epilogue_template
-        concat_file(args.output, args.operations, args.type, head,required_cuda_ver_major, required_cuda_ver_minor, epilogue)
+        if args.type == "tensorop884":
+            ConcatFile(30, args.output, args.operations, args.type, head, required_cuda_ver_major, required_cuda_ver_minor, epilogue)
+        else:
+            ConcatFile(2, args.output, args.operations, args.type, head, required_cuda_ver_major, required_cuda_ver_minor, epilogue)
     elif args.operations == "gemv":
         for operation in operations:
             with EmitGemvSingleKernelWrapper(
@@ -1855,7 +1972,7 @@ if __name__ == "__main__":
         required_cuda_ver_major = operations[0].required_cuda_ver_major
         required_cuda_ver_minor = operations[0].required_cuda_ver_minor
         epilogue = EmitGemvSingleKernelWrapper(args.output, operations[0], gemv_wrapper_path, short_path).epilogue_template
-        concat_file(args.output,args.operations, args.type, head,required_cuda_ver_major, required_cuda_ver_minor, epilogue, wrapper_path = gemv_wrapper_path)
+        ConcatFile(2, args.output, args.operations, args.type, head, required_cuda_ver_major, required_cuda_ver_minor, epilogue, wrapper_path=gemv_wrapper_path)
 
     if args.operations != "gemv":
         GenerateManifest(args, operations, args.output)
diff --git a/dnn/scripts/cutlass_generator/list.bzl b/dnn/scripts/cutlass_generator/list.bzl
index d05f8d1d..04e92260 100644
--- a/dnn/scripts/cutlass_generator/list.bzl
+++ b/dnn/scripts/cutlass_generator/list.bzl
@@ -1,48 +1,88 @@
 # Generated by dnn/scripts/cutlass_generator/gen_list.py
 
 cutlass_gen_list = [
+    "gemm_simt_0.cu",
     "gemm_simt_1.cu",
-    "gemm_simt_2.cu",
     "all_gemm_simt_operations.cu",
-    "gemm_tensorop1688_1.cu",
-    "gemm_tensorop1688_2.cu",
-    "all_gemm_tensorop1688_operations.cu",
+    "gemm_tensorop884_0.cu",
     "gemm_tensorop884_1.cu",
     "gemm_tensorop884_2.cu",
+    "gemm_tensorop884_3.cu",
+    "gemm_tensorop884_4.cu",
+    "gemm_tensorop884_5.cu",
+    "gemm_tensorop884_6.cu",
+    "gemm_tensorop884_7.cu",
+    "gemm_tensorop884_8.cu",
+    "gemm_tensorop884_9.cu",
+    "gemm_tensorop884_10.cu",
+    "gemm_tensorop884_11.cu",
+    "gemm_tensorop884_12.cu",
+    "gemm_tensorop884_13.cu",
+    "gemm_tensorop884_14.cu",
+    "gemm_tensorop884_15.cu",
+    "gemm_tensorop884_16.cu",
+    "gemm_tensorop884_17.cu",
+    "gemm_tensorop884_18.cu",
+    "gemm_tensorop884_19.cu",
+    "gemm_tensorop884_20.cu",
+    "gemm_tensorop884_21.cu",
+    "gemm_tensorop884_22.cu",
+    "gemm_tensorop884_23.cu",
+    "gemm_tensorop884_24.cu",
+    "gemm_tensorop884_25.cu",
"gemm_tensorop884_26.cu", + "gemm_tensorop884_27.cu", + "gemm_tensorop884_28.cu", + "gemm_tensorop884_29.cu", "all_gemm_tensorop884_operations.cu", + "gemm_tensorop1688_0.cu", + "gemm_tensorop1688_1.cu", + "all_gemm_tensorop1688_operations.cu", + "gemv_simt_0.cu", "gemv_simt_1.cu", - "gemv_simt_2.cu", + "deconv_simt_0.cu", "deconv_simt_1.cu", - "deconv_simt_2.cu", "all_deconv_simt_operations.cu", + "deconv_tensorop8816_0.cu", "deconv_tensorop8816_1.cu", "deconv_tensorop8816_2.cu", + "deconv_tensorop8816_3.cu", "all_deconv_tensorop8816_operations.cu", + "conv2d_simt_0.cu", "conv2d_simt_1.cu", - "conv2d_simt_2.cu", "all_conv2d_simt_operations.cu", + "conv2d_tensorop8816_0.cu", "conv2d_tensorop8816_1.cu", "conv2d_tensorop8816_2.cu", + "conv2d_tensorop8816_3.cu", "all_conv2d_tensorop8816_operations.cu", + "conv2d_tensorop8832_0.cu", "conv2d_tensorop8832_1.cu", "conv2d_tensorop8832_2.cu", + "conv2d_tensorop8832_3.cu", "all_conv2d_tensorop8832_operations.cu", + "dwconv2d_fprop_simt_0.cu", "dwconv2d_fprop_simt_1.cu", - "dwconv2d_fprop_simt_2.cu", "all_dwconv2d_fprop_simt_operations.cu", + "dwconv2d_fprop_tensorop884_0.cu", "dwconv2d_fprop_tensorop884_1.cu", "dwconv2d_fprop_tensorop884_2.cu", + "dwconv2d_fprop_tensorop884_3.cu", "all_dwconv2d_fprop_tensorop884_operations.cu", + "dwconv2d_dgrad_simt_0.cu", "dwconv2d_dgrad_simt_1.cu", - "dwconv2d_dgrad_simt_2.cu", "all_dwconv2d_dgrad_simt_operations.cu", + "dwconv2d_dgrad_tensorop884_0.cu", "dwconv2d_dgrad_tensorop884_1.cu", "dwconv2d_dgrad_tensorop884_2.cu", + "dwconv2d_dgrad_tensorop884_3.cu", "all_dwconv2d_dgrad_tensorop884_operations.cu", + "dwconv2d_wgrad_simt_0.cu", "dwconv2d_wgrad_simt_1.cu", - "dwconv2d_wgrad_simt_2.cu", "all_dwconv2d_wgrad_simt_operations.cu", + "dwconv2d_wgrad_tensorop884_0.cu", "dwconv2d_wgrad_tensorop884_1.cu", "dwconv2d_wgrad_tensorop884_2.cu", + "dwconv2d_wgrad_tensorop884_3.cu", "all_dwconv2d_wgrad_tensorop884_operations.cu", ] \ No newline at end of file