
fix(mgb/dnn): fix bug where some cutlass files compile very slowly on SM86

GitOrigin-RevId: 91d7ac1927
Branch: HuaHua404-patch-4
Megvii Engine Team · 2 years ago
Commit: e0d505e6bd
3 changed files with 275 additions and 118 deletions:
  1. dnn/scripts/cutlass_generator/gen_list.py (+18, −18)
  2. dnn/scripts/cutlass_generator/generator.py (+207, −90)
  3. dnn/scripts/cutlass_generator/list.bzl (+50, −10)

dnn/scripts/cutlass_generator/gen_list.py (+18, −18)

@@ -36,9 +36,9 @@ def write_op_list(f, gen_op, gen_type):
f.write(' "all_%s_%s_operations.cu",\n' % (gen_op, gen_type)) f.write(' "all_%s_%s_operations.cu",\n' % (gen_op, gen_type))


# Write down a list of merged filenames # Write down a list of merged filenames
def write_merge_file_name(f, gen_op, gen_type):
f.write(' "{}_{}_1.cu",\n'.format(gen_op,gen_type))
f.write(' "{}_{}_2.cu",\n'.format(gen_op,gen_type))
def write_merge_file_name(f, gen_op, gen_type, split_number):
for i in range(0, split_number):
f.write(' "{}_{}_{}.cu",\n'.format(gen_op,gen_type,i))
if gen_op != "gemv": if gen_op != "gemv":
f.write(' "all_{}_{}_operations.cu",\n'.format(gen_op,gen_type)) f.write(' "all_{}_{}_operations.cu",\n'.format(gen_op,gen_type))


@@ -47,19 +47,19 @@ if __name__ == "__main__":
f.write("# Generated by dnn/scripts/cutlass_generator/gen_list.py\n\n") f.write("# Generated by dnn/scripts/cutlass_generator/gen_list.py\n\n")
f.write("cutlass_gen_list = [\n") f.write("cutlass_gen_list = [\n")


write_merge_file_name(f, "gemm", "simt")
write_merge_file_name(f, "gemm", "tensorop1688")
write_merge_file_name(f, "gemm", "tensorop884")
write_merge_file_name(f, "gemv", "simt")
write_merge_file_name(f, "deconv", "simt")
write_merge_file_name(f, "deconv", "tensorop8816")
write_merge_file_name(f, "conv2d", "simt")
write_merge_file_name(f, "conv2d", "tensorop8816")
write_merge_file_name(f, "conv2d", "tensorop8832")
write_merge_file_name(f, "dwconv2d_fprop", "simt")
write_merge_file_name(f, "dwconv2d_fprop", "tensorop884")
write_merge_file_name(f, "dwconv2d_dgrad", "simt")
write_merge_file_name(f, "dwconv2d_dgrad", "tensorop884")
write_merge_file_name(f, "dwconv2d_wgrad", "simt")
write_merge_file_name(f, "dwconv2d_wgrad", "tensorop884")
write_merge_file_name(f, "gemm", "simt", 2)
write_merge_file_name(f, "gemm", "tensorop884", 30)
write_merge_file_name(f, "gemm", "tensorop1688", 2)
write_merge_file_name(f, "gemv", "simt", 2)
write_merge_file_name(f, "deconv", "simt", 2)
write_merge_file_name(f, "deconv", "tensorop8816", 4)
write_merge_file_name(f, "conv2d", "simt", 2)
write_merge_file_name(f, "conv2d", "tensorop8816", 4)
write_merge_file_name(f, "conv2d", "tensorop8832", 4)
write_merge_file_name(f, "dwconv2d_fprop", "simt", 2)
write_merge_file_name(f, "dwconv2d_fprop", "tensorop884", 4)
write_merge_file_name(f, "dwconv2d_dgrad", "simt", 2)
write_merge_file_name(f, "dwconv2d_dgrad", "tensorop884", 4)
write_merge_file_name(f, "dwconv2d_wgrad", "simt", 2)
write_merge_file_name(f, "dwconv2d_wgrad", "tensorop884", 4)
f.write("]") f.write("]")

dnn/scripts/cutlass_generator/generator.py (+207, −90)

@@ -1656,108 +1656,219 @@ def GenerateGemvOperations(args):
     )
     return GenerateGemv_Simt(args)
 
 
-def concat_file(file_path: str, file_name_first: str, file_name_last: str, head: str, required_cuda_ver_major: str, required_cuda_ver_minor: str, epilogue: str, wrapper_path=None):
+################################################################################
+# parameters
+#   split_number            - the concatenated output is divided into split_number parts
+#   file_path               - the path of the files to be concatenated
+#   operations              - args.operations
+#   type                    - args.type
+#   head                    - the head in the file
+#   required_cuda_ver_major - required CUDA major version
+#   required_cuda_ver_minor - required CUDA minor version
+#   epilogue                - the epilogue in the file
+#   wrapper_path            - wrapper path
+################################################################################
+def ConcatFile(split_number: int, file_path: str, operations: str, type: str, head: str, required_cuda_ver_major: str, required_cuda_ver_minor: str, epilogue: str, wrapper_path=None):
     import os
     meragefiledir = file_path
-    filenames = os.listdir(meragefiledir)
-    file1 = open(file_path + '/{}_{}_1.cu'.format(file_name_first, file_name_last), 'w')
-    file2 = open(file_path + '/{}_{}_2.cu'.format(file_name_first, file_name_last), 'w')
-    if wrapper_path is None:
-        file1.write(SubstituteTemplate(head, {
-            "required_cuda_ver_major": str(required_cuda_ver_major),
-            "required_cuda_ver_minor": str(required_cuda_ver_minor),
-        }))
-        file2.write(SubstituteTemplate(head, {
-            "required_cuda_ver_major": str(required_cuda_ver_major),
-            "required_cuda_ver_minor": str(required_cuda_ver_minor),
-        }))
-    else:
-        file1.write(SubstituteTemplate(head, {
-            "wrapper_path": wrapper_path,
-            "required_cuda_ver_major": str(required_cuda_ver_major),
-            "required_cuda_ver_minor": str(required_cuda_ver_minor),
-        }))
-        file2.write(SubstituteTemplate(head, {
-            "wrapper_path": wrapper_path,
-            "required_cuda_ver_major": str(required_cuda_ver_major),
-            "required_cuda_ver_minor": str(required_cuda_ver_minor),
-        }))
-    flag = 0
-    if "tensorop" in file_name_last:
+    filenames = os.listdir(meragefiledir)
+    # filter file
+    if "tensorop" in type:
         sub_string_1 = "tensorop"
-        sub_string_2 = file_name_last[8:]
+        sub_string_2 = type[8:]
     else:
         sub_string_1 = sub_string_2 = "simt"
-    if "dwconv2d_" in file_name_first:
-        file_name_first = file_name_first[:2] + file_name_first[9:]
-    elif ("conv2d" in file_name_first) or ("deconv" in file_name_first):
-        file_name_first = "cutlass"
+    if "dwconv2d_" in operations:
+        filtered_operations = operations[:2] + operations[9:]
+    elif ("conv2d" in operations) or ("deconv" in operations):
+        filtered_operations = "cutlass"
+    else:
+        filtered_operations = operations
+    # get the file list number
+    file_list = {}
+    file_list[operations + type] = 0
+    for filename in filenames:
+        if (filtered_operations in filename) and (sub_string_1 in filename) and (sub_string_2 in filename) and ("all_" not in filename):
+            file_list[operations + type] += 1
+    # concat file for linux
+    flag_1 = 0
+    flag_2 = 0
     for filename in filenames:
-        if (file_name_first in filename) and (sub_string_1 in filename) and (sub_string_2 in filename) and ("all_" not in filename):
-            flag += 1
+        if (filtered_operations in filename) and (sub_string_1 in filename) and (sub_string_2 in filename) and ("all_" not in filename):
+            flag_1 += 1
             filepath = meragefiledir + '/' + filename
-            if flag <= len(filenames) / 2:
+            if (flag_1 >= flag_2 * (file_list[operations + type] / split_number)) and (flag_1 <= (flag_2 + 1) * (file_list[operations + type] / split_number)):
+                file = open(file_path + '/{}_{}_{}.cu'.format(operations, type, flag_2), 'a')
+                # write Template at the head
+                if wrapper_path is None:
+                    file.write(SubstituteTemplate(head, {
+                        "required_cuda_ver_major": str(required_cuda_ver_major),
+                        "required_cuda_ver_minor": str(required_cuda_ver_minor),
+                    }))
+                else:
+                    file.write(SubstituteTemplate(head, {
+                        "wrapper_path": wrapper_path,
+                        "required_cuda_ver_major": str(required_cuda_ver_major),
+                        "required_cuda_ver_minor": str(required_cuda_ver_minor),
+                    }))
+                # concat all the remaining files into the last part
+                if flag_2 == (split_number - 1):
+                    for line in open(filepath):
+                        file.writelines(line)
+                    os.remove(filepath)
+                    file.write('\n')
+                    file.write(epilogue)
+                    continue
                 for line in open(filepath):
-                    file1.writelines(line)
+                    file.writelines(line)
+                os.remove(filepath)
+                file.write('\n')
+                file.write(epilogue)
             else:
+                # write Template at the head
+                if wrapper_path is None:
+                    file.write(SubstituteTemplate(head, {
+                        "required_cuda_ver_major": str(required_cuda_ver_major),
+                        "required_cuda_ver_minor": str(required_cuda_ver_minor),
+                    }))
+                else:
+                    file.write(SubstituteTemplate(head, {
+                        "wrapper_path": wrapper_path,
+                        "required_cuda_ver_major": str(required_cuda_ver_major),
+                        "required_cuda_ver_minor": str(required_cuda_ver_minor),
+                    }))
                 for line in open(filepath):
-                    file2.writelines(line)
-            os.remove(filepath)
-            file1.write('\n')
-            file2.write('\n')
+                    file.writelines(line)
+                os.remove(filepath)
+                file.write('\n')
+                file.write(epilogue)
+                file.close()
+                flag_2 += 1
+        # concat file for windows
         elif filename[0].isdigit() and ("all_" not in filename):
-            flag += 1
+            flag_1 += 1
             filepath = meragefiledir + '/' + filename
-            if flag <= len(filenames) / 2:
+            if (flag_1 >= flag_2 * (len(filenames) / split_number)) and (flag_1 <= (flag_2 + 1) * (len(filenames) / split_number)):
+                file = open(file_path + '/{}_{}_{}.cu'.format(operations, type, flag_2), 'a')
+                # write Template at the head
+                if wrapper_path is None:
+                    file.write(SubstituteTemplate(head, {
+                        "required_cuda_ver_major": str(required_cuda_ver_major),
+                        "required_cuda_ver_minor": str(required_cuda_ver_minor),
+                    }))
+                else:
+                    file.write(SubstituteTemplate(head, {
+                        "wrapper_path": wrapper_path,
+                        "required_cuda_ver_major": str(required_cuda_ver_major),
+                        "required_cuda_ver_minor": str(required_cuda_ver_minor),
+                    }))
+                # concat all the remaining files into the last part
+                if flag_2 == (split_number - 1):
+                    for line in open(filepath):
+                        file.writelines(line)
+                    os.remove(filepath)
+                    file.write('\n')
+                    file.write(epilogue)
+                    continue
                 for line in open(filepath):
-                    file1.writelines(line)
+                    file.writelines(line)
+                os.remove(filepath)
+                file.write('\n')
+                file.write(epilogue)
             else:
+                # write Template at the head
+                if wrapper_path is None:
+                    file.write(SubstituteTemplate(head, {
+                        "required_cuda_ver_major": str(required_cuda_ver_major),
+                        "required_cuda_ver_minor": str(required_cuda_ver_minor),
+                    }))
+                else:
+                    file.write(SubstituteTemplate(head, {
+                        "wrapper_path": wrapper_path,
+                        "required_cuda_ver_major": str(required_cuda_ver_major),
+                        "required_cuda_ver_minor": str(required_cuda_ver_minor),
+                    }))
                 for line in open(filepath):
-                    file2.writelines(line)
-            os.remove(filepath)
-            file1.write('\n')
-            file2.write('\n')
-    file1.write(epilogue)
-    file2.write(epilogue)
-    file1.close()
-    file2.close()
+                    file.writelines(line)
+                os.remove(filepath)
+                file.write('\n')
+                file.write(epilogue)
+                file.close()
+                flag_2 += 1
 
 
 ###################################################################################################
 ###################################################################################################
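
The heart of ConcatFile is the rollover rule that decides when the next merged part begins: the flag_1-th matching kernel file (1-based counter) stays in part flag_2 while flag_1 <= (flag_2 + 1) * (N / split_number). A simplified model of that arithmetic, with the helper name assign_buckets invented for illustration and the head/epilogue bookkeeping omitted:

def assign_buckets(n_files: int, split_number: int):
    # Model of ConcatFile's part assignment: flag_1 counts matched files,
    # flag_2 indexes the merged output part.
    buckets, flag_1, flag_2 = [], 0, 0
    for _ in range(n_files):
        flag_1 += 1
        # roll over once flag_1 passes the current part's share of n_files
        if flag_1 > (flag_2 + 1) * (n_files / split_number):
            flag_2 += 1
        buckets.append(flag_2)
    return buckets

# 10 generated kernels split 4 ways -> parts of roughly 10/4 files each
print(assign_buckets(10, 4))  # [0, 0, 1, 1, 1, 2, 2, 3, 3, 3]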
@@ -1833,7 +1944,10 @@ if __name__ == "__main__":
         required_cuda_ver_major = operations[0].required_cuda_ver_major
         required_cuda_ver_minor = operations[0].required_cuda_ver_minor
         epilogue = EmitConvSingleKernelWrapper(args.output, operations[0], short_path).epilogue_template
-        concat_file(args.output, args.operations, args.type, head, required_cuda_ver_major, required_cuda_ver_minor, epilogue)
+        if "tensorop" in args.type:
+            ConcatFile(4, args.output, args.operations, args.type, head, required_cuda_ver_major, required_cuda_ver_minor, epilogue)
+        else:
+            ConcatFile(2, args.output, args.operations, args.type, head, required_cuda_ver_major, required_cuda_ver_minor, epilogue)
     elif args.operations == "gemm":
         for operation in operations:
             with EmitGemmSingleKernelWrapper(
@@ -1844,7 +1958,10 @@ if __name__ == "__main__":
         required_cuda_ver_major = operations[0].required_cuda_ver_major
         required_cuda_ver_minor = operations[0].required_cuda_ver_minor
         epilogue = EmitGemmSingleKernelWrapper(args.output, operations[0], short_path).epilogue_template
-        concat_file(args.output, args.operations, args.type, head, required_cuda_ver_major, required_cuda_ver_minor, epilogue)
+        if args.type == "tensorop884":
+            ConcatFile(30, args.output, args.operations, args.type, head, required_cuda_ver_major, required_cuda_ver_minor, epilogue)
+        else:
+            ConcatFile(2, args.output, args.operations, args.type, head, required_cuda_ver_major, required_cuda_ver_minor, epilogue)
     elif args.operations == "gemv":
         for operation in operations:
             with EmitGemvSingleKernelWrapper(
@@ -1855,7 +1972,7 @@ if __name__ == "__main__":
         required_cuda_ver_major = operations[0].required_cuda_ver_major
         required_cuda_ver_minor = operations[0].required_cuda_ver_minor
         epilogue = EmitGemvSingleKernelWrapper(args.output, operations[0], gemv_wrapper_path, short_path).epilogue_template
-        concat_file(args.output, args.operations, args.type, head, required_cuda_ver_major, required_cuda_ver_minor, epilogue, wrapper_path=gemv_wrapper_path)
+        ConcatFile(2, args.output, args.operations, args.type, head, required_cuda_ver_major, required_cuda_ver_minor, epilogue, wrapper_path=gemv_wrapper_path)
 
     if args.operations != "gemv":
         GenerateManifest(args, operations, args.output)
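
Taken together, the three call sites choose split_number as follows. This is a hypothetical restatement of the branching above, not a function in the repo:

def split_number_for(operations: str, type: str) -> int:
    # gemm: the tensorop884 kernels are spread over 30 files
    if operations == "gemm":
        return 30 if type == "tensorop884" else 2
    if operations == "gemv":
        return 2
    # conv2d / deconv / dwconv2d_*: 4 parts for tensorop, 2 for simt
    return 4 if "tensorop" in type else 2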


dnn/scripts/cutlass_generator/list.bzl (+50, −10)

@@ -1,48 +1,88 @@
 # Generated by dnn/scripts/cutlass_generator/gen_list.py
 
 cutlass_gen_list = [
+    "gemm_simt_0.cu",
     "gemm_simt_1.cu",
-    "gemm_simt_2.cu",
     "all_gemm_simt_operations.cu",
-    "gemm_tensorop1688_1.cu",
-    "gemm_tensorop1688_2.cu",
-    "all_gemm_tensorop1688_operations.cu",
+    "gemm_tensorop884_0.cu",
     "gemm_tensorop884_1.cu",
     "gemm_tensorop884_2.cu",
+    "gemm_tensorop884_3.cu",
+    "gemm_tensorop884_4.cu",
+    "gemm_tensorop884_5.cu",
+    "gemm_tensorop884_6.cu",
+    "gemm_tensorop884_7.cu",
+    "gemm_tensorop884_8.cu",
+    "gemm_tensorop884_9.cu",
+    "gemm_tensorop884_10.cu",
+    "gemm_tensorop884_11.cu",
+    "gemm_tensorop884_12.cu",
+    "gemm_tensorop884_13.cu",
+    "gemm_tensorop884_14.cu",
+    "gemm_tensorop884_15.cu",
+    "gemm_tensorop884_16.cu",
+    "gemm_tensorop884_17.cu",
+    "gemm_tensorop884_18.cu",
+    "gemm_tensorop884_19.cu",
+    "gemm_tensorop884_20.cu",
+    "gemm_tensorop884_21.cu",
+    "gemm_tensorop884_22.cu",
+    "gemm_tensorop884_23.cu",
+    "gemm_tensorop884_24.cu",
+    "gemm_tensorop884_25.cu",
+    "gemm_tensorop884_26.cu",
+    "gemm_tensorop884_27.cu",
+    "gemm_tensorop884_28.cu",
+    "gemm_tensorop884_29.cu",
     "all_gemm_tensorop884_operations.cu",
+    "gemm_tensorop1688_0.cu",
+    "gemm_tensorop1688_1.cu",
+    "all_gemm_tensorop1688_operations.cu",
+    "gemv_simt_0.cu",
     "gemv_simt_1.cu",
-    "gemv_simt_2.cu",
+    "deconv_simt_0.cu",
     "deconv_simt_1.cu",
-    "deconv_simt_2.cu",
     "all_deconv_simt_operations.cu",
+    "deconv_tensorop8816_0.cu",
     "deconv_tensorop8816_1.cu",
     "deconv_tensorop8816_2.cu",
+    "deconv_tensorop8816_3.cu",
     "all_deconv_tensorop8816_operations.cu",
+    "conv2d_simt_0.cu",
     "conv2d_simt_1.cu",
-    "conv2d_simt_2.cu",
     "all_conv2d_simt_operations.cu",
+    "conv2d_tensorop8816_0.cu",
     "conv2d_tensorop8816_1.cu",
     "conv2d_tensorop8816_2.cu",
+    "conv2d_tensorop8816_3.cu",
     "all_conv2d_tensorop8816_operations.cu",
+    "conv2d_tensorop8832_0.cu",
     "conv2d_tensorop8832_1.cu",
     "conv2d_tensorop8832_2.cu",
+    "conv2d_tensorop8832_3.cu",
     "all_conv2d_tensorop8832_operations.cu",
+    "dwconv2d_fprop_simt_0.cu",
     "dwconv2d_fprop_simt_1.cu",
-    "dwconv2d_fprop_simt_2.cu",
     "all_dwconv2d_fprop_simt_operations.cu",
+    "dwconv2d_fprop_tensorop884_0.cu",
     "dwconv2d_fprop_tensorop884_1.cu",
     "dwconv2d_fprop_tensorop884_2.cu",
+    "dwconv2d_fprop_tensorop884_3.cu",
     "all_dwconv2d_fprop_tensorop884_operations.cu",
+    "dwconv2d_dgrad_simt_0.cu",
     "dwconv2d_dgrad_simt_1.cu",
-    "dwconv2d_dgrad_simt_2.cu",
     "all_dwconv2d_dgrad_simt_operations.cu",
+    "dwconv2d_dgrad_tensorop884_0.cu",
     "dwconv2d_dgrad_tensorop884_1.cu",
     "dwconv2d_dgrad_tensorop884_2.cu",
+    "dwconv2d_dgrad_tensorop884_3.cu",
     "all_dwconv2d_dgrad_tensorop884_operations.cu",
+    "dwconv2d_wgrad_simt_0.cu",
     "dwconv2d_wgrad_simt_1.cu",
-    "dwconv2d_wgrad_simt_2.cu",
     "all_dwconv2d_wgrad_simt_operations.cu",
+    "dwconv2d_wgrad_tensorop884_0.cu",
     "dwconv2d_wgrad_tensorop884_1.cu",
     "dwconv2d_wgrad_tensorop884_2.cu",
+    "dwconv2d_wgrad_tensorop884_3.cu",
     "all_dwconv2d_wgrad_tensorop884_operations.cu",
 ]
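
As a sanity check on the generated list (my own arithmetic, derived from the split counts in gen_list.py above, not part of the commit): the new list should hold one entry per merged part plus one all_<op>_<type>_operations.cu per (op, type) pair except gemv.

splits = {
    ("gemm", "simt"): 2, ("gemm", "tensorop884"): 30, ("gemm", "tensorop1688"): 2,
    ("gemv", "simt"): 2,
    ("deconv", "simt"): 2, ("deconv", "tensorop8816"): 4,
    ("conv2d", "simt"): 2, ("conv2d", "tensorop8816"): 4, ("conv2d", "tensorop8832"): 4,
    ("dwconv2d_fprop", "simt"): 2, ("dwconv2d_fprop", "tensorop884"): 4,
    ("dwconv2d_dgrad", "simt"): 2, ("dwconv2d_dgrad", "tensorop884"): 4,
    ("dwconv2d_wgrad", "simt"): 2, ("dwconv2d_wgrad", "tensorop884"): 4,
}
merged = sum(splits.values())                      # 70 merged kernel files
alls = sum(1 for op, _ in splits if op != "gemv")  # 14 all_*_operations.cu files
assert merged + alls == 84                         # matches the 84 entries above
# 84 entries plus the header comment, blank line, and the list delimiters
# account for the 88 lines of the new hunk (@@ -1,48 +1,88 @@).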
