
feat(dnn/cuda): add implicit bmm large kernel dwconv2d fprop impl

GitOrigin-RevId: feb09ebb58
release-1.8
Megvii Engine Team 王彪 3 years ago
parent commit 96050073a2
21 changed files with 4706 additions and 2626 deletions
 1. +649  -411  dnn/scripts/cutlass_generator/conv2d_operation.py
 2. +1013 -716  dnn/scripts/cutlass_generator/gemm_operation.py
 3. +1580 -824  dnn/scripts/cutlass_generator/generator.py
 4. +470  -423  dnn/scripts/cutlass_generator/library.py
 5. +310  -236  dnn/scripts/cutlass_generator/manifest.py
 6. +2    -0    dnn/src/CMakeLists.txt
 7. +27   -0    dnn/src/cuda/conv_bias/algo.cpp
 8. +65   -3    dnn/src/cuda/conv_bias/algo.h
 9. +140  -12   dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp
10. +95   -0    dnn/src/cuda/conv_bias/implicit_batched_gemm_float16_nchw_hmma.cpp
11. +95   -0    dnn/src/cuda/conv_bias/implicit_batched_gemm_float32_nchw_fma.cpp
12. +3    -0    dnn/src/cuda/conv_bias/opr_impl.h
13. +3    -0    dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp
14. +3    -0    dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp
15. +3    -0    dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nhwc_imma.cpp
16. +4    -0    dnn/src/cuda/cutlass/initialize_all.cu
17. +3    -0    dnn/src/cuda/cutlass/library.h
18. +21   -0    dnn/src/cuda/cutlass/library_internal.h
19. +4    -0    dnn/src/cuda/cutlass/operation_table.cpp
20. +14   -0    dnn/src/cuda/cutlass/operation_table.h
21. +202  -1    dnn/test/cuda/chanwise_convolution.cpp
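The new algorithms treat a channelwise (depthwise) convolution as a batch of small independent GEMMs, one per (batch, group) pair, instead of one large implicit GEMM over all channels. As a sketch of that mapping (our own illustration in plain C++, not code from this commit):

#include <cstddef>
#include <vector>

// Illustrative CPU reference (not part of the commit): depthwise conv2d fprop
// for NCHW with icpg == ocpg == 1, the case the new kernels target. Each
// (batch, channel) pair is an independent implicit GEMM with
//   M = Ho * Wo (output pixels), N = 1 (ocpg), K = Fh * Fw (filter taps),
// so the whole problem is a *batched* GEMM of N * groups tiny matrix
// products, hence "implicit batched GEMM" in the algorithm names.
void dwconv2d_fprop_ref(
        const std::vector<float>& src,     // N * C * Hi * Wi
        const std::vector<float>& filter,  // C * 1 * 1 * Fh * Fw (group layout)
        std::vector<float>& dst,           // N * C * Ho * Wo
        std::size_t N, std::size_t C, std::size_t Hi, std::size_t Wi,
        std::size_t Fh, std::size_t Fw, std::size_t Ho, std::size_t Wo,
        std::size_t ph, std::size_t pw, std::size_t sh, std::size_t sw) {
    for (std::size_t n = 0; n < N; ++n)
        for (std::size_t c = 0; c < C; ++c)          // one small GEMM per (n, c)
            for (std::size_t oh = 0; oh < Ho; ++oh)
                for (std::size_t ow = 0; ow < Wo; ++ow) {
                    float acc = 0.f;                  // the GEMM K-loop
                    for (std::size_t fh = 0; fh < Fh; ++fh)
                        for (std::size_t fw = 0; fw < Fw; ++fw) {
                            std::ptrdiff_t ih = std::ptrdiff_t(oh * sh + fh) -
                                                std::ptrdiff_t(ph);
                            std::ptrdiff_t iw = std::ptrdiff_t(ow * sw + fw) -
                                                std::ptrdiff_t(pw);
                            if (ih < 0 || iw < 0 || ih >= std::ptrdiff_t(Hi) ||
                                iw >= std::ptrdiff_t(Wi))
                                continue;  // zero padding
                            acc += src[((n * C + c) * Hi + ih) * Wi + iw] *
                                   filter[(c * Fh + fh) * Fw + fw];
                        }
                    dst[((n * C + c) * Ho + oh) * Wo + ow] = acc;
                }
}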

+649 -411   dnn/scripts/cutlass_generator/conv2d_operation.py   (file diff suppressed because it is too large)
+1013 -716  dnn/scripts/cutlass_generator/gemm_operation.py     (file diff suppressed because it is too large)
+1580 -824  dnn/scripts/cutlass_generator/generator.py          (file diff suppressed because it is too large)
+470 -423   dnn/scripts/cutlass_generator/library.py            (file diff suppressed because it is too large)


+310 -236  dnn/scripts/cutlass_generator/manifest.py

@@ -10,24 +10,25 @@ import shutil

from library import *
from gemm_operation import *
from conv2d_operation import *

###################################################################################################


class EmitOperationKindLibrary:
    def __init__(self, generated_path, kind, args):
        self.generated_path = generated_path
        self.kind = kind
        self.args = args

        self.emitters = {
            OperationKind.Gemm: EmitGemmConfigurationLibrary,
            OperationKind.Conv2d: EmitConv2dConfigurationLibrary,
        }

        self.configurations = []

        self.header_template = """
/*
 Generated by manifest.py - Do not edit.
*/
@@ -42,17 +43,19 @@ namespace library {

///////////////////////////////////////////////////////////////////////////////////////////////////

"""
        self.entry_template = """

//
// Entry point to construct operations
//
void initialize_all_${operation_name}_operations(Manifest &manifest) {
"""
        self.configuration_prototype_template = (
            "void initialize_${configuration_name}(Manifest &manifest);\n"
        )
        self.configuration_template = "  initialize_${configuration_name}(manifest);\n"

        self.epilogue_template = """

}

@@ -63,91 +66,118 @@ void initialize_all_${operation_name}_operations(Manifest &manifest) {

"""

    #
    def __enter__(self):
        self.operation_path = os.path.join(
            self.generated_path, OperationKindNames[self.kind]
        )
        os.mkdir(self.operation_path)

        self.top_level_path = os.path.join(
            self.operation_path, "all_%s_operations.cu" % OperationKindNames[self.kind]
        )

        self.top_level_file = open(self.top_level_path, "w")
        self.top_level_file.write(self.header_template)

        self.source_files = [self.top_level_path]

        return self

    #
    def emit(self, configuration_name, operations):

        with self.emitters[self.kind](
            self.operation_path, configuration_name
        ) as configuration_emitter:
            for operation in operations:
                configuration_emitter.emit(operation)

            self.source_files.append(configuration_emitter.configuration_path)

        self.configurations.append(configuration_name)
        self.top_level_file.write(
            SubstituteTemplate(
                self.configuration_prototype_template,
                {"configuration_name": configuration_name},
            )
        )

    #
    def __exit__(self, exception_type, exception_value, traceback):
        self.top_level_file.write(
            SubstituteTemplate(
                self.entry_template, {"operation_name": OperationKindNames[self.kind]}
            )
        )

        for configuration_name in self.configurations:
            self.top_level_file.write(
                SubstituteTemplate(
                    self.configuration_template,
                    {"configuration_name": configuration_name},
                )
            )

        self.top_level_file.write(self.epilogue_template)
        self.top_level_file.close()


###################################################################################################
###################################################################################################


class Options:
    def __init__(self):
        pass


###################################################################################################

#
class Manifest:

    #
    def __init__(self, args):
        self.operations = {}
        self.args = args

        architectures = (
            args.architectures.split(";") if len(args.architectures) else ["50"]
        )
        self.compute_capabilities = [int(x) for x in architectures]

        self.selected_kernels = []

        if args.operations == "all":
            self.operations_enabled = []
        else:

            operations_list = [OperationKind.Gemm, OperationKind.Conv2d]

            self.operations_enabled = [
                x
                for x in operations_list
                if OperationKindNames[x] in args.operations.split(",")
            ]

        if args.kernels == "all":
            self.kernel_names = []
        else:
            self.kernel_names = [x for x in args.kernels.split(",") if x != ""]

        self.ignore_kernel_names = [
            x for x in args.ignore_kernels.split(",") if x != ""
        ]

        if args.kernel_filter_file is None:
            self.kernel_filter_list = []
        else:
            self.kernel_filter_list = self.get_kernel_filters(args.kernel_filter_file)

        self.operation_count = 0
        self.operations_by_name = {}
        self.top_level_prologue = """

#include "cutlass/library/library.h"
#include "cutlass/library/manifest.h"
@@ -159,208 +189,241 @@ ${prototypes}

void initialize_all(Manifest &manifest) {

"""
        self.top_level_reserve = "  manifest.reserve(${operation_count});\n\n"
        self.top_level_epilogue = """
}

} // namespace library
} // namespace cutlass

"""

    def get_kernel_filters(self, kernelListFile):
        if os.path.isfile(kernelListFile):
            with open(kernelListFile, "r") as fileReader:
                lines = [
                    line.rstrip() for line in fileReader if not line.startswith("#")
                ]

            lines = [re.compile(line) for line in lines if line]
            return lines
        else:
            return []

    def filter_out_kernels(self, kernel_name, kernel_filter_list):

        for kernel_filter_re in kernel_filter_list:
            if kernel_filter_re.search(kernel_name) is not None:
                return True

        return False

    #
    def _filter_string_matches(self, filter_string, haystack):
        """ Returns true if all substrings appear in the haystack in order"""
        substrings = filter_string.split("*")
        for sub in substrings:
            idx = haystack.find(sub)
            if idx < 0:
                return False
            haystack = haystack[idx + len(sub) :]
        return True

    #
    def filter(self, operation):
        """ Filtering operations based on various criteria"""

        # filter based on compute capability
        enabled = False
        for cc in self.compute_capabilities:
            if (
                cc >= operation.tile_description.minimum_compute_capability
                and cc <= operation.tile_description.maximum_compute_capability
            ):

                enabled = True
                break

        if not enabled:
            return False

        if (
            len(self.operations_enabled)
            and not operation.operation_kind in self.operations_enabled
        ):
            return False

        # eliminate duplicates
        if operation.procedural_name() in self.operations_by_name.keys():
            return False

        # Filter based on list of valid substrings
        if len(self.kernel_names):
            name = operation.procedural_name()
            enabled = False

            # compare against the include list
            for name_substr in self.kernel_names:
                if self._filter_string_matches(name_substr, name):
                    enabled = True
                    break

            # compare against the exclude list
            for name_substr in self.ignore_kernel_names:
                if self._filter_string_matches(name_substr, name):
                    enabled = False
                    break

        if len(self.kernel_filter_list) > 0:
            enabled = False
            if self.filter_out_kernels(
                operation.procedural_name(), self.kernel_filter_list
            ):
                enabled = True

        # todo: filter based on compute data type
        return enabled

    #
    def append(self, operation):
        """
        Inserts the operation.

        operation_kind -> configuration_name -> []
        """

        if self.filter(operation):

            self.selected_kernels.append(operation.procedural_name())

            self.operations_by_name[operation.procedural_name()] = operation

            # add the configuration
            configuration_name = operation.configuration_name()

            if operation.operation_kind not in self.operations.keys():
                self.operations[operation.operation_kind] = {}

            if (
                configuration_name
                not in self.operations[operation.operation_kind].keys()
            ):
                self.operations[operation.operation_kind][configuration_name] = []

            self.operations[operation.operation_kind][configuration_name].append(
                operation
            )
            self.operation_count += 1

    #
    def emit(self, target=GeneratorTarget.Library):

        operation_emitters = {GeneratorTarget.Library: EmitOperationKindLibrary}

        generated_path = os.path.join(self.args.curr_build_dir, "generated")

        # create generated/
        if os.path.exists(generated_path):
            shutil.rmtree(generated_path)

        os.mkdir(generated_path)

        source_files = []

        top_level_path = os.path.join(generated_path, "initialize_all.cpp")
        with open(top_level_path, "w") as top_level_file:

            if target == GeneratorTarget.Library:
                source_files.append(top_level_path)

            prototypes = []
            for operation_kind, configurations in self.operations.items():
                prototypes.append(
                    SubstituteTemplate(
                        "void initialize_all_${operation_kind}_operations(Manifest &manifest);",
                        {"operation_kind": OperationKindNames[operation_kind]},
                    )
                )

            top_level_file.write(
                SubstituteTemplate(
                    self.top_level_prologue, {"prototypes": "\n".join(prototypes)}
                )
            )

            top_level_file.write(
                SubstituteTemplate(
                    self.top_level_reserve,
                    {"operation_count": str(self.operation_count)},
                )
            )

            # for each operation kind, emit initializer for all configurations
            for operation_kind, configurations in self.operations.items():

                with operation_emitters[target](
                    generated_path, operation_kind, self.args
                ) as operation_kind_emitter:
                    for configuration_name, operations in configurations.items():
                        operation_kind_emitter.emit(configuration_name, operations)

                    source_files += operation_kind_emitter.source_files

                top_level_file.write(
                    SubstituteTemplate(
                        "  initialize_all_${operation_kind}_operations(manifest);\n",
                        {"operation_kind": OperationKindNames[operation_kind]},
                    )
                )

            top_level_file.write(self.top_level_epilogue)

        # write the manifest.cmake file containing paths from all targets
        manifest_path = os.path.join(generated_path, "manifest.cmake")
        with open(manifest_path, "w") as manifest_file:

            target_name = "cutlass_library_objs"

            target_text = SubstituteTemplate(
                """cutlass_target_sources(
  ${target_name}
  BATCH_SOURCES ON
  PRIVATE
""",
                {"target_name": target_name},
            )

            manifest_file.write(target_text)

            for source_file in source_files:
                manifest_file.write("    %s\n" % str(source_file.replace("\\", "/")))
            manifest_file.write(")")

    #


###################################################################################################


def GenerateManifest(args, operations, output_dir):
    assert isinstance(operations, list)
    if len(operations) == 0:
        return
    op = operations[0]
    required_cuda_ver_major = op.required_cuda_ver_major
    required_cuda_ver_minor = op.required_cuda_ver_minor

    manifest_path = os.path.join(
        output_dir, "all_%s_%s_operations.cu" % (args.operations, args.type)
    )
    f = open(manifest_path, "w")
    f.write(
        """
/*
 Generated by generator.py - Do not edit.
*/
@@ -374,24 +437,35 @@ def GenerateManifest(args, operations, output_dir):

namespace cutlass {
namespace library {

"""
        % (
            str(required_cuda_ver_major),
            str(required_cuda_ver_major),
            str(required_cuda_ver_minor),
        )
    )

    for op in operations:
        f.write("void initialize_%s(Manifest &manifest);\n" % op.procedural_name())

    f.write(
        """
void initialize_all_%s_%s_operations(Manifest &manifest) {
"""
        % (args.operations, args.type)
    )

    for op in operations:
        f.write("  initialize_%s(manifest);\n" % op.procedural_name())

    f.write(
        """
}

} // namespace library
} // namespace cutlass

#endif
"""
    )
    f.close()
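For orientation, the templates above assemble a single generated initialize_all.cpp. Assuming a build with the Gemm and Conv2d kinds enabled and an illustrative operation count, the emitted file would look roughly like this (reconstructed by hand from top_level_prologue, top_level_reserve, the per-kind entry line, and top_level_epilogue; the real banner and count differ per build):

#include "cutlass/library/library.h"
#include "cutlass/library/manifest.h"

namespace cutlass {
namespace library {

void initialize_all_gemm_operations(Manifest &manifest);
void initialize_all_conv2d_operations(Manifest &manifest);

void initialize_all(Manifest &manifest) {
  manifest.reserve(1234);  // ${operation_count}, illustrative

  initialize_all_gemm_operations(manifest);
  initialize_all_conv2d_operations(manifest);
}

} // namespace library
} // namespace cutlass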

+2 -0  dnn/src/CMakeLists.txt

@@ -181,6 +181,8 @@ if(MGE_WITH_CUDA)
gen_cutlass_kimpl(conv2d simt CUTLASS_SOURCES)
gen_cutlass_kimpl(conv2d tensorop8816 CUTLASS_SOURCES)
gen_cutlass_kimpl(conv2d tensorop8832 CUTLASS_SOURCES)
gen_cutlass_kimpl(dwconv2d_fprop simt CUTLASS_SOURCES)
gen_cutlass_kimpl(dwconv2d_fprop tensorop884 CUTLASS_SOURCES)
list(APPEND SOURCES ${CUTLASS_SOURCES})
list(APPEND SOURCES ${CUSOURCES})
endif()


+27 -0  dnn/src/cuda/conv_bias/algo.cpp

@@ -92,6 +92,7 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() {
for (auto&& algo : int8_nchw4_dotprod) {
all_algos.push_back(&algo);
}
fill_dwconv_algos();
all_algos.push_back(&int8_chwn4_dotprod);
all_algos.push_back(&fallback_nchw_qs8);
for (size_t i = all_algo_size; i < all_algos.size(); ++i) {
@@ -301,6 +302,32 @@ void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() {
}
#endif

void ConvBiasForwardImpl::AlgoPack::fill_dwconv_algos() {
using AlgoParam = AlgoCutlassConvolutionBase::AlgoParam;
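    // Assumed AlgoParam field order (consistent with the kernel names checked
    // by the tests below, e.g. 128X128X8_32X64X8_2stage): threadblock m/n/k,
    // warp m/n/k, instruction m/n/k, number of pipeline stages.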
f32_implicit_bmm.emplace_back(AlgoParam{128, 128, 8, 32, 64, 8, 1, 1, 1, 2});
f32_implicit_bmm.emplace_back(AlgoParam{128, 64, 8, 64, 32, 8, 1, 1, 1, 2});
f32_implicit_bmm.emplace_back(AlgoParam{128, 32, 8, 64, 32, 8, 1, 1, 1, 2});
f32_implicit_bmm.emplace_back(AlgoParam{32, 128, 8, 32, 64, 8, 1, 1, 1, 2});
f32_implicit_bmm.emplace_back(AlgoParam{64, 128, 8, 64, 32, 8, 1, 1, 1, 2});
f32_implicit_bmm.emplace_back(AlgoParam{64, 64, 8, 64, 32, 8, 1, 1, 1, 2});
f32_implicit_bmm.emplace_back(AlgoParam{32, 64, 8, 32, 64, 8, 1, 1, 1, 2});
f32_implicit_bmm.emplace_back(AlgoParam{32, 32, 8, 32, 32, 8, 1, 1, 1, 2});
f32_implicit_bmm.emplace_back(AlgoParam{64, 32, 8, 64, 32, 8, 1, 1, 1, 2});
for (auto&& algo : f32_implicit_bmm) {
all_algos.push_back(&algo);
}
#if CUDA_VERSION >= 10020
f16_implicit_bmm.emplace_back(AlgoParam{128, 128, 32, 32, 32, 32, 8, 8, 4, 2});
f16_implicit_bmm.emplace_back(AlgoParam{128, 256, 32, 64, 64, 32, 8, 8, 4, 2});
f16_implicit_bmm.emplace_back(AlgoParam{128, 64, 32, 32, 32, 32, 8, 8, 4, 2});
f16_implicit_bmm.emplace_back(AlgoParam{64, 128, 32, 32, 32, 32, 8, 8, 4, 2});
f16_implicit_bmm.emplace_back(AlgoParam{64, 64, 32, 32, 32, 32, 8, 8, 4, 2});
for (auto&& algo : f16_implicit_bmm) {
all_algos.push_back(&algo);
}
#endif
}

void ConvBiasForwardImpl::AlgoPack::fill_dp4a_algos() {
using AlgoParam = AlgoInt8NCHW4DotProdImplicitGemm::AlgoParam;
int8_nchw4_dotprod.emplace_back(AlgoParam{128, 128, 32, 64, 32, 32, 1, 1, 4, 2});


+65 -3  dnn/src/cuda/conv_bias/algo.h

@@ -84,7 +84,9 @@ public:
CUDA_IMPLICIT_GEMM_1X1_SASS_NCHW32_IMMA_INT8,
CUDA_IMPLICIT_GEMM_SASS_NCHW64_IMMA_INT4_INT4,
CUDA_IMPLICIT_GEMM_SASS_NCHW64_IMMA_UINT4_INT4,
    CUDA_FALLBACK_NCHW_INT4,
CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32,
CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16,
};
using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;

@@ -503,6 +505,8 @@ public:
* +----+--- AlgoInt4Int4NHWCIMMAImplicitGemm
* +----+--- AlgoUInt4Int4NHWCIMMAImplicitGemm
* +
* +--- AlgoFloat32NCHWImplicitBatchedGemm
* +--- AlgoFloat16NCHWHMMAImplicitBatchedGemm
*/

/*
@@ -516,7 +520,13 @@ public:

// corresponds to cutlass::conv::ConvType. we hope that algo.h does not
// depend on cutlass headers
    enum class ConvType {
        kConvolution,
        kBatchConvolution,
        kLocal,
        kLocalShare,
        kDepthwiseConvolution,
    };

// common parameters for operation selection
struct AlgoParam {
@@ -558,7 +568,8 @@ public:
size_t wo, size_t ph, size_t pw, size_t sh, size_t sw, size_t dh, size_t dw,
const void* alpha, const void* beta, const void* gamma, const void* delta,
const void* theta, const void* threshold, const void* dst_scale,
            cudaStream_t stream, const void* extra_param = nullptr,
            size_t groups = 1) const;

protected:
AlgoParam m_algo_param;
@@ -992,6 +1003,54 @@ private:
};
#endif

class ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm final
: public AlgoCutlassConvolutionBase {
public:
AlgoFloat32NCHWFMAImplicitBatchedGemm(AlgoParam algo_param)
: AlgoCutlassConvolutionBase(algo_param) {
m_name = ConvBias::algo_name<ConvBias::DirectParam>(
ssprintf(
"FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM%s",
m_algo_param.to_string().c_str()),
ConvBias::DirectParam{});
}
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& /* args */) const override {
return 0;
}
void exec(const ExecArgs& args) const override;
const char* name() const override { return m_name.c_str(); };
AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32);

private:
std::string m_name;
};

class ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm final
: public AlgoCutlassConvolutionBase {
public:
AlgoFloat16NCHWHMMAImplicitBatchedGemm(AlgoParam algo_param)
: AlgoCutlassConvolutionBase(algo_param) {
m_name = ConvBias::algo_name<ConvBias::DirectParam>(
ssprintf(
"FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM%s",
m_algo_param.to_string().c_str()),
ConvBias::DirectParam{});
}
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& /* args */) const override {
return 0;
}
void exec(const ExecArgs& args) const override;
const char* name() const override { return m_name.c_str(); };
AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16);

private:
std::string m_name;
};

class ConvBiasForwardImpl::AlgoBFloat16 final : public AlgoBase {
public:
bool is_available(const SizeArgs& args) const override;
@@ -1048,6 +1107,8 @@ public:
std::vector<AlgoInt4Int4NHWCIMMAImplicitGemm> int4_int4_nhwc_imma;
std::vector<AlgoUInt4Int4NHWCIMMAImplicitGemm> uint4_int4_nhwc_imma;
#endif
std::vector<AlgoFloat32NCHWFMAImplicitBatchedGemm> f32_implicit_bmm;
std::vector<AlgoFloat16NCHWHMMAImplicitBatchedGemm> f16_implicit_bmm;
AlgoGroupConvGeneral group;
AlgoBFloat16 bfloat16;

@@ -1063,6 +1124,7 @@ private:
#endif
void fill_cudnn_algos();
void fill_dp4a_algos();
void fill_dwconv_algos();
};

} // namespace cuda


+140 -12  dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp

@@ -74,13 +74,18 @@ cutlass::conv::ConvType convert_conv_type(Base::ConvType conv_type) {
return cutlass::conv::ConvType::kLocal;
case Base::ConvType::kLocalShare:
return cutlass::conv::ConvType::kLocalShare;
case Base::ConvType::kDepthwiseConvolution:
return cutlass::conv::ConvType::kDepthwiseConvolution;
default:
megdnn_assert(0, "invalid conv type");
}
}

NumericTypeID convert_dtype(DType dtype) {
    // just make convolution with no bias happy
    if (!dtype.valid())
        return NumericTypeID::kF32;
    switch (dtype.enumv()) {
case DTypeEnum::Float32:
return NumericTypeID::kF32;
case DTypeEnum::Float16:
@@ -100,6 +105,21 @@ NumericTypeID convert_dtype(DTypeEnum dtype) {
}
}

NumericTypeID get_accumulator_dtype(
DType dtype, const param::ConvBias::ComputeMode comp_mode) {
if (dtype.category() == DTypeCategory::QUANTIZED) {
return NumericTypeID::kS32;
} else {
megdnn_assert(dtype.category() == DTypeCategory::FLOAT);
if (comp_mode == param::ConvBias::ComputeMode::DEFAULT) {
return convert_dtype(dtype);
} else {
megdnn_assert(comp_mode == param::ConvBias::ComputeMode::FLOAT32);
return NumericTypeID::kF32;
}
}
}

struct LayoutPack {
LayoutTypeID src;
LayoutTypeID filter;
@@ -149,6 +169,9 @@ LayoutPack get_layout_pack(const param::ConvBias::Format format, int access_type
default:
megdnn_assert(0, "invalid access_type");
}
case Format::NCHW:
return {LayoutTypeID::kTensorNCHW, LayoutTypeID::kTensorNCHW,
LayoutTypeID::kTensorNCHW, LayoutTypeID::kTensorNCHW};
default:
megdnn_assert(0, "invalid format");
}
@@ -177,6 +200,93 @@ EpilogueType get_epilogue_type(const param::ConvBias::NonlineMode mode, bool cla
megdnn_assert(0, "invalid nonlinear mode");
}

std::pair<int, int> get_tensor_alignment(
const param::ConvBias::Format format, const TensorLayout& src,
const TensorLayout& filter, const Base::AlgoParam& algo_param,
bool is_chanwise) {
int alignment_src = 0;
int alignment_filter = 0;

using Format = param::ConvBias::Format;

// get tensor alignment for tensor op operations
// for tensor op operations, the alignment is determined by the size of a vector
auto get_tensor_alignment_tensor_op = [&]() {
switch (format) {
/// case int8
case Format::NCHW32:
case Format::NCHW32_NCHW4:
alignment_src = 16;
alignment_filter = 16;
break;
/// case int4 or uint4
case Format::NCHW64:
alignment_src = 32;
alignment_filter = 32;
break;
case Format::NHWC:
alignment_src = alignment_filter = algo_param.access_size;
break;
default:
megdnn_throw("invalid format");
};
};

// get tensor alignment for dot product operations
// for integer dot product operations, alignment src is always 4
// and the alignment filter is determined by the threadblock shape
auto get_tensor_alignment_dp4a = [&]() {
megdnn_assert(
format == Format::NCHW4 || format == Format::NCHW4_NCHW ||
format == Format::NCHW4_NHWC || format == Format::NCHW4_NCHW32);
alignment_src = 4;
// determine alignment filter
constexpr int warp_size = 32;
int threads = warp_size * algo_param.threadblock_m * algo_param.threadblock_n *
algo_param.threadblock_k /
(algo_param.warp_m * algo_param.warp_n * algo_param.warp_k);
int threadblock_loads = filter.dtype.size(
algo_param.threadblock_m * algo_param.threadblock_n *
algo_param.threadblock_k);
int load_per_thread = threadblock_loads / threads;
if (load_per_thread >= 16)
alignment_filter = 16;
else if (load_per_thread >= 8)
alignment_filter = 8;
else {
megdnn_assert(load_per_thread >= 4);
alignment_filter = 4;
}
};

// get tensor alignment for depthwise convolution
auto get_tensor_alignment_dwconv2d_nchw = [&]() {
alignment_filter = 1;
size_t wi = src.dtype.size(src[3]); // width extent in bytes
for (size_t candidate : {16, 4, 2}) {
if (wi % candidate == 0) {
alignment_src = candidate;
break;
}
}
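        // alignment_src above is in bytes; dividing by the element size
        // (dtype.size(1) is the size of one element in bytes) converts it
        // back to a count of elements, which is what the kernel table expects.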
alignment_src /= src.dtype.size(1);
};

    if (format == Format::NCHW32 || format == Format::NCHW32_NCHW4 ||
        format == Format::NCHW64 || format == Format::NHWC) {
        get_tensor_alignment_tensor_op();
} else if (
format == Format::NCHW4 || format == Format::NCHW4_NCHW ||
format == Format::NCHW4_NHWC || format == Format::NCHW4_NCHW32) {
get_tensor_alignment_dp4a();
} else {
/// the following is used for depthwise convolution
megdnn_assert(format == Format::NCHW && is_chanwise);
get_tensor_alignment_dwconv2d_nchw();
}
megdnn_assert(alignment_src >= 1 && alignment_filter >= 1);
return {alignment_src, alignment_filter};
}
} // namespace

const Operation* ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_conv_op(
@@ -185,23 +295,36 @@ const Operation* ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_co
auto&& param = args.opr->param();
auto layouts = get_layout_pack(param.format, m_algo_param.access_size);
auto epilogue_type = get_epilogue_type(
            param.nonlineMode,
            args.dst_layout->dtype.category() != DTypeCategory::FLOAT);

cutlass::conv::SpecialOptimizeDesc special_optimization =
(use_conv_filter_unity_opt)
? cutlass::conv::SpecialOptimizeDesc::CONV_FILTER_UNITY
: cutlass::conv::SpecialOptimizeDesc::NONE;

int alignment_src, alignment_filter;
auto&& fm = args.filter_meta;
bool is_chanwise = param.sparse == param::ConvBias::Sparse::GROUP && fm.icpg == 1 &&
fm.ocpg == 1;
std::tie(alignment_src, alignment_filter) = get_tensor_alignment(
param.format, *args.src_layout, *args.filter_layout, m_algo_param,
is_chanwise);

auto accumulator_dtype =
get_accumulator_dtype(args.src_layout->dtype, param.compute_mode);

ConvolutionKey key{
convert_conv_op(conv_op),
            convert_dtype(args.src_layout->dtype),
            layouts.src,
            convert_dtype(args.filter_layout->dtype),
            layouts.filter,
            convert_dtype(args.dst_layout->dtype),
            layouts.dst,
            convert_dtype(args.bias_layout->dtype),
layouts.bias,
accumulator_dtype,
convert_conv_type(conv_type),
m_algo_param.threadblock_m,
m_algo_param.threadblock_n,
@@ -215,6 +338,8 @@ const Operation* ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_co
epilogue_type,
m_algo_param.stage,
special_optimization,
alignment_src,
alignment_filter,
without_shared_load};

return Singleton::get().operation_table.find_op(key);
@@ -227,13 +352,16 @@ void ConvBiasForwardImpl::AlgoCutlassConvolutionBase::execute_cutlass_conv_op(
size_t pw, size_t sh, size_t sw, size_t dh, size_t dw, const void* alpha,
const void* beta, const void* gamma, const void* delta, const void* theta,
const void* threshold, const void* dst_scale, cudaStream_t stream,
        const void* extra_param, size_t groups) const {
// gcc prints warnings when size_t values are implicitly narrowed to int
cutlass::conv::Conv2dProblemSize problem_size{
            int(n),  int(hi), int(wi), int(ci),
            int(co), int(fh), int(fw), int(ho),
            int(wo), int(ph), int(pw), int(sh),
            int(sw), int(dh), int(dw), cutlass::conv::Mode::kCrossCorrelation,
            1,            // split k slices, always 1
            int(groups),  // groups
    };

ConvolutionArguments conv_args{
problem_size, src, filter, bias, z, dst, alpha,

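To make the new groups plumbing concrete, here is how the problem size works out for one of the channelwise cases exercised by the tests below (src {8, 2, 16, 16}, filter {2, 1, 1, 15, 15}, padding 7, stride 1); the numbers are our own worked example, the struct layout is the one shown above:

// Channelwise fprop: N=8, C=2 (so groups=2, ci=co=2), 15x15 filter,
// pad 7, stride 1, dilation 1  =>  Ho = Wo = (16 + 2*7 - 15) / 1 + 1 = 16.
cutlass::conv::Conv2dProblemSize problem_size{
        8,  16, 16, 2,   // n, hi, wi, ci
        2,  15, 15, 16,  // co, fh, fw, ho
        16, 7,  7,  1,   // wo, ph, pw, sh
        1,  1,  1,  cutlass::conv::Mode::kCrossCorrelation,
        1,               // split k slices, always 1
        2,               // groups
};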

+95 -0  dnn/src/cuda/conv_bias/implicit_batched_gemm_float16_nchw_hmma.cpp

@@ -0,0 +1,95 @@
/**
* \file dnn/src/cuda/conv_bias/implicit_batched_gemm_float16_nchw_hmma.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/

#include "src/common/conv_bias.h"
#include "src/cuda/conv_bias/algo.h"
#include "src/cuda/convolution_helper/parameter.cuh"
#include "src/cuda/utils.h"

using namespace megdnn;
using namespace cuda;

bool ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::is_available(
const SizeArgs& args) const {
#define RETURN_IF_FALSE(stmt_) \
if (!(stmt_)) \
return false;
RETURN_IF_FALSE(
args.src_layout->is_contiguous() && args.dst_layout->is_contiguous());
using Param = param::ConvBias;
using Format = Param::Format;
using Sparse = Param::Sparse;
using Mode = Param::Mode;
auto&& param = args.opr->param();
auto&& fm = args.filter_meta;
RETURN_IF_FALSE(
param.format == Format::NCHW &&
args.src_layout->dtype.enumv() == DTypeEnum::Float16 &&
args.filter_layout->dtype.enumv() == DTypeEnum::Float16 &&
args.dst_layout->dtype.enumv() == DTypeEnum::Float16);
RETURN_IF_FALSE(
args.bias_layout->ndim <= 0 ||
(args.bias_layout->dtype.enumv() == DTypeEnum::Float16 &&
check_bias_share_in_channel(*args.bias_layout, param.format)));
RETURN_IF_FALSE(
args.z_layout->ndim <= 0 ||
args.z_layout->dtype.enumv() == DTypeEnum::Float16);
RETURN_IF_FALSE(param.sparse == Sparse::GROUP);
RETURN_IF_FALSE(param.mode == Mode::CROSS_CORRELATION);
// check if channelwise convolution
RETURN_IF_FALSE(fm.icpg == 1 && fm.ocpg == 1);
const auto* op = get_cutlass_conv_op(
args, ConvOperator::kFprop, ConvType::kDepthwiseConvolution, false, false);
RETURN_IF_FALSE(op != nullptr);
return true;
#undef RETURN_IF_FALSE
}

void ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::exec(
const ExecArgs& args) const {
auto&& param = args.opr->param();
auto&& fm = args.filter_meta;
size_t n = args.src_layout->operator[](0), hi = args.src_layout->operator[](2),
wi = args.src_layout->operator[](3);
size_t ho = args.dst_layout->operator[](2), wo = args.dst_layout->operator[](3);
size_t co = fm.group;
size_t ci = co;
// check if channelwise convolution
megdnn_assert(fm.icpg == 1 && fm.ocpg == 1);
auto&& stream = cuda_stream(args.opr->handle());

float alpha = 1.f;
float beta = args.bias_layout->ndim > 0 ? 1.f : 0.f;
void* bias_ptr = args.bias_layout->ndim > 0 ? args.bias_tensor->raw_ptr() : nullptr;
float gamma = args.z_layout->ndim > 0 ? 1.f : 0.f;
void* z_ptr = args.z_layout->ndim > 0 ? args.z_tensor->raw_ptr() : nullptr;

// dummy parameters, used for quantization cases
float theta = 0.f;
float delta = 0.f;
float threshold = 0.f;

const auto* op = get_cutlass_conv_op(
args, ConvOperator::kFprop, ConvType::kDepthwiseConvolution, false, false);

UNPACK_CONV_PARAMETER(fm, param);
MARK_USED_VAR
execute_cutlass_conv_op(
op, args.src_tensor->raw_ptr(), args.filter_tensor->raw_ptr(), bias_ptr,
z_ptr, args.dst_tensor->raw_ptr(), nullptr, n, hi, wi, ci, co, fh, fw, ho,
wo, ph, pw, sh, sw, dh, dw, &alpha, &beta, &gamma, &delta, &theta,
&threshold, nullptr, stream, nullptr, fm.group);

after_kernel_launch();
}

// vim: syntax=cpp.doxygen

+95 -0  dnn/src/cuda/conv_bias/implicit_batched_gemm_float32_nchw_fma.cpp

@@ -0,0 +1,95 @@
/**
* \file dnn/src/cuda/conv_bias/implicit_batched_gemm_float32_nchw_fma.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/

#include "src/common/conv_bias.h"
#include "src/cuda/conv_bias/algo.h"
#include "src/cuda/convolution_helper/parameter.cuh"
#include "src/cuda/utils.h"

using namespace megdnn;
using namespace cuda;

bool ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_available(
const SizeArgs& args) const {
#define RETURN_IF_FALSE(stmt_) \
if (!(stmt_)) \
return false;
RETURN_IF_FALSE(
args.src_layout->is_contiguous() && args.dst_layout->is_contiguous());
using Param = param::ConvBias;
using Format = Param::Format;
using Sparse = Param::Sparse;
using Mode = Param::Mode;
auto&& param = args.opr->param();
auto&& fm = args.filter_meta;
RETURN_IF_FALSE(
param.format == Format::NCHW &&
args.src_layout->dtype.enumv() == DTypeEnum::Float32 &&
args.filter_layout->dtype.enumv() == DTypeEnum::Float32 &&
args.dst_layout->dtype.enumv() == DTypeEnum::Float32);
RETURN_IF_FALSE(
args.bias_layout->ndim <= 0 ||
(args.bias_layout->dtype.enumv() == DTypeEnum::Float32 &&
check_bias_share_in_channel(*args.bias_layout, param.format)));
RETURN_IF_FALSE(
args.z_layout->ndim <= 0 ||
args.z_layout->dtype.enumv() == DTypeEnum::Float32);
RETURN_IF_FALSE(param.sparse == Sparse::GROUP);
RETURN_IF_FALSE(param.mode == Mode::CROSS_CORRELATION);
// check if channelwise convolution
RETURN_IF_FALSE(fm.icpg == 1 && fm.ocpg == 1);
const auto* op = get_cutlass_conv_op(
args, ConvOperator::kFprop, ConvType::kDepthwiseConvolution, false, false);
RETURN_IF_FALSE(op != nullptr);
return true;
#undef RETURN_IF_FALSE
}

void ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::exec(
const ExecArgs& args) const {
auto&& param = args.opr->param();
auto&& fm = args.filter_meta;
size_t n = args.src_layout->operator[](0), hi = args.src_layout->operator[](2),
wi = args.src_layout->operator[](3);
size_t ho = args.dst_layout->operator[](2), wo = args.dst_layout->operator[](3);
size_t co = fm.group;
size_t ci = co;
// check if channelwise convolution
megdnn_assert(fm.icpg == 1 && fm.ocpg == 1);
auto&& stream = cuda_stream(args.opr->handle());

float alpha = 1.f;
float beta = args.bias_layout->ndim > 0 ? 1.f : 0.f;
void* bias_ptr = args.bias_layout->ndim > 0 ? args.bias_tensor->raw_ptr() : nullptr;
float gamma = args.z_layout->ndim > 0 ? 1.f : 0.f;
void* z_ptr = args.z_layout->ndim > 0 ? args.z_tensor->raw_ptr() : nullptr;

// dummy parameters, used for quantization cases
float theta = 0.f;
float delta = 0.f;
float threshold = 0.f;

const auto* op = get_cutlass_conv_op(
args, ConvOperator::kFprop, ConvType::kDepthwiseConvolution, false, false);

UNPACK_CONV_PARAMETER(fm, param);
MARK_USED_VAR
execute_cutlass_conv_op(
op, args.src_tensor->raw_ptr(), args.filter_tensor->raw_ptr(), bias_ptr,
z_ptr, args.dst_tensor->raw_ptr(), nullptr, n, hi, wi, ci, co, fh, fw, ho,
wo, ph, pw, sh, sw, dh, dw, &alpha, &beta, &gamma, &delta, &theta,
&threshold, nullptr, stream, nullptr, fm.group);

after_kernel_launch();
}

// vim: syntax=cpp.doxygen

+3 -0  dnn/src/cuda/conv_bias/opr_impl.h

@@ -71,6 +71,9 @@ public:
class AlgoInt4Int4NHWCIMMAImplicitGemm;
class AlgoUInt4Int4NHWCIMMAImplicitGemm;
class AlgoBFloat16;
// The following algorithms are suitable for channel wise convolution
class AlgoFloat32NCHWFMAImplicitBatchedGemm;
class AlgoFloat16NCHWHMMAImplicitBatchedGemm;

class AlgoPack;



+3 -0  dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp

@@ -39,6 +39,7 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::
LayoutTypeID::kTensorNC4HW4,
NumericTypeID::kS32,
LayoutTypeID::kTensorNC4HW4,
NumericTypeID::kS32,
cutlass::conv::ConvType::kConvolution,
m_algo_param.threadblock_m,
m_algo_param.threadblock_n,
@@ -52,6 +53,8 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::
cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp,
m_algo_param.stage,
special_optimization,
4,
16,
false};
return (void*)Singleton::get().operation_table.find_op(key);
}


+3 -0  dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp

@@ -39,6 +39,7 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::
LayoutTypeID::kTensorNC4HW4,
NumericTypeID::kS32,
LayoutTypeID::kTensorNC4HW4,
NumericTypeID::kS32,
cutlass::conv::ConvType::kConvolution,
16,
64,
@@ -52,6 +53,8 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::
cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp,
2,
special_optimization,
4,
4,
false};
return (void*)Singleton::get().operation_table.find_op(key);
}


+3 -0  dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nhwc_imma.cpp

@@ -50,6 +50,7 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NHWCIMMAImplicitGemm::get_avail
LayoutTypeID::kTensorNHWC,
NumericTypeID::kS32,
LayoutTypeID::kTensorNHWC,
NumericTypeID::kS32,
cutlass::conv::ConvType::kConvolution,
m_algo_param.threadblock_m,
m_algo_param.threadblock_n,
@@ -63,6 +64,8 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NHWCIMMAImplicitGemm::get_avail
cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp,
m_algo_param.stage,
special_optimization,
m_algo_param.access_size,
m_algo_param.access_size,
false};
return (void*)Singleton::get().operation_table.find_op(key);
}


+4 -0  dnn/src/cuda/cutlass/initialize_all.cu

@@ -54,24 +54,28 @@ namespace library {
void initialize_all_gemm_simt_operations(Manifest& manifest);
void initialize_all_conv2d_simt_operations(Manifest& manifest);
void initialize_all_deconv_simt_operations(Manifest& manifest);
void initialize_all_dwconv2d_fprop_simt_operations(Manifest& manifest);
#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED
void initialize_all_gemm_tensorop884_operations(Manifest& manifest);
void initialize_all_gemm_tensorop1688_operations(Manifest& manifest);
void initialize_all_conv2d_tensorop8816_operations(Manifest& manifest);
void initialize_all_conv2d_tensorop8832_operations(Manifest& manifest);
void initialize_all_deconv_tensorop8816_operations(Manifest& manifest);
void initialize_all_dwconv2d_fprop_tensorop884_operations(Manifest& manifest);
#endif

void initialize_all(Manifest& manifest) {
initialize_all_gemm_simt_operations(manifest);
initialize_all_conv2d_simt_operations(manifest);
initialize_all_deconv_simt_operations(manifest);
initialize_all_dwconv2d_fprop_simt_operations(manifest);
#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED
initialize_all_gemm_tensorop884_operations(manifest);
initialize_all_gemm_tensorop1688_operations(manifest);
initialize_all_conv2d_tensorop8816_operations(manifest);
initialize_all_conv2d_tensorop8832_operations(manifest);
initialize_all_deconv_tensorop8816_operations(manifest);
initialize_all_dwconv2d_fprop_tensorop884_operations(manifest);
#endif
}



+3 -0  dnn/src/cuda/cutlass/library.h

@@ -223,6 +223,9 @@ enum class ThreadblockSwizzleID {
kConvolutionFpropTrans,
kConvolutionDgradNCxHWx,
kConvolutionDgradTrans,
kDepthwiseConvolutionFprop,
kDepthwiseConvolutionDgrad,
kDepthwiseConvolutionWgrad,
kInvalid
};



+21 -0  dnn/src/cuda/cutlass/library_internal.h

@@ -570,6 +570,27 @@ struct ThreadblockSwizzleMap<
ThreadblockSwizzleID::kConvolutionDgradTrans;
};

template <>
struct ThreadblockSwizzleMap<
conv::threadblock::DepthwiseConvolutionFpropThreadblockSwizzle> {
static ThreadblockSwizzleID const kId =
ThreadblockSwizzleID::kDepthwiseConvolutionFprop;
};

template <>
struct ThreadblockSwizzleMap<
conv::threadblock::DepthwiseConvolutionDgradThreadblockSwizzle> {
static ThreadblockSwizzleID const kId =
ThreadblockSwizzleID::kDepthwiseConvolutionDgrad;
};

template <>
struct ThreadblockSwizzleMap<
conv::threadblock::DepthwiseConvolutionWgradThreadblockSwizzle> {
static ThreadblockSwizzleID const kId =
ThreadblockSwizzleID::kDepthwiseConvolutionWgrad;
};
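These specializations extend the existing trait-map pattern in this header: each threadblock-swizzle type is mapped at compile time to a runtime ThreadblockSwizzleID so that generated kernels can describe themselves to the operation table. A lookup (our illustration, not part of the diff) is just:

// Compile-time type -> runtime enum ID; usable in constant expressions since
// kId is an in-class-initialized static const of enum type.
constexpr ThreadblockSwizzleID id = ThreadblockSwizzleMap<
        conv::threadblock::DepthwiseConvolutionFpropThreadblockSwizzle>::kId;
static_assert(
        id == ThreadblockSwizzleID::kDepthwiseConvolutionFprop, "trait map");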

/////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Element, typename Layout>


+4 -0  dnn/src/cuda/cutlass/operation_table.cpp

@@ -99,6 +99,8 @@ ConvolutionKey get_convolution_key_from_desc(const ConvolutionDescription& desc)
key.layout_dst = desc.dst.layout;
key.element_bias = desc.bias.element;
key.layout_bias = desc.bias.layout;
key.element_accumulator =
desc.tile_description.math_instruction.element_accumulator;

key.convolution_type = desc.convolution_type;

@@ -124,6 +126,8 @@ ConvolutionKey get_convolution_key_from_desc(const ConvolutionDescription& desc)

key.stages = desc.tile_description.threadblock_stages;
key.special_optimization = desc.special_optimization;
key.alignment_src = desc.src.alignment;
key.alignment_filter = desc.filter.alignment;
key.without_shared_load = desc.without_shared_load;

return key;


+14 -0  dnn/src/cuda/cutlass/operation_table.h

@@ -188,6 +188,7 @@ struct ConvolutionKey {
library::LayoutTypeID layout_dst;
library::NumericTypeID element_bias;
library::LayoutTypeID layout_bias;
NumericTypeID element_accumulator;

conv::ConvType convolution_type;

@@ -206,6 +207,10 @@ struct ConvolutionKey {
epilogue::EpilogueType epilogue_type;
int stages;
conv::SpecialOptimizeDesc special_optimization;

int alignment_src;
int alignment_filter;

bool without_shared_load;

inline bool operator==(ConvolutionKey const& rhs) const {
@@ -215,6 +220,7 @@ struct ConvolutionKey {
(layout_filter == rhs.layout_filter) &&
(element_dst == rhs.element_dst) && (layout_dst == rhs.layout_dst) &&
(element_bias == rhs.element_bias) && (layout_bias == rhs.layout_bias) &&
(element_accumulator == rhs.element_accumulator) &&
(convolution_type == rhs.convolution_type) &&
(threadblock_shape_m == rhs.threadblock_shape_m) &&
(threadblock_shape_n == rhs.threadblock_shape_n) &&
@@ -227,6 +233,8 @@ struct ConvolutionKey {
(instruction_shape_k == rhs.instruction_shape_k) &&
(epilogue_type == rhs.epilogue_type) && (stages == rhs.stages) &&
(special_optimization == rhs.special_optimization) &&
(alignment_src == rhs.alignment_src) &&
(alignment_filter == rhs.alignment_filter) &&
(without_shared_load == rhs.without_shared_load);
}

@@ -254,6 +262,7 @@ struct ConvolutionKey {
"\n layout_dst: " + to_string(layout_dst) +
"\n element_bias: " + to_string(element_bias) +
"\n layout_bias: " + to_string(layout_bias) +
"\n element_accumulator: " + to_string(element_accumulator) +
"\n convolution_type: " + to_string(convolution_type) +
"\n threadblock_shape: " + threadblock_shape_str +
"\n warp_shape: " + warp_shape_str +
@@ -261,6 +270,8 @@ struct ConvolutionKey {
"\n epilogue_type: " + to_string(epilogue_type) +
"\n stages: " + std::to_string(stages) +
"\n special_optimization: " + to_string(special_optimization) +
"\n alignment_src: " + std::to_string(alignment_src) +
"\n alignment_filter: " + std::to_string(alignment_filter) +
"\n without_shared_load: " + to_string(without_shared_load) + "\n}";
}
};
@@ -278,6 +289,7 @@ struct ConvolutionKeyHasher {
.update(&key.layout_dst, sizeof(key.layout_dst))
.update(&key.element_bias, sizeof(key.element_bias))
.update(&key.layout_bias, sizeof(key.layout_bias))
.update(&key.element_accumulator, sizeof(key.element_accumulator))
.update(&key.convolution_type, sizeof(key.convolution_type))
.update(&key.threadblock_shape_m, sizeof(key.threadblock_shape_m))
.update(&key.threadblock_shape_n, sizeof(key.threadblock_shape_n))
@@ -291,6 +303,8 @@ struct ConvolutionKeyHasher {
.update(&key.epilogue_type, sizeof(key.epilogue_type))
.update(&key.stages, sizeof(key.stages))
.update(&key.special_optimization, sizeof(key.special_optimization))
.update(&key.alignment_src, sizeof(key.alignment_src))
.update(&key.alignment_filter, sizeof(key.alignment_filter))
.update(&key.without_shared_load, sizeof(key.without_shared_load))
.digest();
}


+202 -1  dnn/test/cuda/chanwise_convolution.cpp

@@ -38,8 +38,10 @@ bool check_need_full_bench() {
}
#endif

Convolution::Param gconv_param(Convolution::Param p, bool io16xc32 = false) {
p.sparse = Convolution::Param::Sparse::GROUP;
if (io16xc32)
p.compute_mode = Convolution::Param::ComputeMode::FLOAT32;
return p;
}

@@ -421,6 +423,129 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_FILTER) {
}
}

namespace {
template <typename Op>
struct AlgoCheckerMaker {
static auto make(const char* name, bool* require_algo) {
return AlgoChecker<Op>(name, require_algo);
}
};

template <>
struct AlgoCheckerMaker<ConvolutionForward> {
static auto make(const char* name, bool* require_algo) {
return AlgoChecker<ConvolutionForward>(
ExecutionPolicyAlgoName{
"DEFAULT",
{{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
name, {})
.c_str(),
{}}}},
require_algo);
}
};

template <typename Op>
void check_chanwise(DType io_type, DType comp_type, Handle* handle, const char* name) {
Checker<Op> checker(handle);
bool require_algo = false;
checker.set_before_exec_callback(AlgoCheckerMaker<Op>::make(name, &require_algo));
checker.set_dtype(0, io_type).set_dtype(1, io_type).set_dtype(2, io_type);
bool io16xc32 = false;
if (io_type == dtype::Float16()) {
if (comp_type == dtype::Float16()) {
checker.set_epsilon(1e-1);
} else {
io16xc32 = true;
}
}
// dispatch testcase by operation
if (std::is_same<Op, ConvolutionForward>::value) {
// align 8
checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
.execs({{8, 2, 16, 16}, {2, 1, 1, 15, 15}, {}});
// align 1
checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
.execs({{8, 2, 15, 15}, {2, 1, 1, 15, 15}, {}});
// align 2
checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
.execs({{8, 2, 14, 14}, {2, 1, 1, 15, 15}, {}});
// custom padding
checker.set_param(gconv_param({M, 3, 3, 1, 1}, io16xc32))
.execs({{8, 2, 16, 16}, {2, 1, 1, 15, 15}, {}});
// custom stride
checker.set_param(gconv_param({M, 7, 7, 2, 2}, io16xc32))
.execs({{8, 2, 16, 16}, {2, 1, 1, 15, 15}, {}});
} else if (std::is_same<Op, ConvolutionBackwardData>::value) {
// align 8
checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
.execs({{2, 1, 1, 15, 15}, {8, 2, 16, 16}, {8, 2, 16, 16}});
// align 1
checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
.execs({{2, 1, 1, 15, 15}, {8, 2, 15, 15}, {8, 2, 15, 15}});
// align 2
checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
.execs({{2, 1, 1, 15, 15}, {8, 2, 14, 14}, {8, 2, 14, 14}});
// custom padding
checker.set_param(gconv_param({M, 3, 3, 1, 1}, io16xc32))
.execs({{2, 1, 1, 15, 15}, {8, 2, 8, 8}, {8, 2, 16, 16}});
// custom stride
checker.set_param(gconv_param({M, 7, 7, 2, 2}, io16xc32))
.execs({{2, 1, 1, 15, 15}, {8, 2, 7, 7}, {8, 2, 14, 14}});
} else if (std::is_same<Op, ConvolutionBackwardFilter>::value) {
}
}
} // namespace

#define MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FWD_FMA_KERNEL(cb) \
cb(1, 128, 128, 8, 32, 64, 8); \
cb(2, 128, 64, 8, 64, 32, 8); \
cb(3, 128, 32, 8, 64, 32, 8); \
cb(4, 64, 128, 8, 64, 32, 8); \
cb(5, 32, 128, 8, 32, 64, 8); \
cb(6, 64, 64, 8, 64, 32, 8); \
cb(7, 32, 64, 8, 32, 64, 8); \
cb(8, 32, 32, 8, 32, 32, 8); \
cb(9, 64, 32, 8, 64, 32, 8);

#define cb(tag, tbm, tbn, tbk, wm, wn, wk) \
TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_FMA_##tag) { \
check_chanwise<ConvolutionForward>( \
dtype::Float32(), dtype::Float32(), handle_cuda(), \
"FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
"_" #wm "X" #wn "X" #wk "_2stage"); \
}

MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FWD_FMA_KERNEL(cb)

#undef cb
#undef MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FWD_FMA_KERNEL

#define MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FWD_HMMA_KERNEL(cb) \
cb(1, 128, 128, 32, 32, 32, 32); \
cb(2, 128, 256, 32, 64, 64, 32); \
cb(3, 128, 64, 32, 32, 32, 32); \
cb(4, 64, 128, 32, 32, 32, 32); \
cb(5, 64, 64, 32, 32, 32, 32);

// check both ioc16 and io16xc32
#define cb(tag, tbm, tbn, tbk, wm, wn, wk) \
TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_HMMA_##tag) { \
check_chanwise<ConvolutionForward>( \
dtype::Float16(), dtype::Float16(), handle_cuda(), \
"FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
"_" #wm "X" #wn "X" #wk "_2stage"); \
check_chanwise<ConvolutionForward>( \
dtype::Float16(), dtype::Float32(), handle_cuda(), \
"FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
"_" #wm "X" #wn "X" #wk "_2stage"); \
}

MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FWD_HMMA_KERNEL(cb)

#undef cb
#undef MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FWD_HMMA_KERNEL
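For reference, the first instantiation above, cb(1, 128, 128, 32, 32, 32, 32), expands to a test that checks both the pure fp16 and the io16xc32 paths:

TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_HMMA_1) {
    check_chanwise<ConvolutionForward>(
            dtype::Float16(), dtype::Float16(), handle_cuda(),
            "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_128X128X32_32X32X32_2stage");
    check_chanwise<ConvolutionForward>(
            dtype::Float16(), dtype::Float32(), handle_cuda(),
            "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_128X128X32_32X32X32_2stage");
}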

#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_BENCH_CHECK) {
auto handle = handle_cuda();
@@ -1123,6 +1248,82 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BWD_FILTER) {
// clang-format on
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_LARGE_KERNEL) {
CUBenchmarker<ConvolutionForward> bencher(handle_cuda());
size_t RUNS = 100;
bencher.set_display(false).set_times(RUNS);
std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
new OprProxy<ConvolutionForward>{true}};
bencher.set_proxy(proxy);

Convolution::Param param;
param.format = ConvBias::Param::Format::NCHW;
param.sparse = Convolution::Param::Sparse::GROUP;
NormalRNG rng;

auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) {
param.pad_h = f / 2;
param.pad_w = f / 2;
param.stride_h = s;
param.stride_w = s;
param.compute_mode = param::Convolution::ComputeMode::DEFAULT;

TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};

TensorLayout dst_layout;
auto opr = handle_cuda()->create_operator<Convolution>();
opr->param() = param;
opr->deduce_layout(
{src, dtype::Float32()}, {filter, dtype::Float32()}, dst_layout);
float bandwith = static_cast<float>(
src.total_nr_elems() + filter.total_nr_elems() +
dst_layout.total_nr_elems()) /
(1024 * 1024 * 1024) * 1e3;

bencher.set_param(param)
.set_dtype(0, dtype::Float32())
.set_dtype(1, dtype::Float32())
.set_dtype(2, dtype::Float32())
.set_rng(0, &rng)
.set_rng(1, &rng);
bencher.proxy()->target_execution_policy = {};
auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS;

bencher.set_param(param)
.set_dtype(0, dtype::Float16())
.set_dtype(1, dtype::Float16())
.set_dtype(2, dtype::Float16())
.set_rng(0, &rng)
.set_rng(1, &rng);
bencher.proxy()->target_execution_policy = {};
auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS;

bencher.proxy()->target_execution_policy.algo.reset();
param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
bencher.set_param(param);
auto time_in_ms_pseudo_fp16 = bencher.execs({src, filter, {}}) / RUNS;

printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
"float16: %.2fms %.2fGB/s "
"pseudo float16: %.2fms %.2fGB/s "
"speedup: "
"%0.2f (fp16/fp32) %.2f (fp16/pseudo fp16)\n",
s, src.to_string().c_str(), filter.to_string().c_str(), time_in_ms_fp32,
bandwith * 4 / time_in_ms_fp32, time_in_ms_fp16,
bandwith * 2 / time_in_ms_fp16, time_in_ms_pseudo_fp16,
bandwith * 2 / time_in_ms_pseudo_fp16, time_in_ms_fp32 / time_in_ms_fp16,
time_in_ms_pseudo_fp16 / time_in_ms_fp16);
};

// clang-format off
for (size_t b : {32, 64})
for (size_t f : {3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}) {
run(b, 384, 32, 32, f, 1);
run(b, 384, 64, 64, f, 1);
}
// clang-format on
}

#endif

// vim: syntax=cpp.doxygen
