Compare commits

...

18 Commits

Author SHA1 Message Date
  megvii-mge c42ce93705 feat(mge/third_party): update cutlass version 3 years ago
  温娟 9902ccfcb0 chore(release): bump version 3 years ago
  Megvii Engine Team 8e5410e41f feat(cuda): add fp16 compute 16 kernel 3 years ago
  Megvii Engine Team 472e2f9655 refactor(cuda): depthwish large kernel 3 years ago
  Megvii Engine Team e698ec20c2 feat(cuda): float16 depthwise large kernel conv compute fp32 3 years ago
  Megvii Engine Team 48406382ce feat(cuda): support float16 depthwise large kernel conv 3 years ago
  Megvii Engine Team 7042f76b34 perf(cuda): speedup conv backward data with small feature map and large filter size 3 years ago
  Megvii Engine Team 87a2aeebb1 perf(cuda): speedup chanwise conv with small feature map and large filter size 3 years ago
  Megvii Engine Team 2293385e93 feat(mge): add conv padding mode 3 years ago
  Megvii Engine Team afe9c4b50d feat(dnn/cuda): add implicit bmm kernels for large kernel depthwise convolution backward filter opr 3 years ago
  Megvii Engine Team e8a169292f feat(dnn/cuda): add heuristic rule for implicit batched gemm large kernel dwconv2d kernels 3 years ago
  Megvii Engine Team 38067472d2 fix(dnn/cuda): fix ci 3 years ago
  Megvii Engine Team 1da58ae17a feat(dnn/cuda): add implicit bmm large kernel dwconv2d dgrad kernels 3 years ago
  Megvii Engine Team 96050073a2 feat(dnn/cuda): add implicit bmm large kernel dwconv2d fprop impl 3 years ago
  温娟 19fe2e94e7 chore(release): bump version 3 years ago
  Megvii Engine Team 1add4517ad test(trace): test subtensor on unknown shape 3 years ago
  Megvii Engine Team 54eef55871 fix(trace): assume result is not scalar when shape is valid 3 years ago
  Megvii Engine Team 84d99d1cc4 fix(traced_module): fix Module compatible issue and traced module getattr check 3 years ago
81 changed files with 9433 additions and 3479 deletions
  1. dnn/scripts/cutlass_generator/BUILD (+6, -0)
  2. dnn/scripts/cutlass_generator/conv2d_operation.py (+756, -412)
  3. dnn/scripts/cutlass_generator/gemm_operation.py (+1013, -716)
  4. dnn/scripts/cutlass_generator/gen_list.py (+15, -0)
  5. dnn/scripts/cutlass_generator/generator.py (+1636, -824)
  6. dnn/scripts/cutlass_generator/library.py (+471, -423)
  7. dnn/scripts/cutlass_generator/list.bzl (+978, -774)
  8. dnn/scripts/cutlass_generator/manifest.py (+310, -236)
  9. dnn/src/CMakeLists.txt (+6, -0)
  10. dnn/src/cuda/conv_bias/algo.cpp (+31, -0)
  11. dnn/src/cuda/conv_bias/algo.h (+87, -4)
  12. dnn/src/cuda/conv_bias/chanwise/depthwise_large_filter.cuh (+179, -0)
  13. dnn/src/cuda/conv_bias/chanwise/depthwise_large_filter_algo.cuh (+914, -0)
  14. dnn/src/cuda/conv_bias/chanwise/fwd_large_filter.cu (+56, -0)
  15. dnn/src/cuda/conv_bias/chanwise/kern.cuh (+13, -6)
  16. dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp (+141, -12)
  17. dnn/src/cuda/conv_bias/depthwise_large_filter.cpp (+111, -0)
  18. dnn/src/cuda/conv_bias/implicit_batched_gemm_float16_nchw_hmma.cpp (+99, -0)
  19. dnn/src/cuda/conv_bias/implicit_batched_gemm_float32_nchw_fma.cpp (+99, -0)
  20. dnn/src/cuda/conv_bias/opr_impl.cpp (+14, -1)
  21. dnn/src/cuda/conv_bias/opr_impl.h (+4, -0)
  22. dnn/src/cuda/convolution/backward_data/algo.cpp (+36, -0)
  23. dnn/src/cuda/convolution/backward_data/algo.h (+98, -1)
  24. dnn/src/cuda/convolution/backward_data/depthwise_large_filter.cpp (+89, -0)
  25. dnn/src/cuda/convolution/backward_data/implicit_batched_gemm_float16_nchw_hmma.cpp (+149, -0)
  26. dnn/src/cuda/convolution/backward_data/implicit_batched_gemm_float32_nchw_fma.cpp (+145, -0)
  27. dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp (+16, -0)
  28. dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp (+3, -0)
  29. dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nhwc_imma.cpp (+3, -0)
  30. dnn/src/cuda/convolution/backward_filter/algo.cpp (+34, -0)
  31. dnn/src/cuda/convolution/backward_filter/algo.h (+81, -0)
  32. dnn/src/cuda/convolution/backward_filter/implicit_batched_gemm_float16_nchw_hmma.cpp (+172, -0)
  33. dnn/src/cuda/convolution/backward_filter/implicit_batched_gemm_float32_nchw_fma.cpp (+135, -0)
  34. dnn/src/cuda/convolution/chanwise/bwd_large_filter.cu (+54, -0)
  35. dnn/src/cuda/convolution/chanwise/kern.cuh (+13, -6)
  36. dnn/src/cuda/convolution/opr_impl.cpp (+33, -0)
  37. dnn/src/cuda/convolution/opr_impl.h (+5, -0)
  38. dnn/src/cuda/cutlass/convolution_operation.h (+162, -0)
  39. dnn/src/cuda/cutlass/initialize_all.cu (+23, -2)
  40. dnn/src/cuda/cutlass/library.h (+3, -0)
  41. dnn/src/cuda/cutlass/library_internal.h (+21, -0)
  42. dnn/src/cuda/cutlass/operation_table.cpp (+4, -0)
  43. dnn/src/cuda/cutlass/operation_table.h (+14, -1)
  44. dnn/src/cuda/cutlass/util.cu (+2, -0)
  45. dnn/src/cuda/fp16_help.cuh (+15, -0)
  46. dnn/src/cuda/matrix_mul/algos.cpp (+14, -9)
  47. dnn/src/cuda/matrix_mul/algos.h (+2, -2)
  48. dnn/src/cuda/matrix_mul/cutlass_float16_tensorop.cpp (+1, -1)
  49. dnn/src/cuda/matrix_mul/cutlass_float16_tensorop_split_k.cpp (+1, -1)
  50. dnn/src/cuda/matrix_mul/opr_impl.h (+2, -0)
  51. dnn/src/cuda/padding/opr_impl.cpp (+1, -0)
  52. dnn/src/cuda/padding/padding.cu (+3, -1)
  53. dnn/src/naive/padding/opr_impl.cpp (+6, -3)
  54. dnn/test/common/checker.h (+1, -0)
  55. dnn/test/cuda/chanwise_convolution.cpp (+429, -1)
  56. dnn/test/cuda/conv_bias.cpp (+202, -1)
  57. dnn/test/cuda/convolution.cpp (+202, -0)
  58. dnn/test/cuda/cutlass_matmul.cpp (+5, -3)
  59. dnn/test/cuda/padding.cpp (+30, -0)
  60. dnn/test/naive/padding.cpp (+30, -0)
  61. imperative/python/megengine/core/tensor/indexing.py (+9, -5)
  62. imperative/python/megengine/module/conv.py (+56, -2)
  63. imperative/python/megengine/module/conv_bn.py (+2, -0)
  64. imperative/python/megengine/module/module.py (+7, -5)
  65. imperative/python/megengine/module/qat/conv.py (+1, -0)
  66. imperative/python/megengine/module/qat/conv_bn.py (+1, -0)
  67. imperative/python/megengine/module/quantized/conv.py (+23, -1)
  68. imperative/python/megengine/module/quantized/conv_bn.py (+1, -0)
  69. imperative/python/megengine/traced_module/compat.py (+36, -0)
  70. imperative/python/megengine/traced_module/serialization.py (+1, -1)
  71. imperative/python/megengine/traced_module/traced_module.py (+6, -4)
  72. imperative/python/src/tensor.cpp (+3, -7)
  73. imperative/python/test/unit/core/test_indexing_op.py (+27, -0)
  74. imperative/python/test/unit/core/test_serialization.py (+14, -0)
  75. imperative/python/test/unit/module/test_module.py (+24, -0)
  76. imperative/python/test/unit/module/test_qat.py (+19, -2)
  77. imperative/python/test/unit/quantization/test_module.py (+8, -3)
  78. imperative/src/impl/transformations/scalar.cpp (+14, -7)
  79. src/core/include/megbrain/version.h (+1, -1)
  80. src/megbrain_build_config.h.in (+1, -0)
  81. third_party/cutlass (+1, -1)

dnn/scripts/cutlass_generator/BUILD (+6, -0)

@@ -13,6 +13,12 @@ genrule(
CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations conv2d --type simt $(@D)
CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations conv2d --type tensorop8816 $(@D)
CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations conv2d --type tensorop8832 $(@D)
CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_fprop --type simt $(@D)
CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_fprop --type tensorop884 $(@D)
CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_dgrad --type simt $(@D)
CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_dgrad --type tensorop884 $(@D)
CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_wgrad --type simt $(@D)
CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_wgrad --type tensorop884 $(@D)
""",
tools = ["//brain/megbrain/dnn/scripts/cutlass_generator:generator.py"],
visibility = ["//visibility:public"],


dnn/scripts/cutlass_generator/conv2d_operation.py (+756, -412)
File diff suppressed because it is too large


dnn/scripts/cutlass_generator/gemm_operation.py (+1013, -716)
File diff suppressed because it is too large


dnn/scripts/cutlass_generator/gen_list.py (+15, -0)

@@ -3,6 +3,9 @@ from generator import (
GenerateGemvOperations,
GenerateConv2dOperations,
GenerateDeconvOperations,
GenerateDwconv2dFpropOperations,
GenerateDwconv2dDgradOperations,
GenerateDwconv2dWgradOperations,
)


@@ -21,6 +24,12 @@ def write_op_list(f, gen_op, gen_type):
operations = GenerateConv2dOperations(GenArg(gen_op, gen_type))
elif gen_op == "deconv":
operations = GenerateDeconvOperations(GenArg(gen_op, gen_type))
elif gen_op == "dwconv2d_fprop":
operations = GenerateDwconv2dFpropOperations(GenArg(gen_op, gen_type))
elif gen_op == "dwconv2d_dgrad":
operations = GenerateDwconv2dDgradOperations(GenArg(gen_op, gen_type))
elif gen_op == "dwconv2d_wgrad":
operations = GenerateDwconv2dWgradOperations(GenArg(gen_op, gen_type))
for op in operations:
f.write(' "%s.cu",\n' % op.procedural_name())
if gen_op != "gemv":
@@ -40,4 +49,10 @@ if __name__ == "__main__":
write_op_list(f, "conv2d", "simt")
write_op_list(f, "conv2d", "tensorop8816")
write_op_list(f, "conv2d", "tensorop8832")
write_op_list(f, "dwconv2d_fprop", "simt")
write_op_list(f, "dwconv2d_fprop", "tensorop884")
write_op_list(f, "dwconv2d_dgrad", "simt")
write_op_list(f, "dwconv2d_dgrad", "tensorop884")
write_op_list(f, "dwconv2d_wgrad", "simt")
write_op_list(f, "dwconv2d_wgrad", "tensorop884")
f.write("]")

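The new depthwise kernels are wired up in three places that must stay in sync: the Bazel genrule above, the write_op_list calls in gen_list.py, and the gen_cutlass_kimpl calls in dnn/src/CMakeLists.txt further down. A minimal illustrative sketch of the (operation, type) pairs involved; the emit_all helper below is hypothetical and not part of the patch:

# Sketch only: the (operation, type) pairs added by this change. The helper
# simply replays them through gen_list.write_op_list, mirroring the elif chain
# shown in the gen_list.py diff above.
NEW_DWCONV2D_KERNELS = [
    ("dwconv2d_fprop", "simt"),
    ("dwconv2d_fprop", "tensorop884"),
    ("dwconv2d_dgrad", "simt"),
    ("dwconv2d_dgrad", "tensorop884"),
    ("dwconv2d_wgrad", "simt"),
    ("dwconv2d_wgrad", "tensorop884"),
]

def emit_all(f, write_op_list):
    # One generated .cu bucket per (operation, type) pair.
    for gen_op, gen_type in NEW_DWCONV2D_KERNELS:
        write_op_list(f, gen_op, gen_type)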
dnn/scripts/cutlass_generator/generator.py (+1636, -824)
File diff suppressed because it is too large


dnn/scripts/cutlass_generator/library.py (+471, -423)
File diff suppressed because it is too large


dnn/scripts/cutlass_generator/list.bzl (+978, -774)
File diff suppressed because it is too large


dnn/scripts/cutlass_generator/manifest.py (+310, -236)

@@ -10,24 +10,25 @@ import shutil

from library import *
from gemm_operation import *
from conv2d_operation import *
from conv2d_operation import *

###################################################################################################


class EmitOperationKindLibrary:
def __init__(self, generated_path, kind, args):
self.generated_path = generated_path
self.kind = kind
self.args = args
def __init__(self, generated_path, kind, args):
self.generated_path = generated_path
self.kind = kind
self.args = args

self.emitters = {
OperationKind.Gemm: EmitGemmConfigurationLibrary
, OperationKind.Conv2d: EmitConv2dConfigurationLibrary
}
self.emitters = {
OperationKind.Gemm: EmitGemmConfigurationLibrary,
OperationKind.Conv2d: EmitConv2dConfigurationLibrary,
}

self.configurations = [];
self.configurations = []

self.header_template ="""
self.header_template = """
/*
Generated by manifest.py - Do not edit.
*/
@@ -42,17 +43,19 @@ namespace library {
///////////////////////////////////////////////////////////////////////////////////////////////////

"""
self.entry_template = """
self.entry_template = """

//
// Entry point to construct operations
//
void initialize_all_${operation_name}_operations(Manifest &manifest) {
"""
self.configuration_prototype_template = "void initialize_${configuration_name}(Manifest &manifest);\n"
self.configuration_template =" initialize_${configuration_name}(manifest);\n"
self.configuration_prototype_template = (
"void initialize_${configuration_name}(Manifest &manifest);\n"
)
self.configuration_template = " initialize_${configuration_name}(manifest);\n"

self.epilogue_template ="""
self.epilogue_template = """

}

@@ -63,91 +66,118 @@ void initialize_all_${operation_name}_operations(Manifest &manifest) {

"""

#
def __enter__(self):
self.operation_path = os.path.join(self.generated_path, OperationKindNames[self.kind])
os.mkdir(self.operation_path)
#
def __enter__(self):
self.operation_path = os.path.join(
self.generated_path, OperationKindNames[self.kind]
)
os.mkdir(self.operation_path)

self.top_level_path = os.path.join(
self.operation_path, "all_%s_operations.cu" % OperationKindNames[self.kind]
)

self.top_level_file = open(self.top_level_path, "w")
self.top_level_file.write(self.header_template)

self.top_level_path = os.path.join(self.operation_path, "all_%s_operations.cu" % OperationKindNames[self.kind])
self.source_files = [self.top_level_path]

self.top_level_file = open(self.top_level_path, "w")
self.top_level_file.write(self.header_template)
return self

self.source_files = [self.top_level_path,]
#
def emit(self, configuration_name, operations):

return self
with self.emitters[self.kind](
self.operation_path, configuration_name
) as configuration_emitter:
for operation in operations:
configuration_emitter.emit(operation)

#
def emit(self, configuration_name, operations):
self.source_files.append(configuration_emitter.configuration_path)

with self.emitters[self.kind](self.operation_path, configuration_name) as configuration_emitter:
for operation in operations:
configuration_emitter.emit(operation)
self.source_files.append(configuration_emitter.configuration_path)
self.configurations.append(configuration_name)
self.top_level_file.write(
SubstituteTemplate(
self.configuration_prototype_template,
{"configuration_name": configuration_name},
)
)

self.configurations.append(configuration_name)
self.top_level_file.write(SubstituteTemplate(self.configuration_prototype_template, {'configuration_name': configuration_name} ))
#
def __exit__(self, exception_type, exception_value, traceback):
self.top_level_file.write(
SubstituteTemplate(
self.entry_template, {"operation_name": OperationKindNames[self.kind]}
)
)

#
def __exit__(self, exception_type, exception_value, traceback):
self.top_level_file.write(SubstituteTemplate(self.entry_template, {'operation_name': OperationKindNames[self.kind]}))
for configuration_name in self.configurations:
self.top_level_file.write(
SubstituteTemplate(
self.configuration_template,
{"configuration_name": configuration_name},
)
)

for configuration_name in self.configurations:
self.top_level_file.write(SubstituteTemplate(self.configuration_template, {'configuration_name': configuration_name}))
self.top_level_file.write(self.epilogue_template)
self.top_level_file.close()

self.top_level_file.write(self.epilogue_template)
self.top_level_file.close()

###################################################################################################
###################################################################################################


class Options:
def __init__(self):
pass
def __init__(self):
pass


###################################################################################################

#
class Manifest:

#
def __init__(self, args):
self.operations = {}
self.args = args
#
def __init__(self, args):
self.operations = {}
self.args = args

architectures = (
args.architectures.split(";") if len(args.architectures) else ["50"]
)
self.compute_capabilities = [int(x) for x in architectures]

architectures = args.architectures.split(';') if len(args.architectures) else ['50',]
self.compute_capabilities = [int(x) for x in architectures]
self.selected_kernels = []
if args.operations == 'all':
self.operations_enabled = []
else:
self.selected_kernels = []

operations_list = [
OperationKind.Gemm
, OperationKind.Conv2d
]
if args.operations == "all":
self.operations_enabled = []
else:

self.operations_enabled = [x for x in operations_list if OperationKindNames[x] in args.operations.split(',')]
operations_list = [OperationKind.Gemm, OperationKind.Conv2d]

if args.kernels == 'all':
self.kernel_names = []
else:
self.kernel_names = [x for x in args.kernels.split(',') if x != '']
self.operations_enabled = [
x
for x in operations_list
if OperationKindNames[x] in args.operations.split(",")
]

self.ignore_kernel_names = [x for x in args.ignore_kernels.split(',') if x != '']
if args.kernels == "all":
self.kernel_names = []
else:
self.kernel_names = [x for x in args.kernels.split(",") if x != ""]

if args.kernel_filter_file is None:
self.kernel_filter_list = []
else:
self.kernel_filter_list = self.get_kernel_filters(args.kernel_filter_file)
self.ignore_kernel_names = [
x for x in args.ignore_kernels.split(",") if x != ""
]

if args.kernel_filter_file is None:
self.kernel_filter_list = []
else:
self.kernel_filter_list = self.get_kernel_filters(args.kernel_filter_file)

self.operation_count = 0
self.operations_by_name = {}
self.top_level_prologue = '''
self.operation_count = 0
self.operations_by_name = {}
self.top_level_prologue = """

#include "cutlass/library/library.h"
#include "cutlass/library/manifest.h"
@@ -159,208 +189,241 @@ ${prototypes}

void initialize_all(Manifest &manifest) {

'''
self.top_level_reserve = ' manifest.reserve(${operation_count});\n\n'
self.top_level_epilogue = '''
"""
self.top_level_reserve = " manifest.reserve(${operation_count});\n\n"
self.top_level_epilogue = """
}

} // namespace library
} // namespace cutlass

'''


def get_kernel_filters (self, kernelListFile):
if os.path.isfile(kernelListFile):
with open(kernelListFile, 'r') as fileReader:
lines = [line.rstrip() for line in fileReader if not line.startswith("#")]
lines = [re.compile(line) for line in lines if line]
return lines
else:
return []
"""

def get_kernel_filters(self, kernelListFile):
if os.path.isfile(kernelListFile):
with open(kernelListFile, "r") as fileReader:
lines = [
line.rstrip() for line in fileReader if not line.startswith("#")
]

lines = [re.compile(line) for line in lines if line]
return lines
else:
return []

def filter_out_kernels(self, kernel_name, kernel_filter_list):
def filter_out_kernels(self, kernel_name, kernel_filter_list):

for kernel_filter_re in kernel_filter_list:
if kernel_filter_re.search(kernel_name) is not None:
return True
return False
for kernel_filter_re in kernel_filter_list:
if kernel_filter_re.search(kernel_name) is not None:
return True

#
def _filter_string_matches(self, filter_string, haystack):
''' Returns true if all substrings appear in the haystack in order'''
substrings = filter_string.split('*')
for sub in substrings:
idx = haystack.find(sub)
if idx < 0:
return False
haystack = haystack[idx + len(sub):]
return True

#
def filter(self, operation):
''' Filtering operations based on various criteria'''

# filter based on compute capability
enabled = False
for cc in self.compute_capabilities:
if cc >= operation.tile_description.minimum_compute_capability and \
cc <= operation.tile_description.maximum_compute_capability:

enabled = True
break

if not enabled:
return False

if len(self.operations_enabled) and not operation.operation_kind in self.operations_enabled:
return False

# eliminate duplicates
if operation.procedural_name() in self.operations_by_name.keys():
return False

# Filter based on list of valid substrings
if len(self.kernel_names):
name = operation.procedural_name()
enabled = False

# compare against the include list
for name_substr in self.kernel_names:
if self._filter_string_matches(name_substr, name):
enabled = True
break

# compare against the exclude list
for name_substr in self.ignore_kernel_names:
if self._filter_string_matches(name_substr, name):
enabled = False
break
if len(self.kernel_filter_list) > 0:

#
def _filter_string_matches(self, filter_string, haystack):
""" Returns true if all substrings appear in the haystack in order"""
substrings = filter_string.split("*")
for sub in substrings:
idx = haystack.find(sub)
if idx < 0:
return False
haystack = haystack[idx + len(sub) :]
return True

#
def filter(self, operation):
""" Filtering operations based on various criteria"""

# filter based on compute capability
enabled = False
if self.filter_out_kernels(operation.procedural_name(), self.kernel_filter_list):
enabled = True
for cc in self.compute_capabilities:
if (
cc >= operation.tile_description.minimum_compute_capability
and cc <= operation.tile_description.maximum_compute_capability
):

enabled = True
break

if not enabled:
return False

if (
len(self.operations_enabled)
and not operation.operation_kind in self.operations_enabled
):
return False

# eliminate duplicates
if operation.procedural_name() in self.operations_by_name.keys():
return False

# Filter based on list of valid substrings
if len(self.kernel_names):
name = operation.procedural_name()
enabled = False

# compare against the include list
for name_substr in self.kernel_names:
if self._filter_string_matches(name_substr, name):
enabled = True
break

# compare against the exclude list
for name_substr in self.ignore_kernel_names:
if self._filter_string_matches(name_substr, name):
enabled = False
break

if len(self.kernel_filter_list) > 0:
enabled = False
if self.filter_out_kernels(
operation.procedural_name(), self.kernel_filter_list
):
enabled = True

# todo: filter based on compute data type
return enabled

#

#
def append(self, operation):
"""
Inserts the operation.

operation_kind -> configuration_name -> []
"""

# todo: filter based on compute data type
return enabled
#
if self.filter(operation):

#
def append(self, operation):
'''
Inserts the operation.
self.selected_kernels.append(operation.procedural_name())

operation_kind -> configuration_name -> []
'''
self.operations_by_name[operation.procedural_name()] = operation

if self.filter(operation):
self.selected_kernels.append(operation.procedural_name())
# add the configuration
configuration_name = operation.configuration_name()

self.operations_by_name[operation.procedural_name()] = operation
if operation.operation_kind not in self.operations.keys():
self.operations[operation.operation_kind] = {}

# add the configuration
configuration_name = operation.configuration_name()
if (
configuration_name
not in self.operations[operation.operation_kind].keys()
):
self.operations[operation.operation_kind][configuration_name] = []

if operation.operation_kind not in self.operations.keys():
self.operations[operation.operation_kind] = {}
self.operations[operation.operation_kind][configuration_name].append(
operation
)
self.operation_count += 1

if configuration_name not in self.operations[operation.operation_kind].keys():
self.operations[operation.operation_kind][configuration_name] = []
#

self.operations[operation.operation_kind][configuration_name].append(operation)
self.operation_count += 1
#
#
def emit(self, target=GeneratorTarget.Library):

#
def emit(self, target = GeneratorTarget.Library):
operation_emitters = {GeneratorTarget.Library: EmitOperationKindLibrary}

operation_emitters = {
GeneratorTarget.Library: EmitOperationKindLibrary
}
generated_path = os.path.join(self.args.curr_build_dir, "generated")

generated_path = os.path.join(self.args.curr_build_dir, 'generated')
# create generated/
if os.path.exists(generated_path):
shutil.rmtree(generated_path)

# create generated/
if os.path.exists(generated_path):
shutil.rmtree(generated_path)
os.mkdir(generated_path)

os.mkdir(generated_path)
source_files = []

source_files = []
top_level_path = os.path.join(generated_path, "initialize_all.cpp")
with open(top_level_path, "w") as top_level_file:

top_level_path = os.path.join(generated_path, 'initialize_all.cpp')
with open(top_level_path, 'w') as top_level_file:
if target == GeneratorTarget.Library:
source_files.append(top_level_path)

if target == GeneratorTarget.Library:
source_files.append(top_level_path)
prototypes = []
for operation_kind, configurations in self.operations.items():
prototypes.append(
SubstituteTemplate(
"void initialize_all_${operation_kind}_operations(Manifest &manifest);",
{"operation_kind": OperationKindNames[operation_kind]},
)
)

prototypes = []
for operation_kind, configurations in self.operations.items():
prototypes.append(SubstituteTemplate(
"void initialize_all_${operation_kind}_operations(Manifest &manifest);",
{'operation_kind': OperationKindNames[operation_kind]}))
top_level_file.write(
SubstituteTemplate(
self.top_level_prologue, {"prototypes": "\n".join(prototypes)}
)
)

top_level_file.write(SubstituteTemplate(self.top_level_prologue,
{'prototypes': "\n".join(prototypes)}))
top_level_file.write(
SubstituteTemplate(
self.top_level_reserve,
{"operation_count": str(self.operation_count)},
)
)

top_level_file.write(SubstituteTemplate(
self.top_level_reserve, {'operation_count': str(self.operation_count)}))
# for each operation kind, emit initializer for all configurations
for operation_kind, configurations in self.operations.items():

# for each operation kind, emit initializer for all configurations
for operation_kind, configurations in self.operations.items():
with operation_emitters[target](generated_path, operation_kind, self.args) as operation_kind_emitter:
for configuration_name, operations in configurations.items():
operation_kind_emitter.emit(configuration_name, operations)
with operation_emitters[target](
generated_path, operation_kind, self.args
) as operation_kind_emitter:
for configuration_name, operations in configurations.items():
operation_kind_emitter.emit(configuration_name, operations)

source_files += operation_kind_emitter.source_files
source_files += operation_kind_emitter.source_files

top_level_file.write(SubstituteTemplate(
" initialize_all_${operation_kind}_operations(manifest);\n",
{'operation_kind': OperationKindNames[operation_kind]}))
top_level_file.write(
SubstituteTemplate(
" initialize_all_${operation_kind}_operations(manifest);\n",
{"operation_kind": OperationKindNames[operation_kind]},
)
)

top_level_file.write(self.top_level_epilogue)
top_level_file.write(self.top_level_epilogue)

# write the manifest.cmake file containing paths from all targets
manifest_path = os.path.join(generated_path, "manifest.cmake")
with open(manifest_path, "w") as manifest_file:
# write the manifest.cmake file containing paths from all targets
manifest_path = os.path.join(generated_path, "manifest.cmake")
with open(manifest_path, "w") as manifest_file:

target_name = 'cutlass_library_objs'
target_name = "cutlass_library_objs"

target_text = SubstituteTemplate("""cutlass_target_sources(
target_text = SubstituteTemplate(
"""cutlass_target_sources(
${target_name}
BATCH_SOURCES ON
PRIVATE
""", { 'target_name': target_name})
""",
{"target_name": target_name},
)

manifest_file.write(target_text)
manifest_file.write(target_text)

for source_file in source_files:
manifest_file.write(" %s\n" % str(source_file.replace("\\", "/")))
manifest_file.write(")")

#

for source_file in source_files:
manifest_file.write(" %s\n" % str(source_file.replace('\\', '/')))
manifest_file.write(")")
#

###################################################################################################


def GenerateManifest(args, operations, output_dir):
assert isinstance(operations, list)
if len(operations) == 0:
return
op = operations[0]
required_cuda_ver_major = op.required_cuda_ver_major
required_cuda_ver_minor = op.required_cuda_ver_minor

manifest_path = os.path.join(output_dir, "all_%s_%s_operations.cu" % (args.operations, args.type))
f = open(manifest_path, "w")
f.write("""
assert isinstance(operations, list)
if len(operations) == 0:
return
op = operations[0]
required_cuda_ver_major = op.required_cuda_ver_major
required_cuda_ver_minor = op.required_cuda_ver_minor

manifest_path = os.path.join(
output_dir, "all_%s_%s_operations.cu" % (args.operations, args.type)
)
f = open(manifest_path, "w")
f.write(
"""
/*
Generated by generator.py - Do not edit.
*/
@@ -374,24 +437,35 @@ def GenerateManifest(args, operations, output_dir):
namespace cutlass {
namespace library {

""" % (str(required_cuda_ver_major), str(required_cuda_ver_major), str(required_cuda_ver_minor)))

for op in operations:
f.write("void initialize_%s(Manifest &manifest);\n" % op.procedural_name())

f.write("""
"""
% (
str(required_cuda_ver_major),
str(required_cuda_ver_major),
str(required_cuda_ver_minor),
)
)

for op in operations:
f.write("void initialize_%s(Manifest &manifest);\n" % op.procedural_name())

f.write(
"""
void initialize_all_%s_%s_operations(Manifest &manifest) {
""" % (args.operations, args.type))
"""
% (args.operations, args.type)
)

for op in operations:
f.write(" initialize_%s(manifest);\n" % op.procedural_name())
for op in operations:
f.write(" initialize_%s(manifest);\n" % op.procedural_name())

f.write("""
f.write(
"""
}

} // namespace library
} // namespace cutlass

#endif
""")
f.close()
"""
)
f.close()

dnn/src/CMakeLists.txt (+6, -0)

@@ -181,6 +181,12 @@ if(MGE_WITH_CUDA)
gen_cutlass_kimpl(conv2d simt CUTLASS_SOURCES)
gen_cutlass_kimpl(conv2d tensorop8816 CUTLASS_SOURCES)
gen_cutlass_kimpl(conv2d tensorop8832 CUTLASS_SOURCES)
gen_cutlass_kimpl(dwconv2d_fprop simt CUTLASS_SOURCES)
gen_cutlass_kimpl(dwconv2d_fprop tensorop884 CUTLASS_SOURCES)
gen_cutlass_kimpl(dwconv2d_dgrad simt CUTLASS_SOURCES)
gen_cutlass_kimpl(dwconv2d_dgrad tensorop884 CUTLASS_SOURCES)
gen_cutlass_kimpl(dwconv2d_wgrad simt CUTLASS_SOURCES)
gen_cutlass_kimpl(dwconv2d_wgrad tensorop884 CUTLASS_SOURCES)
list(APPEND SOURCES ${CUTLASS_SOURCES})
list(APPEND SOURCES ${CUSOURCES})
endif()


dnn/src/cuda/conv_bias/algo.cpp (+31, -0)

@@ -19,6 +19,7 @@ using namespace cuda;
ConvBiasForwardImpl::AlgoPack::AlgoPack() {
non_cudnn_algos.push_back(&chanwise);
non_cudnn_algos.push_back(&chanwise_small);
non_cudnn_algos.push_back(&depthwise_large_filter);

non_cudnn_algos.push_back(&inplace_matmul);
non_cudnn_algos.push_back(&matmul);
@@ -34,6 +35,7 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() {
std::vector<AlgoBase*> conv_algos;
conv_algos.push_back(&chanwise);
conv_algos.push_back(&chanwise_small);
conv_algos.push_back(&depthwise_large_filter);
conv_algos.push_back(&chanwise8x8x32);
for (auto&& algo : cudnn_convs) {
conv_algos.push_back(&algo);
@@ -92,6 +94,7 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() {
for (auto&& algo : int8_nchw4_dotprod) {
all_algos.push_back(&algo);
}
fill_dwconv_algos();
all_algos.push_back(&int8_chwn4_dotprod);
all_algos.push_back(&fallback_nchw_qs8);
for (size_t i = all_algo_size; i < all_algos.size(); ++i) {
@@ -301,6 +304,34 @@ void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() {
}
#endif

void ConvBiasForwardImpl::AlgoPack::fill_dwconv_algos() {
using AlgoParam = AlgoCutlassConvolutionBase::AlgoParam;
/// preferred algo
f32_implicit_bmm.emplace_back(AlgoParam{64, 128, 8, 32, 64, 8, 1, 1, 1, 2});
f32_implicit_bmm.emplace_back(AlgoParam{128, 128, 8, 32, 64, 8, 1, 1, 1, 2});
f32_implicit_bmm.emplace_back(AlgoParam{128, 64, 8, 64, 32, 8, 1, 1, 1, 2});
f32_implicit_bmm.emplace_back(AlgoParam{128, 32, 8, 64, 32, 8, 1, 1, 1, 2});
f32_implicit_bmm.emplace_back(AlgoParam{32, 128, 8, 32, 64, 8, 1, 1, 1, 2});
f32_implicit_bmm.emplace_back(AlgoParam{64, 64, 8, 32, 64, 8, 1, 1, 1, 2});
f32_implicit_bmm.emplace_back(AlgoParam{32, 64, 8, 32, 64, 8, 1, 1, 1, 2});
f32_implicit_bmm.emplace_back(AlgoParam{32, 32, 8, 32, 32, 8, 1, 1, 1, 2});
f32_implicit_bmm.emplace_back(AlgoParam{64, 32, 8, 64, 32, 8, 1, 1, 1, 2});
for (auto&& algo : f32_implicit_bmm) {
all_algos.push_back(&algo);
}
#if CUDA_VERSION >= 10010
/// preferred algo
f16_implicit_bmm.emplace_back(AlgoParam{64, 128, 32, 32, 32, 32, 8, 8, 4, 2});
f16_implicit_bmm.emplace_back(AlgoParam{128, 128, 32, 32, 32, 32, 8, 8, 4, 2});
f16_implicit_bmm.emplace_back(AlgoParam{128, 256, 32, 64, 64, 32, 8, 8, 4, 2});
f16_implicit_bmm.emplace_back(AlgoParam{128, 64, 32, 32, 32, 32, 8, 8, 4, 2});
f16_implicit_bmm.emplace_back(AlgoParam{64, 64, 32, 32, 32, 32, 8, 8, 4, 2});
for (auto&& algo : f16_implicit_bmm) {
all_algos.push_back(&algo);
}
#endif
}

void ConvBiasForwardImpl::AlgoPack::fill_dp4a_algos() {
using AlgoParam = AlgoInt8NCHW4DotProdImplicitGemm::AlgoParam;
int8_nchw4_dotprod.emplace_back(AlgoParam{128, 128, 32, 64, 32, 32, 1, 1, 4, 2});

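The ten integers in each AlgoParam initializer above are not named in this diff; reading them in the usual CUTLASS order (threadblock tile, warp tile, instruction shape, pipeline stages) is an assumption, though it is consistent with the 8x8x4 instruction shape of the tensorop884 kernels and the 1x1x1 shape of the SIMT FMA kernels. A small sketch under that assumption:

from collections import namedtuple

# Assumed field order for the 10-integer AlgoParam initializers in
# fill_dwconv_algos (CUTLASS-style threadblock/warp/instruction tiles plus
# stages). This naming is an interpretation, not taken from the patch itself.
AlgoParam = namedtuple(
    "AlgoParam",
    "tb_m tb_n tb_k warp_m warp_n warp_k inst_m inst_n inst_k stages",
)

f32_preferred = AlgoParam(64, 128, 8, 32, 64, 8, 1, 1, 1, 2)    # SIMT FMA, 1x1x1
f16_preferred = AlgoParam(64, 128, 32, 32, 32, 32, 8, 8, 4, 2)  # HMMA, 8x8x4

print(f32_preferred.tb_m, f32_preferred.tb_n, f32_preferred.tb_k)        # 64 128 8
print(f16_preferred.inst_m, f16_preferred.inst_n, f16_preferred.inst_k)  # 8 8 4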

dnn/src/cuda/conv_bias/algo.h (+87, -4)

@@ -22,7 +22,6 @@
#include "src/cuda/conv_bias/opr_impl.h"
#include "src/cuda/convolution_helper/parameter.cuh"
#include "src/cuda/cudnn_wrapper.h"
#include "src/cuda/handle.h"

#include <cuda.h>
#include <memory>
@@ -57,6 +56,7 @@ public:
CUDA_CUDNN_CONVBIAS,
CUDA_CHANWISE,
CUDA_CHANWISE_SMALL,
CUDA_DEPTHWISE_LARGE_FILTER,
CUDA_CHANWISE_INT8X8X32,
CUDA_CUDNN_CONV,
CUDA_INPLACE_MATMUL,
@@ -84,7 +84,9 @@ public:
CUDA_IMPLICIT_GEMM_1X1_SASS_NCHW32_IMMA_INT8,
CUDA_IMPLICIT_GEMM_SASS_NCHW64_IMMA_INT4_INT4,
CUDA_IMPLICIT_GEMM_SASS_NCHW64_IMMA_UINT4_INT4,
CUDA_FALLBACK_NCHW_INT4
CUDA_FALLBACK_NCHW_INT4,
CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32,
CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16,
};
using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;

@@ -255,6 +257,26 @@ private:
mutable std::string m_name;
};

class ConvBiasForwardImpl::AlgoDepthwiseLargeFilter final : public AlgoBase {
public:
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& args) const override;
void exec(const ExecArgs& args) const override;

const char* name() const override {
if (m_name.empty()) {
m_name = ConvBiasForward::algo_name<DirectParam>(
"DEPTHWISE_LARGE_FILTER", {});
}
return m_name.c_str();
}
MEGDNN_DECL_ALGO_TYPE(CUDA_DEPTHWISE_LARGE_FILTER)
AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }

private:
mutable std::string m_name;
};

class ConvBiasForwardImpl::AlgoChanwise8x8x32 final : public AlgoBase {
public:
bool is_available(const SizeArgs& args) const override;
@@ -503,6 +525,8 @@ public:
* +----+--- AlgoInt4Int4NHWCIMMAImplicitGemm
* +----+--- AlgoUInt4Int4NHWCIMMAImplicitGemm
* +
* +--- AlgoFloat32NCHWImplicitBatchedGemm
* +--- AlgoFloat16NCHWHMMAImplicitBatchedGemm
*/

/*
@@ -516,7 +540,13 @@ public:

// corresponds to cutlass::conv::ConvType. we hope that algo.h does not
// depend on cutlass headers
enum class ConvType { kConvolution, kBatchConvolution, kLocal, kLocalShare };
enum class ConvType {
kConvolution,
kBatchConvolution,
kLocal,
kLocalShare,
kDepthwiseConvolution,
};

// common parameters for operation selection
struct AlgoParam {
@@ -558,7 +588,8 @@ public:
size_t wo, size_t ph, size_t pw, size_t sh, size_t sw, size_t dh, size_t dw,
const void* alpha, const void* beta, const void* gamma, const void* delta,
const void* theta, const void* threshold, const void* dst_scale,
cudaStream_t stream, const void* extra_param = nullptr) const;
cudaStream_t stream, const void* extra_param = nullptr,
size_t groups = 1) const;

protected:
AlgoParam m_algo_param;
@@ -992,6 +1023,54 @@ private:
};
#endif

class ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm final
: public AlgoCutlassConvolutionBase {
public:
AlgoFloat32NCHWFMAImplicitBatchedGemm(AlgoParam algo_param)
: AlgoCutlassConvolutionBase(algo_param) {
m_name = ConvBias::algo_name<ConvBias::DirectParam>(
ssprintf(
"FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM%s",
m_algo_param.to_string().c_str()),
ConvBias::DirectParam{});
}
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& /* args */) const override {
return 0;
}
void exec(const ExecArgs& args) const override;
const char* name() const override { return m_name.c_str(); };
AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32);

private:
std::string m_name;
};

class ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm final
: public AlgoCutlassConvolutionBase {
public:
AlgoFloat16NCHWHMMAImplicitBatchedGemm(AlgoParam algo_param)
: AlgoCutlassConvolutionBase(algo_param) {
m_name = ConvBias::algo_name<ConvBias::DirectParam>(
ssprintf(
"FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM%s",
m_algo_param.to_string().c_str()),
ConvBias::DirectParam{});
}
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& /* args */) const override {
return 0;
}
void exec(const ExecArgs& args) const override;
const char* name() const override { return m_name.c_str(); };
AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16);

private:
std::string m_name;
};

class ConvBiasForwardImpl::AlgoBFloat16 final : public AlgoBase {
public:
bool is_available(const SizeArgs& args) const override;
@@ -1025,6 +1104,7 @@ public:
AlgoFallbackNCHWQS8 fallback_nchw_qs8;
AlgoChanwise chanwise;
AlgoChanwiseSmall chanwise_small;
AlgoDepthwiseLargeFilter depthwise_large_filter;
AlgoChanwise8x8x32 chanwise8x8x32;
AlgoInplaceMatmul inplace_matmul;
AlgoMatmul matmul;
@@ -1048,6 +1128,8 @@ public:
std::vector<AlgoInt4Int4NHWCIMMAImplicitGemm> int4_int4_nhwc_imma;
std::vector<AlgoUInt4Int4NHWCIMMAImplicitGemm> uint4_int4_nhwc_imma;
#endif
std::vector<AlgoFloat32NCHWFMAImplicitBatchedGemm> f32_implicit_bmm;
std::vector<AlgoFloat16NCHWHMMAImplicitBatchedGemm> f16_implicit_bmm;
AlgoGroupConvGeneral group;
AlgoBFloat16 bfloat16;

@@ -1063,6 +1145,7 @@ private:
#endif
void fill_cudnn_algos();
void fill_dp4a_algos();
void fill_dwconv_algos();
};

} // namespace cuda


dnn/src/cuda/conv_bias/chanwise/depthwise_large_filter.cuh (+179, -0)

@@ -0,0 +1,179 @@
/**
* \file dnn/src/cuda/conv_bias/chanwise/depthwise_large_filter.cuh
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
namespace {
#define DIVUP(x, y) (((x) + (y)-1) / (y))
enum DepthwiseConv2dDirection { DIRECTION_FORWARD, DIRECTION_BACKWARD };

template <typename ThreadConfig_, int oh_, int ow_>
struct OutTileConfig {
using ThreadConfig = ThreadConfig_;
static int const unroll_h = oh_;
static int const unroll_w = ThreadConfig::thread_x * ow_;
static int const unroll_size = unroll_h * unroll_w;
static int const block_h = unroll_h * ThreadConfig::thread_y;
static int const block_w = unroll_w;
};

template <int fh_, int fw_>
struct FilterTileConfig {
static int const unroll_h = fh_;
static int const unroll_w = fw_;
static int const unroll_size = unroll_h * unroll_w;
};

template <int x_, int y_>
struct ThreadConfig {
static int const thread_x = x_;
static_assert((thread_x & (thread_x - 1)) == 0, "thread_x must be pow of 2!");
static int const thread_y = y_;
static int const nr_threads = x_ * y_;
};

template <
typename ldg_dtype, typename ThreadConfig_, typename OutTileConfig_,
typename FilterTileConfig_, int stride_w, int stride_h>
struct ConvTraitInner {
using ThreadConfig = ThreadConfig_;
using OutTileConfig = OutTileConfig_;
using FilterTileConfig = FilterTileConfig_;
using CompType = ldg_dtype;

struct SrcTileConfig {
static int const unroll_h =
OutTileConfig::unroll_h + FilterTileConfig::unroll_h - 1;
static int const unroll_w =
(OutTileConfig::unroll_w - 1) * stride_w + FilterTileConfig::unroll_w;
static int const unroll_size = unroll_h * unroll_w;
};

struct SrcTileCount {
static int const smem_src_h =
(OutTileConfig::block_h - 1) * stride_h + FilterTileConfig::unroll_h;
static int const smem_buff_h = FilterTileConfig::unroll_h;
static int const smem_load_h = smem_src_h + smem_buff_h;
static int const smem_h = smem_load_h + smem_buff_h;
static int const smem_w =
DIVUP((OutTileConfig::block_w - 1) * stride_w +
FilterTileConfig::unroll_w * ThreadConfig::thread_x,
2) *
2;
static int const smem_size = smem_h * smem_w;
static int const load_w =
smem_w > ThreadConfig::nr_threads ? ThreadConfig::nr_threads : smem_w;
static int const load_h = 1;
static int const reg_h = 1;
static int const reg_w = DIVUP(smem_w, load_w);
static bool constexpr check_bounds_h = smem_h % load_h != 0;
static bool constexpr check_bounds_w = smem_w % load_w != 0;
};

struct FilterTileCount {
static int const smem_flt_h = FilterTileConfig::unroll_h;
static int const smem_buff_h = FilterTileConfig::unroll_h;
static int const smem_load_h = smem_flt_h + smem_buff_h;
static int const smem_h = smem_load_h + smem_buff_h;
static int const smem_w = FilterTileConfig::unroll_w * ThreadConfig::thread_x;
static int const smem_size = smem_h * smem_w;
static int const load_w = smem_w > 32 ? 32 : smem_w;
static int const load_h = ThreadConfig::nr_threads / load_w;
static int const reg_h = 1;
static int const reg_w = DIVUP(smem_w, load_w);
static bool constexpr check_bounds_h = smem_h % load_h != 0;
static bool constexpr check_bounds_w = smem_w % load_w != 0;
};
};

#define CHECK_AB_FWD(a, b) \
if (param.out_w > b * 4) { \
if (param.stride_h == 1 && param.stride_w == 1) { \
using FilterTileConfig_ = FilterTileConfig<unroll_fh, a + 2>; \
using ThreadConfig_ = ThreadConfig<4, 32>; \
using OutTileConfig_ = OutTileConfig<ThreadConfig_, unroll_oh, b + 1>; \
using IConvTrait = ConvTraitInner< \
float, ThreadConfig_, OutTileConfig_, FilterTileConfig_, 1, 1>; \
using SrcTileConfig = typename IConvTrait::SrcTileConfig; \
using SrcTileCount = typename IConvTrait::SrcTileCount; \
using FilterTileCount = typename IConvTrait::FilterTileCount; \
\
if (device_prop.regsPerBlock < \
4 * 32 * \
(FilterTileConfig_::unroll_h * \
FilterTileConfig_::unroll_w * 2 + \
SrcTileConfig::unroll_h * SrcTileConfig::unroll_w) || \
device_prop.sharedMemPerBlock < \
static_cast<size_t>( \
(SrcTileCount::smem_size + \
FilterTileCount::smem_size))) { \
return false; \
} \
return true; \
} else if (param.stride_h == 2 && param.stride_w == 2) { \
using FilterTileConfig_ = FilterTileConfig<unroll_fh, a + 2>; \
using ThreadConfig_ = ThreadConfig<4, 32>; \
using OutTileConfig_ = OutTileConfig<ThreadConfig_, unroll_oh, b + 1>; \
using IConvTrait = ConvTraitInner< \
float, ThreadConfig_, OutTileConfig_, FilterTileConfig_, 2, 2>; \
using SrcTileConfig = typename IConvTrait::SrcTileConfig; \
using SrcTileCount = typename IConvTrait::SrcTileCount; \
using FilterTileCount = typename IConvTrait::FilterTileCount; \
\
if (device_prop.regsPerBlock < \
4 * 32 * \
(FilterTileConfig_::unroll_h * \
FilterTileConfig_::unroll_w * 2 + \
SrcTileConfig::unroll_h * SrcTileConfig::unroll_w) || \
device_prop.sharedMemPerBlock < \
static_cast<size_t>( \
(SrcTileCount::smem_size + \
FilterTileCount::smem_size))) { \
return false; \
} \
return true; \
} \
}

#define CHECK_AB_BWD(a, b) \
if (param.out_w > b * 4) { \
using FilterTileConfig_ = FilterTileConfig<unroll_fh, a + 2>; \
using ThreadConfig_ = ThreadConfig<4, 32>; \
using OutTileConfig_ = OutTileConfig<ThreadConfig_, unroll_oh, b + 1>; \
using IConvTrait = ConvTraitInner< \
float, ThreadConfig_, OutTileConfig_, FilterTileConfig_, 1, 1>; \
using SrcTileConfig = typename IConvTrait::SrcTileConfig; \
using SrcTileCount = typename IConvTrait::SrcTileCount; \
using FilterTileCount = typename IConvTrait::FilterTileCount; \
\
if (device_prop.regsPerBlock < \
4 * 32 * \
(FilterTileConfig_::unroll_h * \
FilterTileConfig_::unroll_w * 2 + \
SrcTileConfig::unroll_h * SrcTileConfig::unroll_w) || \
device_prop.sharedMemPerBlock < \
static_cast<size_t>( \
(SrcTileCount::smem_size + FilterTileCount::smem_size))) { \
return false; \
} \
return true; \
}

#define CHECK_A(a, cb) \
if (param.flt_w > a * 4) { \
CHECK_AB_##cb( \
a, \
15) else CHECK_AB_##cb(a, 14) else CHECK_AB_##cb(a, 13) else CHECK_AB_##cb(a, 12) else CHECK_AB_##cb(a, 11) else CHECK_AB_##cb(a, 10) else CHECK_AB_##cb(a, 9) else CHECK_AB_##cb(a, 8) else CHECK_AB_##cb(a, 7) else CHECK_AB_##cb(a, 6) else CHECK_AB_##cb(a, 5) else CHECK_AB_##cb(a, 4) else CHECK_AB_##cb(a, 3) else CHECK_AB_##cb(a, 2) else CHECK_AB_##cb(a, 1) else CHECK_AB_##cb(a, 0) \
}

#define CHECK(cb) \
CHECK_A(6, cb) \
else CHECK_A(4, cb) else CHECK_A(2, cb) else CHECK_A(0, cb)

} // namespace

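The CHECK_* macros above reject a tile configuration when its estimated register and shared-memory footprint exceeds device_prop.regsPerBlock / sharedMemPerBlock. A worked sketch of that arithmetic, transcribing the SrcTileConfig / SrcTileCount / FilterTileCount formulas; the concrete unroll_fh, unroll_oh, a and b values are assumptions (they are chosen by the launcher code, not by this header):

def divup(x, y):
    return (x + y - 1) // y

def depthwise_tile_resources(unroll_fh, unroll_oh, a, b,
                             stride_h=1, stride_w=1, thread_x=4, thread_y=32):
    # FilterTileConfig<unroll_fh, a + 2>
    flt_h, flt_w = unroll_fh, a + 2
    # OutTileConfig<ThreadConfig<4, 32>, unroll_oh, b + 1>
    out_unroll_h = unroll_oh
    out_unroll_w = thread_x * (b + 1)
    out_block_h = out_unroll_h * thread_y
    out_block_w = out_unroll_w
    # SrcTileConfig
    src_unroll_h = out_unroll_h + flt_h - 1
    src_unroll_w = (out_unroll_w - 1) * stride_w + flt_w
    # SrcTileCount: smem_h = smem_src_h + 2 * smem_buff_h
    src_smem_h = (out_block_h - 1) * stride_h + 3 * flt_h
    src_smem_w = divup((out_block_w - 1) * stride_w + flt_w * thread_x, 2) * 2
    # FilterTileCount: smem_h = 3 * unroll_h, smem_w = unroll_w * thread_x
    flt_smem = (3 * flt_h) * (flt_w * thread_x)
    # Quantities the macro compares against device_prop.regsPerBlock and
    # device_prop.sharedMemPerBlock before accepting this configuration.
    reg_estimate = 4 * 32 * (flt_h * flt_w * 2 + src_unroll_h * src_unroll_w)
    smem_elems = src_smem_h * src_smem_w + flt_smem
    return reg_estimate, smem_elems

# Example with assumed tile parameters (unroll_fh=3, unroll_oh=2, a=6, b=15):
print(depthwise_tile_resources(3, 2, 6, 15))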
dnn/src/cuda/conv_bias/chanwise/depthwise_large_filter_algo.cuh (+914, -0)

@@ -0,0 +1,914 @@
/**
* \file dnn/src/cuda/conv_bias/chanwise/depthwise_large_filter_algo.cuh
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include "depthwise_large_filter.cuh"
#include "src/cuda/cuda_shfl_compat.cuh"

namespace {

template <
typename T, DepthwiseConv2dDirection kDirection, typename ThreadConfig_,
typename TileCount_>
struct Global2SharedMem {
using TileCount = TileCount_;
using ThreadConfig = ThreadConfig_;
T reg[TileCount::reg_w];
const int tidx = threadIdx.x;
const int tidy = threadIdx.y;
const int tid = tidy * ThreadConfig::thread_x + tidx;
const int gl_load_y = tid / TileCount::load_w;
const int gl_load_x = tid - gl_load_y * TileCount::load_w;
const bool is_fwd = (kDirection == DIRECTION_FORWARD);
int w_offset;

T* smem;
int stride;
int start_h, start_w, bound_h, bound_w, ring_smem_h, ring_src_h;
// just used in backward src data
int stride_h, stride_w;
const T* g_ptr;

__device__ __forceinline__ Global2SharedMem(
T* smem_, int stride_, int s_h, int s_w, int b_h, int b_w, int stride_h_,
int stride_w_);

__device__ __forceinline__ void first_copy();
__device__ __forceinline__ void copy();
__device__ __forceinline__ void commit();
__device__ __forceinline__ void iter_forward();
__device__ __forceinline__ T* sh_ptr(int y, int x) {
return &smem[y * TileCount::smem_w + x];
}

__device__ __forceinline__ T* sh_ptr_as_copy_t(int y, int x) {
return reinterpret_cast<T*>(sh_ptr(y, x));
}
};

template <
typename ldg_dtype, DepthwiseConv2dDirection kDirection, typename ThreadConfig_,
typename OutTileConfig_, typename FilterTileConfig_, int stride_w, int stride_h>
struct ConvTrait {
using ThreadConfig = ThreadConfig_;
using OutTileConfig = OutTileConfig_;
using FilterTileConfig = FilterTileConfig_;
using CompType = ldg_dtype;

using CI = ConvTraitInner<
ldg_dtype, ThreadConfig_, OutTileConfig_, FilterTileConfig_, stride_w,
stride_h>;
using SrcTileConfig = typename CI::SrcTileConfig;
using SrcTileCount = typename CI::SrcTileCount;
using FilterTileCount = typename CI::FilterTileCount;

using SrcGlobal2ShareVisitor = Global2SharedMem<
CompType, DepthwiseConv2dDirection::DIRECTION_FORWARD, ThreadConfig,
SrcTileCount>;
using FilterGlobal2ShareVisitor =
Global2SharedMem<CompType, kDirection, ThreadConfig, FilterTileCount>;
};

template <
typename T, DepthwiseConv2dDirection kDirection, typename ThreadConfig_,
typename TileCount_>
__device__ __forceinline__
Global2SharedMem<T, kDirection, ThreadConfig_, TileCount_>::Global2SharedMem(
T* smem_, int stride_, int s_h, int s_w, int b_h, int b_w, int stride_h_,
int stride_w_)
: smem(smem_),
stride(stride_),
start_h(s_h),
start_w(s_w),
bound_h(b_h),
bound_w(b_w),
ring_smem_h(TileCount::smem_load_h),
stride_h(stride_h_),
stride_w(stride_w_) {
if (is_fwd) {
ring_src_h = s_h + TileCount::smem_load_h;
w_offset = 0;
} else {
ring_src_h = s_h - 1;
w_offset = TileCount::smem_w - b_w;
// stride_h and stride_w just used in backward src data.
stride_h = stride_w = 1;
}
}

template <
typename T, DepthwiseConv2dDirection kDirection, typename ThreadConfig_,
typename TileCount_>
__device__ __forceinline__ void Global2SharedMem<
T, kDirection, ThreadConfig_, TileCount_>::first_copy() {
static int const load_w = TileCount::smem_w > 32 ? 32 : TileCount::smem_w;
static int const load_h = ThreadConfig::nr_threads / load_w;
static int const h_per_thread = DIVUP(TileCount::smem_load_h, load_h);
static int const w_per_thread = DIVUP(TileCount::smem_w, load_w);
static bool constexpr check_bounds_h = TileCount::smem_load_h % load_h != 0;
static bool constexpr check_bounds_w = TileCount::smem_w % load_w != 0;
const int y_base_idx = tid / load_w;
const int x_base_idx = tid - y_base_idx * load_w;
#pragma unroll
for (int i = 0; i < h_per_thread; ++i) {
int smem_h_idx = y_base_idx + i * load_h;
int src_h_idx;
if (is_fwd) {
src_h_idx = start_h + smem_h_idx;
} else {
src_h_idx = start_h + TileCount::smem_load_h - smem_h_idx - 1;
}
if (check_bounds_h && smem_h_idx >= TileCount::smem_load_h)
continue;
#pragma unroll
for (int j = 0; j < w_per_thread; ++j) {
int smem_w_idx = x_base_idx + j * load_w;
int src_w_idx;
if (is_fwd) {
src_w_idx = start_w + smem_w_idx;
} else {
src_w_idx = start_w + TileCount::smem_w - w_offset - smem_w_idx - 1;
}
if (check_bounds_w && smem_w_idx >= TileCount::smem_w)
continue;
T val = 0.0f;
if (src_h_idx >= 0 && src_h_idx < bound_h && src_w_idx >= 0 &&
src_w_idx < bound_w &&
((is_fwd && src_h_idx % stride_h == 0 && src_w_idx % stride_w == 0) ||
(!is_fwd && TileCount::smem_load_h - smem_h_idx - 1 >= 0 &&
TileCount::smem_w - w_offset - smem_w_idx - 1 >= 0))) {
val = g_ptr[src_h_idx / stride_h * stride + src_w_idx / stride_w];
}
*(sh_ptr_as_copy_t(smem_h_idx, smem_w_idx)) = val;
}
}
}

template <
typename T, DepthwiseConv2dDirection kDirection, typename ThreadConfig_,
typename TileCount_>
__device__ __forceinline__ void Global2SharedMem<
T, kDirection, ThreadConfig_, TileCount_>::copy() {
#pragma unroll
for (int j = 0; j < TileCount::reg_w; ++j) {
int smem_w_idx = gl_load_x + j * TileCount::load_w;
int src_w_idx;
if (is_fwd) {
src_w_idx = start_w + smem_w_idx;
} else {
src_w_idx = start_w + TileCount::smem_w - w_offset - smem_w_idx - 1;
}
if (TileCount::check_bounds_w && smem_w_idx >= TileCount::smem_w)
continue;
T val = 0.0f;
if (ring_src_h >= 0 && ring_src_h < bound_h && src_w_idx >= 0 &&
src_w_idx < bound_w &&
((is_fwd && ring_src_h % stride_h == 0 && src_w_idx % stride_w == 0) ||
(!is_fwd && TileCount::smem_w - w_offset - smem_w_idx - 1 >= 0))) {
val = g_ptr[ring_src_h / stride_h * stride + src_w_idx / stride_w];
}
reg[j] = val;
}
}

template <
typename T, DepthwiseConv2dDirection kDirection, typename ThreadConfig_,
typename TileCount_>
__device__ __forceinline__ void Global2SharedMem<
T, kDirection, ThreadConfig_, TileCount_>::commit() {
#pragma unroll
for (int j = 0; j < TileCount::reg_w; ++j) {
int smem_w_idx = gl_load_x + j * TileCount::load_w;

if (TileCount::check_bounds_w && smem_w_idx >= TileCount::smem_w)
continue;

*(sh_ptr_as_copy_t(ring_smem_h, smem_w_idx)) = reg[j];
}
}

template <
typename T, DepthwiseConv2dDirection kDirection, typename ThreadConfig_,
typename TileCount_>
__device__ __forceinline__ void Global2SharedMem<
T, kDirection, ThreadConfig_, TileCount_>::iter_forward() {
if (is_fwd) {
ring_src_h++;
} else {
ring_src_h--;
}
ring_smem_h = (ring_smem_h + 1) % TileCount::smem_h;
}

// CUDA kernel to compute the depthwise convolution forward pass in NCHW format,
// tailored for small images up to 32x32. Stride and depth multiplier must be 1.
// Padding must be 'SAME', which allows to reuse the index computation. Only
// use this kernel if CanLaunchDepthwiseConv2dGPU(args) returns true.
// Tiles of the input and filter tensors are loaded into shared memory before
// performing the convolution. Each thread handles two elements per iteration,
// one each in the lower and upper half of a tile.
// Backprop input direction is the same as forward direction with the filter
// rotated by 180°.
#if CUDA_VERSION >= 9000
template <typename ConvTrait, DepthwiseConv2dDirection kDirection>
__global__ void DepthwiseConv2dGPUKernelNCHW(
const Param param, const __half* input, const __half* filter, __half* output) {
using T = __half;
using T2 = __half2;
using ThreadConfig = typename ConvTrait::ThreadConfig;
using SrcTileConfig = typename ConvTrait::SrcTileConfig;
using FilterTileConfig = typename ConvTrait::FilterTileConfig;
using OutTileConfig = typename ConvTrait::OutTileConfig;
using SrcTileCount = typename ConvTrait::SrcTileCount;
using FilterTileCount = typename ConvTrait::FilterTileCount;
using SrcGlobal2ShareVisitor = typename ConvTrait::SrcGlobal2ShareVisitor;
using FilterGlobal2ShareVisitor = typename ConvTrait::FilterGlobal2ShareVisitor;
const bool is_fwd = (kDirection == DepthwiseConv2dDirection::DIRECTION_FORWARD);

int off_ochannel = blockIdx.x, off_obw = blockIdx.y, off_obh = blockIdx.z,
off_oh = threadIdx.y, off_ow = threadIdx.x;

constexpr int t2_src_unroll_w = (SrcTileConfig::unroll_w + 3) / 2;
constexpr int t2_flt_unroll_w = (FilterTileConfig::unroll_w + 2) / 2;
constexpr int t2_out_unroll_w = (OutTileConfig::unroll_w + 1) / 2;

extern __shared__ __align__(8) unsigned char smem[];
static_assert(sizeof(T) <= 8, "Insufficient alignment detected");
T* smem_src = reinterpret_cast<T*>(smem);
T* smem_flt = reinterpret_cast<T*>(&smem_src[SrcTileCount::smem_size]);
int stride_h = is_fwd ? param.stride_h : 1;
int stride_w = is_fwd ? param.stride_w : 1;

int off_ichannel = off_ochannel / param.chl_mul,
off_fchannel = off_ichannel % param.src_chl,
out_start_h = off_obh * OutTileConfig::block_h,
out_start_w = off_obw * OutTileConfig::block_w,
src_start_h = out_start_h * stride_h - param.pad_h,
src_start_w = out_start_w * stride_w - param.pad_w,
out_base_h_idx = out_start_h + off_oh * OutTileConfig::unroll_h;

T* smem_src_ptr = smem_src + off_ow * FilterTileConfig::unroll_w;
T* smem_flt_ptr = smem_flt + off_ow * FilterTileConfig::unroll_w;

T* out_base_ptr = output + off_ochannel * param.out_h * param.out_w;

SrcGlobal2ShareVisitor gl2sh_src = {
smem_src,
param.src_w,
is_fwd ? src_start_h
: src_start_h - (param.out_h / 2 + param.flt_h / 2 - param.pad_h -
param.src_h * param.stride_h / 2),
is_fwd ? src_start_w
: src_start_w - (param.out_w / 2 + param.flt_w / 2 - param.pad_w -
param.src_w * param.stride_w / 2),
is_fwd ? param.src_h : param.src_h * param.stride_h,
is_fwd ? param.src_w : param.src_w * param.stride_w,
is_fwd ? 1 : param.stride_h,
is_fwd ? 1 : param.stride_w};

FilterGlobal2ShareVisitor gl2sh_flt = {smem_flt,
param.flt_w,
is_fwd ? 0 : param.flt_h - 2,
0,
param.flt_h,
param.flt_w,
1,
1};

gl2sh_src.g_ptr = input + off_ichannel * param.src_h * param.src_w;
gl2sh_flt.g_ptr = filter + off_fchannel * param.flt_h * param.flt_w;

gl2sh_src.first_copy();
gl2sh_flt.first_copy();

__syncthreads();

T2 reg_src[SrcTileConfig::unroll_h * t2_src_unroll_w],
reg_flt[2][FilterTileConfig::unroll_h * t2_flt_unroll_w];

T2 sum[OutTileConfig::unroll_size] = {{0.0, 0.0}};

for (int fh = 0; fh < param.flt_h; fh += FilterTileConfig::unroll_h) {
gl2sh_src.copy();
gl2sh_flt.copy();
#pragma unroll
for (int s_h = 0; s_h < SrcTileConfig::unroll_h; ++s_h) {
#pragma unroll
for (int s_w = 0; s_w < t2_src_unroll_w; ++s_w) {
int src_offset = (off_oh * stride_h + fh + s_h) % SrcTileCount::smem_h *
SrcTileCount::smem_w +
s_w * 2;
reg_src[s_h * t2_src_unroll_w + s_w] =
*reinterpret_cast<T2*>(smem_src_ptr + src_offset);
}
}

#pragma unroll
for (int f_h = 0; f_h < FilterTileConfig::unroll_h; ++f_h) {
#pragma unroll
for (int f_w = 0; f_w < t2_flt_unroll_w - 1; ++f_w) {
int flt_offset =
(fh + f_h) % FilterTileCount::smem_h * FilterTileCount::smem_w +
f_w * 2;
reg_flt[0][f_h * t2_flt_unroll_w + f_w] =
*reinterpret_cast<T2*>(smem_flt_ptr + flt_offset);
if (f_w > 0) {
reg_flt[1][f_h * t2_flt_unroll_w + f_w] =
T2{reg_flt[0][f_h * t2_flt_unroll_w + f_w - 1].y,
reg_flt[0][f_h * t2_flt_unroll_w + f_w].x};
} else {
reg_flt[1][f_h * t2_flt_unroll_w + f_w] =
T2{0.0, reg_flt[0][f_h * t2_flt_unroll_w + f_w].x};
}
}
reg_flt[0][f_h * t2_flt_unroll_w + t2_flt_unroll_w - 1] = T2{0.0, 0.0};
reg_flt[1][f_h * t2_flt_unroll_w + t2_flt_unroll_w - 1] =
T2{reg_flt[0][f_h * t2_flt_unroll_w + t2_flt_unroll_w - 2].y, 0.0};
}

#pragma unroll
for (int inner_fh = 0; inner_fh < FilterTileConfig::unroll_h; ++inner_fh) {
#pragma unroll
for (int oh = 0; oh < OutTileConfig::unroll_h; ++oh) {
#pragma unroll
for (int fw = 0; fw < t2_flt_unroll_w; ++fw) {
#pragma unroll
for (int ow = 0; ow < OutTileConfig::unroll_w; ++ow) {
sum[oh * t2_out_unroll_w + ow] = megdnn::cuda::fma2(
reg_flt[ow * stride_w % 2]
[inner_fh * t2_flt_unroll_w + fw],
reg_src[(inner_fh + oh) * t2_src_unroll_w + fw +
ow * stride_w / 2],
sum[oh * t2_out_unroll_w + ow]);
}
}
}
}

__syncthreads();
gl2sh_src.commit();
gl2sh_flt.commit();
gl2sh_src.iter_forward();
gl2sh_flt.iter_forward();
__syncthreads();
}

for (int o = 0; o < OutTileConfig::unroll_size; ++o) {
for (int i = 1; i < ThreadConfig::thread_x; i = i << 1) {
sum[o] = megdnn::cuda::hadd2(sum[o], __shfl_xor(sum[o], i, 32));
}
}

if (threadIdx.x == 0) {
#pragma unroll
for (int i = 0; i < OutTileConfig::unroll_h; ++i) {
int out_h_idx = out_base_h_idx + i;
if (out_h_idx < param.out_h) {
#pragma unroll
for (int j = 0; j < OutTileConfig::unroll_w; ++j) {
int out_w_idx = out_start_w + j;
if (out_w_idx >= param.out_w)
return;
out_base_ptr[out_h_idx * param.out_w + out_w_idx] = __float2half(
__half2float(sum[i * OutTileConfig::unroll_w + j].x) +
__half2float(sum[i * OutTileConfig::unroll_w + j].y));
}
}
}
}
}

template <typename ConvTrait, DepthwiseConv2dDirection kDirection>
__global__ void DepthwiseConv2dGPUKernelNCHWC32(
const Param param, const __half* input, const __half* filter, __half* output) {
using T = __half;
using T2 = __half2;
using ThreadConfig = typename ConvTrait::ThreadConfig;
using SrcTileConfig = typename ConvTrait::SrcTileConfig;
using FilterTileConfig = typename ConvTrait::FilterTileConfig;
using OutTileConfig = typename ConvTrait::OutTileConfig;
using SrcTileCount = typename ConvTrait::SrcTileCount;
using FilterTileCount = typename ConvTrait::FilterTileCount;
using SrcGlobal2ShareVisitor = typename ConvTrait::SrcGlobal2ShareVisitor;
using FilterGlobal2ShareVisitor = typename ConvTrait::FilterGlobal2ShareVisitor;
const bool is_fwd = (kDirection == DepthwiseConv2dDirection::DIRECTION_FORWARD);

int off_ochannel = blockIdx.x, off_obw = blockIdx.y, off_obh = blockIdx.z,
off_oh = threadIdx.y, off_ow = threadIdx.x;

constexpr int t2_src_unroll_w = (SrcTileConfig::unroll_w + 3) / 2;
constexpr int t2_flt_unroll_w = (FilterTileConfig::unroll_w + 2) / 2;
constexpr int t2_out_unroll_w = (OutTileConfig::unroll_w + 1) / 2;
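    // the fp16 path packs two horizontally adjacent values into one __half2, so the
    // unroll widths above are roughly halved; reg_flt[1] (filled below) holds a
    // one-element-shifted copy of the filter row used when ow * stride_w is odd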

extern __shared__ __align__(8) unsigned char smem[];
static_assert(sizeof(T) <= 8, "Insufficient alignment detected");
T* smem_src = reinterpret_cast<T*>(smem);
T* smem_flt = reinterpret_cast<T*>(&smem_src[SrcTileCount::smem_size]);
int stride_h = is_fwd ? param.stride_h : 1;
int stride_w = is_fwd ? param.stride_w : 1;

int off_ichannel = off_ochannel / param.chl_mul,
off_fchannel = off_ichannel % param.src_chl,
out_start_h = off_obh * OutTileConfig::block_h,
out_start_w = off_obw * OutTileConfig::block_w,
src_start_h = out_start_h * stride_h - param.pad_h,
src_start_w = out_start_w * stride_w - param.pad_w,
out_base_h_idx = out_start_h + off_oh * OutTileConfig::unroll_h;

T* smem_src_ptr = smem_src + off_ow * FilterTileConfig::unroll_w;
T* smem_flt_ptr = smem_flt + off_ow * FilterTileConfig::unroll_w;

T* out_base_ptr = output + off_ochannel * param.out_h * param.out_w;

SrcGlobal2ShareVisitor gl2sh_src = {
smem_src,
param.src_w,
is_fwd ? src_start_h
: src_start_h - (param.out_h / 2 + param.flt_h / 2 - param.pad_h -
param.src_h * param.stride_h / 2),
is_fwd ? src_start_w
: src_start_w - (param.out_w / 2 + param.flt_w / 2 - param.pad_w -
param.src_w * param.stride_w / 2),
is_fwd ? param.src_h : param.src_h * param.stride_h,
is_fwd ? param.src_w : param.src_w * param.stride_w,
is_fwd ? 1 : param.stride_h,
is_fwd ? 1 : param.stride_w};

FilterGlobal2ShareVisitor gl2sh_flt = {smem_flt,
param.flt_w,
is_fwd ? 0 : param.flt_h - 2,
0,
param.flt_h,
param.flt_w,
1,
1};

gl2sh_src.g_ptr = input + off_ichannel * param.src_h * param.src_w;
gl2sh_flt.g_ptr = filter + off_fchannel * param.flt_h * param.flt_w;

gl2sh_src.first_copy();
gl2sh_flt.first_copy();

__syncthreads();

T2 reg_src[SrcTileConfig::unroll_h * t2_src_unroll_w],
reg_flt[2][FilterTileConfig::unroll_h * t2_flt_unroll_w];

float2 sum[OutTileConfig::unroll_size] = {{0.0, 0.0}};

for (int fh = 0; fh < param.flt_h; fh += FilterTileConfig::unroll_h) {
gl2sh_src.copy();
gl2sh_flt.copy();
#pragma unroll
for (int s_h = 0; s_h < SrcTileConfig::unroll_h; ++s_h) {
#pragma unroll
for (int s_w = 0; s_w < t2_src_unroll_w; ++s_w) {
int src_offset = (off_oh * stride_h + fh + s_h) % SrcTileCount::smem_h *
SrcTileCount::smem_w +
s_w * 2;
reg_src[s_h * t2_src_unroll_w + s_w] =
*reinterpret_cast<T2*>(smem_src_ptr + src_offset);
}
}

#pragma unroll
for (int f_h = 0; f_h < FilterTileConfig::unroll_h; ++f_h) {
#pragma unroll
for (int f_w = 0; f_w < t2_flt_unroll_w - 1; ++f_w) {
int flt_offset =
(fh + f_h) % FilterTileCount::smem_h * FilterTileCount::smem_w +
f_w * 2;
reg_flt[0][f_h * t2_flt_unroll_w + f_w] =
*reinterpret_cast<T2*>(smem_flt_ptr + flt_offset);
if (f_w > 0) {
reg_flt[1][f_h * t2_flt_unroll_w + f_w] =
T2{reg_flt[0][f_h * t2_flt_unroll_w + f_w - 1].y,
reg_flt[0][f_h * t2_flt_unroll_w + f_w].x};
} else {
reg_flt[1][f_h * t2_flt_unroll_w + f_w] =
T2{0.0, reg_flt[0][f_h * t2_flt_unroll_w + f_w].x};
}
}
reg_flt[0][f_h * t2_flt_unroll_w + t2_flt_unroll_w - 1] = T2{0.0, 0.0};
reg_flt[1][f_h * t2_flt_unroll_w + t2_flt_unroll_w - 1] =
T2{reg_flt[0][f_h * t2_flt_unroll_w + t2_flt_unroll_w - 2].y, 0.0};
}

#pragma unroll
for (int inner_fh = 0; inner_fh < FilterTileConfig::unroll_h; ++inner_fh) {
#pragma unroll
for (int oh = 0; oh < OutTileConfig::unroll_h; ++oh) {
#pragma unroll
for (int fw = 0; fw < t2_flt_unroll_w; ++fw) {
#pragma unroll
for (int ow = 0; ow < OutTileConfig::unroll_w; ++ow) {
sum[oh * t2_out_unroll_w + ow] = megdnn::cuda::fma2(
reg_flt[ow * stride_w % 2]
[inner_fh * t2_flt_unroll_w + fw],
reg_src[(inner_fh + oh) * t2_src_unroll_w + fw +
ow * stride_w / 2],
sum[oh * t2_out_unroll_w + ow]);
}
}
}
}

__syncthreads();
gl2sh_src.commit();
gl2sh_flt.commit();
gl2sh_src.iter_forward();
gl2sh_flt.iter_forward();
__syncthreads();
}

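    // this variant keeps the accumulator in fp32 (float2); reduce the .x and .y
    // components separately across the warp before lane 0 writes the output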
for (int o = 0; o < OutTileConfig::unroll_size; ++o) {
for (int i = 1; i < ThreadConfig::thread_x; i = i << 1) {
sum[o].x += __shfl_xor(sum[o].x, i, 32);
sum[o].y += __shfl_xor(sum[o].y, i, 32);
}
}

if (threadIdx.x == 0) {
#pragma unroll
for (int i = 0; i < OutTileConfig::unroll_h; ++i) {
int out_h_idx = out_base_h_idx + i;
if (out_h_idx < param.out_h) {
#pragma unroll
for (int j = 0; j < OutTileConfig::unroll_w; ++j) {
int out_w_idx = out_start_w + j;
if (out_w_idx >= param.out_w)
return;
out_base_ptr[out_h_idx * param.out_w + out_w_idx] = __float2half(
sum[i * OutTileConfig::unroll_w + j].x +
sum[i * OutTileConfig::unroll_w + j].y);
}
}
}
}
}
#endif

template <typename ConvTrait, DepthwiseConv2dDirection kDirection>
__global__ void DepthwiseConv2dGPUKernelNCHW(
const Param param, const float* input, const float* filter, float* output) {
using T = float;
using T2 = float2;
using ThreadConfig = typename ConvTrait::ThreadConfig;
using SrcTileConfig = typename ConvTrait::SrcTileConfig;
using FilterTileConfig = typename ConvTrait::FilterTileConfig;
using OutTileConfig = typename ConvTrait::OutTileConfig;
using SrcTileCount = typename ConvTrait::SrcTileCount;
using FilterTileCount = typename ConvTrait::FilterTileCount;
using SrcGlobal2ShareVisitor = typename ConvTrait::SrcGlobal2ShareVisitor;
using FilterGlobal2ShareVisitor = typename ConvTrait::FilterGlobal2ShareVisitor;
const bool is_fwd = (kDirection == DepthwiseConv2dDirection::DIRECTION_FORWARD);

int off_ochannel = blockIdx.x, off_obw = blockIdx.y, off_obh = blockIdx.z,
off_oh = threadIdx.y, off_ow = threadIdx.x;

extern __shared__ __align__(8) unsigned char smem[];
static_assert(sizeof(T) <= 8, "Insufficient alignment detected");
T* smem_src = reinterpret_cast<T*>(smem);
T* smem_flt = reinterpret_cast<T*>(&smem_src[SrcTileCount::smem_size]);
int stride_h = is_fwd ? param.stride_h : 1;
int stride_w = is_fwd ? param.stride_w : 1;

int off_ichannel = off_ochannel / param.chl_mul,
off_fchannel = off_ichannel % param.src_chl,
out_start_h = off_obh * OutTileConfig::block_h,
out_start_w = off_obw * OutTileConfig::block_w,
src_start_h = out_start_h * stride_h - param.pad_h,
src_start_w = out_start_w * stride_w - param.pad_w,
out_base_h_idx = out_start_h + off_oh * OutTileConfig::unroll_h;

T* smem_src_ptr = smem_src + off_ow * FilterTileConfig::unroll_w;
T* smem_flt_ptr = smem_flt + off_ow * FilterTileConfig::unroll_w;

T* out_base_ptr = output + off_ochannel * param.out_h * param.out_w;

SrcGlobal2ShareVisitor gl2sh_src = {
smem_src,
param.src_w,
is_fwd ? src_start_h
: src_start_h - (param.out_h / 2 + param.flt_h / 2 - param.pad_h -
param.src_h * param.stride_h / 2),
is_fwd ? src_start_w
: src_start_w - (param.out_w / 2 + param.flt_w / 2 - param.pad_w -
param.src_w * param.stride_w / 2),
is_fwd ? param.src_h : param.src_h * param.stride_h,
is_fwd ? param.src_w : param.src_w * param.stride_w,
is_fwd ? 1 : param.stride_h,
is_fwd ? 1 : param.stride_w};

FilterGlobal2ShareVisitor gl2sh_flt = {smem_flt,
param.flt_w,
is_fwd ? 0 : param.flt_h - 2,
0,
param.flt_h,
param.flt_w,
1,
1};

gl2sh_src.g_ptr = input + off_ichannel * param.src_h * param.src_w;
gl2sh_flt.g_ptr = filter + off_fchannel * param.flt_h * param.flt_w;

gl2sh_src.first_copy();
gl2sh_flt.first_copy();

__syncthreads();

T reg_src[SrcTileConfig::unroll_h * SrcTileConfig::unroll_w],
reg_flt[FilterTileConfig::unroll_h * FilterTileConfig::unroll_w];

T sum[OutTileConfig::unroll_size] = {0.0};

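    // software-pipelined loop over filter rows: copy() prefetches the next src/filter
    // slice from global memory while the slice already resident in shared memory is
    // consumed from registers; commit() publishes the prefetched data after the
    // barrier and iter_forward() advances the global pointers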
for (int fh = 0; fh < param.flt_h; fh += FilterTileConfig::unroll_h) {
gl2sh_src.copy();
gl2sh_flt.copy();
#pragma unroll
for (int s_h = 0; s_h < SrcTileConfig::unroll_h; ++s_h) {
#pragma unroll
for (int s_w = 0; s_w < SrcTileConfig::unroll_w; ++s_w) {
reg_src[s_h * SrcTileConfig::unroll_w + s_w] = smem_src_ptr
[(off_oh * stride_h + fh + s_h) % SrcTileCount::smem_h *
SrcTileCount::smem_w +
s_w];
}
}

#pragma unroll
for (int f_h = 0; f_h < FilterTileConfig::unroll_h; ++f_h) {
#pragma unroll
for (int f_w = 0; f_w < FilterTileConfig::unroll_w; ++f_w) {
reg_flt[f_h * FilterTileConfig::unroll_w + f_w] = smem_flt_ptr
[(fh + f_h) % FilterTileCount::smem_h *
FilterTileCount::smem_w +
f_w];
}
}

#pragma unroll
for (int inner_fh = 0; inner_fh < FilterTileConfig::unroll_h; ++inner_fh) {
#pragma unroll
for (int oh = 0; oh < OutTileConfig::unroll_h; ++oh) {
#pragma unroll
for (int fw = 0; fw < FilterTileConfig::unroll_w; ++fw) {
#pragma unroll
for (int ow = 0; ow < OutTileConfig::unroll_w; ++ow) {
sum[oh * OutTileConfig::unroll_w + ow] +=
reg_flt[inner_fh * FilterTileConfig::unroll_w + fw] *
reg_src[(inner_fh + oh) * SrcTileConfig::unroll_w + fw +
ow * stride_w];
}
}
}
}

__syncthreads();
gl2sh_src.commit();
gl2sh_flt.commit();
gl2sh_src.iter_forward();
gl2sh_flt.iter_forward();
__syncthreads();
}

for (int o = 0; o < OutTileConfig::unroll_size; ++o) {
for (int i = 1; i < ThreadConfig::thread_x; i = i << 1) {
sum[o] += __shfl_xor(sum[o], i, 32);
}
}

if (threadIdx.x == 0) {
#pragma unroll
for (int i = 0; i < OutTileConfig::unroll_h; ++i) {
int out_h_idx = out_base_h_idx + i;
if (out_h_idx < param.out_h) {
#pragma unroll
for (int j = 0; j < OutTileConfig::unroll_w; ++j) {
int out_w_idx = out_start_w + j;
if (out_w_idx >= param.out_w)
return;
out_base_ptr[out_h_idx * param.out_w + out_w_idx] =
sum[i * OutTileConfig::unroll_w + j];
}
}
}
}
}

template <typename ConvTrait, DepthwiseConv2dDirection kDirection>
__global__ void DepthwiseConv2dGPUKernelNCHWC32(
const Param param, const float* input, const float* filter, float* output) {
using T = float;
using T2 = float2;
using ThreadConfig = typename ConvTrait::ThreadConfig;
using SrcTileConfig = typename ConvTrait::SrcTileConfig;
using FilterTileConfig = typename ConvTrait::FilterTileConfig;
using OutTileConfig = typename ConvTrait::OutTileConfig;
using SrcTileCount = typename ConvTrait::SrcTileCount;
using FilterTileCount = typename ConvTrait::FilterTileCount;
using SrcGlobal2ShareVisitor = typename ConvTrait::SrcGlobal2ShareVisitor;
using FilterGlobal2ShareVisitor = typename ConvTrait::FilterGlobal2ShareVisitor;
const bool is_fwd = (kDirection == DepthwiseConv2dDirection::DIRECTION_FORWARD);

int off_ochannel = blockIdx.x, off_obw = blockIdx.y, off_obh = blockIdx.z,
off_oh = threadIdx.y, off_ow = threadIdx.x;

extern __shared__ __align__(8) unsigned char smem[];
static_assert(sizeof(T) <= 8, "Insufficient alignment detected");
T* smem_src = reinterpret_cast<T*>(smem);
T* smem_flt = reinterpret_cast<T*>(&smem_src[SrcTileCount::smem_size]);
int stride_h = is_fwd ? param.stride_h : 1;
int stride_w = is_fwd ? param.stride_w : 1;

int off_ichannel = off_ochannel / param.chl_mul,
off_fchannel = off_ichannel % param.src_chl,
out_start_h = off_obh * OutTileConfig::block_h,
out_start_w = off_obw * OutTileConfig::block_w,
src_start_h = out_start_h * stride_h - param.pad_h,
src_start_w = out_start_w * stride_w - param.pad_w,
out_base_h_idx = out_start_h + off_oh * OutTileConfig::unroll_h;

T* smem_src_ptr = smem_src + off_ow * FilterTileConfig::unroll_w;
T* smem_flt_ptr = smem_flt + off_ow * FilterTileConfig::unroll_w;

T* out_base_ptr = output + off_ochannel * param.out_h * param.out_w;

SrcGlobal2ShareVisitor gl2sh_src = {
smem_src,
param.src_w,
is_fwd ? src_start_h
: src_start_h - (param.out_h / 2 + param.flt_h / 2 - param.pad_h -
param.src_h * param.stride_h / 2),
is_fwd ? src_start_w
: src_start_w - (param.out_w / 2 + param.flt_w / 2 - param.pad_w -
param.src_w * param.stride_w / 2),
is_fwd ? param.src_h : param.src_h * param.stride_h,
is_fwd ? param.src_w : param.src_w * param.stride_w,
is_fwd ? 1 : param.stride_h,
is_fwd ? 1 : param.stride_w};

FilterGlobal2ShareVisitor gl2sh_flt = {smem_flt,
param.flt_w,
is_fwd ? 0 : param.flt_h - 2,
0,
param.flt_h,
param.flt_w,
1,
1};

gl2sh_src.g_ptr = input + off_ichannel * param.src_h * param.src_w;
gl2sh_flt.g_ptr = filter + off_fchannel * param.flt_h * param.flt_w;

gl2sh_src.first_copy();
gl2sh_flt.first_copy();

__syncthreads();

T reg_src[SrcTileConfig::unroll_h * SrcTileConfig::unroll_w],
reg_flt[FilterTileConfig::unroll_h * FilterTileConfig::unroll_w];

T sum[OutTileConfig::unroll_size] = {0.0};

for (int fh = 0; fh < param.flt_h; fh += FilterTileConfig::unroll_h) {
gl2sh_src.copy();
gl2sh_flt.copy();
#pragma unroll
for (int s_h = 0; s_h < SrcTileConfig::unroll_h; ++s_h) {
#pragma unroll
for (int s_w = 0; s_w < SrcTileConfig::unroll_w; ++s_w) {
reg_src[s_h * SrcTileConfig::unroll_w + s_w] = smem_src_ptr
[(off_oh * stride_h + fh + s_h) % SrcTileCount::smem_h *
SrcTileCount::smem_w +
s_w];
}
}

#pragma unroll
for (int f_h = 0; f_h < FilterTileConfig::unroll_h; ++f_h) {
#pragma unroll
for (int f_w = 0; f_w < FilterTileConfig::unroll_w; ++f_w) {
reg_flt[f_h * FilterTileConfig::unroll_w + f_w] = smem_flt_ptr
[(fh + f_h) % FilterTileCount::smem_h *
FilterTileCount::smem_w +
f_w];
}
}

#pragma unroll
for (int inner_fh = 0; inner_fh < FilterTileConfig::unroll_h; ++inner_fh) {
#pragma unroll
for (int oh = 0; oh < OutTileConfig::unroll_h; ++oh) {
#pragma unroll
for (int fw = 0; fw < FilterTileConfig::unroll_w; ++fw) {
#pragma unroll
for (int ow = 0; ow < OutTileConfig::unroll_w; ++ow) {
sum[oh * OutTileConfig::unroll_w + ow] +=
reg_flt[inner_fh * FilterTileConfig::unroll_w + fw] *
reg_src[(inner_fh + oh) * SrcTileConfig::unroll_w + fw +
ow * stride_w];
}
}
}
}

__syncthreads();
gl2sh_src.commit();
gl2sh_flt.commit();
gl2sh_src.iter_forward();
gl2sh_flt.iter_forward();
__syncthreads();
}

for (int o = 0; o < OutTileConfig::unroll_size; ++o) {
for (int i = 1; i < ThreadConfig::thread_x; i = i << 1) {
sum[o] += __shfl_xor(sum[o], i, 32);
}
}

if (threadIdx.x == 0) {
#pragma unroll
for (int i = 0; i < OutTileConfig::unroll_h; ++i) {
int out_h_idx = out_base_h_idx + i;
if (out_h_idx < param.out_h) {
#pragma unroll
for (int j = 0; j < OutTileConfig::unroll_w; ++j) {
int out_w_idx = out_start_w + j;
if (out_w_idx >= param.out_w)
return;
out_base_ptr[out_h_idx * param.out_w + out_w_idx] =
sum[i * OutTileConfig::unroll_w + j];
}
}
}
}
}

template <
typename T, typename T2, DepthwiseConv2dDirection kDirection, int unroll_fw,
int unroll_ow, int stride>
void LaunchDepthwiseConv2dGPU(
const Param& param, const T* input, const T* filter, T* output,
cudaStream_t stream) {
static int const unroll_oh = 1, unroll_fh = 1;

using FilterTileConfig = FilterTileConfig<unroll_fh, unroll_fw>;
using ThreadConfig = ThreadConfig<4, 32>;
using OutTileConfig = OutTileConfig<ThreadConfig, unroll_oh, unroll_ow>;
using IConvTrait = ConvTrait<
T, kDirection, ThreadConfig, OutTileConfig, FilterTileConfig, stride,
stride>;
using SrcTileCount = typename IConvTrait::SrcTileCount;
using FilterTileCount = typename IConvTrait::FilterTileCount;

dim3 block(ThreadConfig::thread_x, ThreadConfig::thread_y);
dim3 grid;
grid.x = param.batch * param.src_chl * param.chl_mul;
grid.y = DIVUP(param.out_w, OutTileConfig::block_w);
grid.z = DIVUP(param.out_h, OutTileConfig::block_h);
const int shared_storage =
(SrcTileCount::smem_size + FilterTileCount::smem_size) * sizeof(T);

void (*kernel)(const Param, const T*, const T*, T*);

if (param.is_compute_deafult) {
kernel = DepthwiseConv2dGPUKernelNCHW<IConvTrait, kDirection>;
} else {
kernel = DepthwiseConv2dGPUKernelNCHWC32<IConvTrait, kDirection>;
}
kernel<<<grid, block, shared_storage, stream>>>(param, input, filter, output);
after_kernel_launch();
}

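For orientation, a minimal sketch of what the INSTANCE* macros below can resolve to for one stride-1 float32 forward case; the unroll_fw/unroll_ow values here are illustrative only and are normally chosen by the macro chain from param.flt_w and param.out_w:

// hypothetical direct instantiation; the real dispatch goes through INSTANCE/INSTANCE_A/INSTANCE_AB
LaunchDepthwiseConv2dGPU<
        float, float2, DepthwiseConv2dDirection::DIRECTION_FORWARD,
        /*unroll_fw=*/8, /*unroll_ow=*/16, /*stride=*/1>(param, src, flt, dst, stream);
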
#define INSTANCE_AB(type1, type2, a, b, direction) \
if (param.out_w > b * 4) { \
if (direction == DepthwiseConv2dDirection::DIRECTION_BACKWARD || \
(param.stride_h == 1 && param.stride_w == 1)) { \
LaunchDepthwiseConv2dGPU<type1, type2, direction, a + 2, b + 1, 1>( \
param, src, flt, dst, stream); \
} else if (param.stride_h == 2 && param.stride_w == 2) { \
LaunchDepthwiseConv2dGPU<type1, type2, direction, a + 2, b + 1, 2>( \
param, src, flt, dst, stream); \
} \
}

#define INSTANCE_A(type1, type2, a, direction)                                       \
    if (param.flt_w > a * 4) {                                                       \
        INSTANCE_AB(type1, type2, a, 15, direction)                                  \
        else INSTANCE_AB(type1, type2, a, 14, direction) else INSTANCE_AB(type1, type2, a, 13, direction) \
        else INSTANCE_AB(type1, type2, a, 12, direction) else INSTANCE_AB(type1, type2, a, 11, direction) \
        else INSTANCE_AB(type1, type2, a, 10, direction) else INSTANCE_AB(type1, type2, a, 9, direction)  \
        else INSTANCE_AB(type1, type2, a, 8, direction) else INSTANCE_AB(type1, type2, a, 7, direction)   \
        else INSTANCE_AB(type1, type2, a, 6, direction) else INSTANCE_AB(type1, type2, a, 5, direction)   \
        else INSTANCE_AB(type1, type2, a, 4, direction) else INSTANCE_AB(type1, type2, a, 3, direction)   \
        else INSTANCE_AB(type1, type2, a, 2, direction) else INSTANCE_AB(type1, type2, a, 1, direction)   \
        else INSTANCE_AB(type1, type2, a, 0, direction)                              \
    }

#define INSTANCE(type1, type2, direction)       \
    INSTANCE_A(type1, type2, 6, direction)      \
    else INSTANCE_A(type1, type2, 4, direction) \
    else INSTANCE_A(type1, type2, 2, direction) \
    else INSTANCE_A(type1, type2, 0, direction)
} // anonymous namespace

+56 -0  dnn/src/cuda/conv_bias/chanwise/fwd_large_filter.cu

@@ -0,0 +1,56 @@
/**
* \file dnn/src/cuda/conv_bias/chanwise/fwd_depthwise_large_filter.cu
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/

#include "cuda.h"
#include "cuda_fp16.h"
#include "src/cuda/conv_bias/chanwise/kern.cuh"
#include "src/cuda/conv_bias/chanwise/kern_helper.cuh"
#include "src/cuda/conv_bias/chanwise/launch_config.cuh"
#include "src/cuda/fp16_help.cuh"

using namespace megdnn;
using namespace cuda;
using namespace conv_bias;
using namespace chanwise;

#include "src/cuda/conv_bias/chanwise/depthwise_large_filter_algo.cuh"

namespace megdnn {
namespace cuda {
namespace conv_bias {
namespace chanwise {

// =====================================fwd=====================================

#define check

template <>
void run_fwd_depthwise_large_filter(
float* dst, const float* src, const float* flt, const Param& param,
cudaStream_t stream) {
INSTANCE(float, float2, DepthwiseConv2dDirection::DIRECTION_FORWARD)
}

#if CUDA_VERSION >= 9000
template <>
void run_fwd_depthwise_large_filter(
__half* dst, const __half* src, const __half* flt, const Param& param,
cudaStream_t stream) {
INSTANCE(__half, __half2, DepthwiseConv2dDirection::DIRECTION_FORWARD)
}
#endif

} // namespace chanwise
} // namespace conv_bias
} // namespace cuda
} // namespace megdnn

// vim: syntax=cuda.doxygen

+13 -6  dnn/src/cuda/conv_bias/chanwise/kern.cuh

@@ -27,8 +27,10 @@ namespace chanwise {
struct Param {
uint32_t batch, src_chl, src_h, src_w, chl_mul, flt_h, flt_w, out_h, out_w, pad_h,
pad_w, stride_h, stride_w, dilation_h, dilation_w;
bool is_compute_deafult;
#if MEGDNN_CC_HOST
static Param from_fwd_args(const BiasForwardSizeArgs& args) {
static Param from_fwd_args(
const BiasForwardSizeArgs& args, bool is_compute_deafult_ = true) {
#define U(v) static_cast<uint32_t>(v)
auto&& src = args.src_layout->shape;
auto&& dst = args.dst_layout->shape;
@@ -42,11 +44,12 @@ struct Param {
hw_pos = 1;
}
return {
U(src[0]), U(src[c_pos]), U(src[hw_pos]),
U(src[hw_pos + 1]), U(fm.ocpg), U(fm.spatial[0]),
U(fm.spatial[1]), U(dst[hw_pos]), U(dst[hw_pos + 1]),
U(fm.padding[0]), U(fm.padding[1]), U(fm.stride[0]),
U(fm.stride[1]), U(fm.dilation[0]), U(fm.dilation[1]),
U(src[0]), U(src[c_pos]), U(src[hw_pos]),
U(src[hw_pos + 1]), U(fm.ocpg), U(fm.spatial[0]),
U(fm.spatial[1]), U(dst[hw_pos]), U(dst[hw_pos + 1]),
U(fm.padding[0]), U(fm.padding[1]), U(fm.stride[0]),
U(fm.stride[1]), U(fm.dilation[0]), U(fm.dilation[1]),
is_compute_deafult_,
};
#undef U
}
@@ -61,6 +64,10 @@ template <typename T>
void run_fwd_small(
T* dst, const T* src, const T* flt, const Param& param, cudaStream_t stream);

template <typename T>
void run_fwd_depthwise_large_filter(
T* dst, const T* src, const T* flt, const Param& param, cudaStream_t stream);

// implemented in fwd_8x8x32.cu
void run_fwd_8x8x32(
int32_t* dst, const int8_t* src, const int8_t* flt, const Param& param,


+141 -12  dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp

@@ -74,13 +74,18 @@ cutlass::conv::ConvType convert_conv_type(Base::ConvType conv_type) {
return cutlass::conv::ConvType::kLocal;
case Base::ConvType::kLocalShare:
return cutlass::conv::ConvType::kLocalShare;
case Base::ConvType::kDepthwiseConvolution:
return cutlass::conv::ConvType::kDepthwiseConvolution;
default:
megdnn_assert(0, "invalid conv type");
}
}

NumericTypeID convert_dtype(DTypeEnum dtype) {
switch (dtype) {
NumericTypeID convert_dtype(DType dtype) {
// just to make convolutions with no bias happy
if (!dtype.valid())
return NumericTypeID::kF32;
switch (dtype.enumv()) {
case DTypeEnum::Float32:
return NumericTypeID::kF32;
case DTypeEnum::Float16:
@@ -100,6 +105,21 @@ NumericTypeID convert_dtype(DTypeEnum dtype) {
}
}

NumericTypeID get_accumulator_dtype(
DType dtype, const param::ConvBias::ComputeMode comp_mode) {
if (dtype.category() == DTypeCategory::QUANTIZED) {
return NumericTypeID::kS32;
} else {
megdnn_assert(dtype.category() == DTypeCategory::FLOAT);
if (comp_mode == param::ConvBias::ComputeMode::DEFAULT) {
return convert_dtype(dtype);
} else {
megdnn_assert(comp_mode == param::ConvBias::ComputeMode::FLOAT32);
return NumericTypeID::kF32;
}
}
}

struct LayoutPack {
LayoutTypeID src;
LayoutTypeID filter;
@@ -149,6 +169,9 @@ LayoutPack get_layout_pack(const param::ConvBias::Format format, int access_type
default:
megdnn_assert(0, "invalid access_type");
}
case Format::NCHW:
return {LayoutTypeID::kTensorNCHW, LayoutTypeID::kTensorNCHW,
LayoutTypeID::kTensorNCHW, LayoutTypeID::kTensorNCHW};
default:
megdnn_assert(0, "invalid format");
}
@@ -177,6 +200,94 @@ EpilogueType get_epilogue_type(const param::ConvBias::NonlineMode mode, bool cla
megdnn_assert(0, "invalid nonlinear mode");
}

std::pair<int, int> get_tensor_alignment(
const param::ConvBias::Format format, const TensorLayout& src,
const TensorLayout& filter, const Base::AlgoParam& algo_param,
bool is_chanwise) {
int alignment_src = 0;
int alignment_filter = 0;

using Format = param::ConvBias::Format;

// get tensor alignment for tensor op operations
// for tensor op operations, the alignment is determined by the size of a vector
auto get_tensor_alignment_tensor_op = [&]() {
switch (format) {
/// case int8
case Format::NCHW32:
case Format::NCHW32_NCHW4:
alignment_src = 16;
alignment_filter = 16;
break;
/// case int4 or uint4
case Format::NCHW64:
alignment_src = 32;
alignment_filter = 32;
break;
case Format::NHWC:
alignment_src = alignment_filter = algo_param.access_size;
break;
default:
megdnn_throw("invalid format");
};
};

// get tensor alignment for dot product operations
// for integer dot product operations, alignment src is always 4
// and the alignment filter is determined by the threadblock shape
auto get_tensor_alignment_dp4a = [&]() {
megdnn_assert(
format == Format::NCHW4 || format == Format::NCHW4_NCHW ||
format == Format::NCHW4_NHWC || format == Format::NCHW4_NCHW32);
alignment_src = 4;
// determine alignment filter
constexpr int warp_size = 32;
int threads = warp_size * algo_param.threadblock_m * algo_param.threadblock_n *
algo_param.threadblock_k /
(algo_param.warp_m * algo_param.warp_n * algo_param.warp_k);
int threadblock_loads =
filter.dtype.size(algo_param.threadblock_m * algo_param.threadblock_k);
int load_per_thread = threadblock_loads / threads;
if (load_per_thread >= 16)
alignment_filter = 16;
else if (load_per_thread >= 8)
alignment_filter = 8;
else {
megdnn_assert(load_per_thread >= 4);
alignment_filter = 4;
}
};

// get tensor alignment for depthwise convolution
auto get_tensor_alignment_dwconv2d_nchw = [&]() {
alignment_filter = 1;
size_t wi = src.dtype.size(src[3]); // width extent in bytes
for (size_t candidate : {16, 4, 2}) {
if (wi % candidate == 0) {
alignment_src = candidate;
break;
}
}
alignment_src /= src.dtype.size(1);
};

/// TODO: need a better way to check whether tensor core instruction is used
if (format == Format::NCHW32 || format == Format::NCHW32_NCHW4 ||
        format == Format::NCHW64 || format == Format::NHWC) {
get_tensor_alignment_tensor_op();
} else if (
format == Format::NCHW4 || format == Format::NCHW4_NCHW ||
format == Format::NCHW4_NHWC || format == Format::NCHW4_NCHW32) {
get_tensor_alignment_dp4a();
} else {
/// the following is used for depthwise convolution
megdnn_assert(format == Format::NCHW && is_chanwise);
get_tensor_alignment_dwconv2d_nchw();
}
megdnn_assert(alignment_src >= 1 && alignment_filter >= 1);
return {alignment_src, alignment_filter};
}
} // namespace
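
To make the alignment rules above concrete, a small worked example; the 128x128x32 threadblock with 32x64x32 warps and the W = 24 float32 tensor are illustrative values, not defaults taken from this patch:

// dp4a path (int8 filter, 1 byte per element), with the illustrative tile shape above
constexpr int warp_size = 32;
constexpr int threads = warp_size * 128 * 128 * 32 / (32 * 64 * 32);  // 256 threads
constexpr int threadblock_loads = 1 * 128 * 32;                       // 4096 bytes of filter per tile
constexpr int load_per_thread = threadblock_loads / threads;          // 16 -> alignment_filter = 16
// NCHW depthwise path, float32 source with W = 24:
//   wi = 4 * 24 = 96 bytes, 96 % 16 == 0 -> alignment_src = 16 / 4 = 4 elements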

const Operation* ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_conv_op(
@@ -185,23 +296,36 @@ const Operation* ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_co
auto&& param = args.opr->param();
auto layouts = get_layout_pack(param.format, m_algo_param.access_size);
auto epilogue_type = get_epilogue_type(
param.nonlineMode, args.dst_layout->dtype.enumv() != DTypeEnum::Float32);
param.nonlineMode,
args.dst_layout->dtype.category() != DTypeCategory::FLOAT);

cutlass::conv::SpecialOptimizeDesc special_optimization =
(use_conv_filter_unity_opt)
? cutlass::conv::SpecialOptimizeDesc::CONV_FILTER_UNITY
: cutlass::conv::SpecialOptimizeDesc::NONE;

int alignment_src, alignment_filter;
auto&& fm = args.filter_meta;
bool is_chanwise = param.sparse == param::ConvBias::Sparse::GROUP && fm.icpg == 1 &&
fm.ocpg == 1;
std::tie(alignment_src, alignment_filter) = get_tensor_alignment(
param.format, *args.src_layout, *args.filter_layout, m_algo_param,
is_chanwise);

auto accumulator_dtype =
get_accumulator_dtype(args.src_layout->dtype, param.compute_mode);

ConvolutionKey key{
convert_conv_op(conv_op),
convert_dtype(args.src_layout->dtype.enumv()),
convert_dtype(args.src_layout->dtype),
layouts.src,
convert_dtype(args.filter_layout->dtype.enumv()),
convert_dtype(args.filter_layout->dtype),
layouts.filter,
convert_dtype(args.dst_layout->dtype.enumv()),
convert_dtype(args.dst_layout->dtype),
layouts.dst,
convert_dtype(args.bias_layout->dtype.enumv()),
convert_dtype(args.bias_layout->dtype),
layouts.bias,
accumulator_dtype,
convert_conv_type(conv_type),
m_algo_param.threadblock_m,
m_algo_param.threadblock_n,
@@ -215,6 +339,8 @@ const Operation* ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_co
epilogue_type,
m_algo_param.stage,
special_optimization,
alignment_src,
alignment_filter,
without_shared_load};

return Singleton::get().operation_table.find_op(key);
@@ -227,13 +353,16 @@ void ConvBiasForwardImpl::AlgoCutlassConvolutionBase::execute_cutlass_conv_op(
size_t pw, size_t sh, size_t sw, size_t dh, size_t dw, const void* alpha,
const void* beta, const void* gamma, const void* delta, const void* theta,
const void* threshold, const void* dst_scale, cudaStream_t stream,
const void* extra_param) const {
const void* extra_param, size_t groups) const {
// gcc prints warnings when size_t values are implicitly narrowed to int
cutlass::conv::Conv2dProblemSize problem_size{
int(n), int(hi), int(wi), int(ci),
int(co), int(fh), int(fw), int(ho),
int(wo), int(ph), int(pw), int(sh),
int(sw), int(dh), int(dw), cutlass::conv::Mode::kCrossCorrelation};
int(n), int(hi), int(wi), int(ci),
int(co), int(fh), int(fw), int(ho),
int(wo), int(ph), int(pw), int(sh),
int(sw), int(dh), int(dw), cutlass::conv::Mode::kCrossCorrelation,
1, // split k slices, always 1
int(groups), // groups
};

ConvolutionArguments conv_args{
problem_size, src, filter, bias, z, dst, alpha,


+111 -0  dnn/src/cuda/conv_bias/depthwise_large_filter.cpp

@@ -0,0 +1,111 @@
/**
* \file dnn/src/cuda/conv_bias/depthwise_large_filter.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/

#include "src/cuda/conv_bias/chanwise/depthwise_large_filter.cuh"
#include "src/common/conv_bias.h"
#include "src/cuda/conv_bias/algo.h"
#include "src/cuda/conv_bias/chanwise/kern.cuh"
#include "src/cuda/utils.h"

using namespace megdnn;
using namespace cuda;
using namespace conv_bias;

namespace {
inline bool is_available_depthwise_large_filter(const chanwise::Param& param) {
if ((param.stride_h == 1 && param.stride_w == 1) ||
(param.stride_h == 2 && param.stride_w == 2)) {
auto&& device_prop = cuda::current_device_prop();
static int const unroll_oh = 1, unroll_fh = 1;
CHECK(FWD)
}
return false;
}
} // anonymous namespace

bool ConvBiasForwardImpl::AlgoDepthwiseLargeFilter::is_available(
const SizeArgs& args) const {
if (!args.src_layout->is_contiguous() || !args.dst_layout->is_contiguous()) {
return false;
}
if (args.src_layout->dtype != args.filter_layout->dtype &&
(args.src_layout->dtype != dtype::Float32()
#if CUDA_VERSION >= 9000
|| args.src_layout->dtype != dtype::Float16()
#endif
)) {
return false;
}
if (args.z_layout->ndim > 0)
return false;

auto param = chanwise::Param::from_fwd_args(
args, args.opr->param().compute_mode == Param::ComputeMode::DEFAULT);
auto&& fm = args.filter_meta;
return fm.group > 1 && args.filter_meta.format == Param::Format::NCHW &&
args.src_layout->dtype.category() == DTypeCategory::FLOAT &&
args.opr->param().compute_mode == Param::ComputeMode::DEFAULT &&
fm.spatial_ndim == 2 && fm.icpg == 1 && fm.ocpg == 1 &&
fm.dilation[0] == 1 && fm.dilation[1] == 1 && !fm.should_flip &&
is_available_depthwise_large_filter(param);
}

size_t ConvBiasForwardImpl::AlgoDepthwiseLargeFilter::get_workspace_in_bytes(
const SizeArgs& args) const {
auto dst_layout = *args.dst_layout;
if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) {
dst_layout.dtype = DType();
args.opr->check_or_deduce_dtype_fwd(
args.src_layout->dtype, args.filter_layout->dtype, dst_layout.dtype);
return dst_layout.span().dist_byte();
}
return 0;
}

void ConvBiasForwardImpl::AlgoDepthwiseLargeFilter::exec(const ExecArgs& args) const {
WorkspaceBundle bundle{args.workspace.raw_ptr, {get_workspace_in_bytes(args)}};
TensorND conv_dst_tensor = *args.dst_tensor;
if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) {
conv_dst_tensor = TensorND{bundle.get(0), conv_dst_tensor.layout};
conv_dst_tensor.layout.dtype = DType();
args.opr->check_or_deduce_dtype_fwd(
args.src_layout->dtype, args.filter_layout->dtype,
conv_dst_tensor.layout.dtype);
}
{
auto kparam = chanwise::Param::from_fwd_args(
args, args.opr->param().compute_mode == Param::ComputeMode::DEFAULT);
auto stream = cuda_stream(args.handle);
switch (args.src_layout->dtype.enumv()) {
case DTypeEnum::Float32:
chanwise::run_fwd_depthwise_large_filter(
conv_dst_tensor.ptr<float>(), args.src_tensor->ptr<float>(),
args.filter_tensor->ptr<float>(), kparam, stream);
break;
#if CUDA_VERSION >= 9000
case DTypeEnum::Float16:
chanwise::run_fwd_depthwise_large_filter(
static_cast<half*>(conv_dst_tensor.raw_ptr()),
static_cast<half*>(args.src_tensor->raw_ptr()),
static_cast<half*>(args.filter_tensor->raw_ptr()), kparam,
stream);
break;
#endif
default:
megdnn_assert_internal(0);
}
}
handle_bias_and_nonlinear(
args.handle, args.nonlinear_mode, &conv_dst_tensor, args.dst_tensor,
args.bias_tensor);
}

// vim: syntax=cpp.doxygen

+99 -0  dnn/src/cuda/conv_bias/implicit_batched_gemm_float16_nchw_hmma.cpp

@@ -0,0 +1,99 @@
/**
* \file dnn/src/cuda/conv_bias/implicit_batched_gemm_float16_nchw_hmma.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/

#include "src/common/conv_bias.h"
#include "src/cuda/conv_bias/algo.h"
#include "src/cuda/convolution_helper/parameter.cuh"
#include "src/cuda/utils.h"

using namespace megdnn;
using namespace cuda;

bool ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::is_available(
const SizeArgs& args) const {
#define RETURN_IF_FALSE(stmt_) \
if (!(stmt_)) \
return false;
RETURN_IF_FALSE(is_compute_capability_required(7, 0));
RETURN_IF_FALSE(
args.src_layout->is_contiguous() && args.dst_layout->is_contiguous());
using Param = param::ConvBias;
using Format = Param::Format;
using Sparse = Param::Sparse;
using Mode = Param::Mode;
using NonlineMode = Param::NonlineMode;
auto&& param = args.opr->param();
auto&& fm = args.filter_meta;
RETURN_IF_FALSE(
param.format == Format::NCHW &&
args.src_layout->dtype.enumv() == DTypeEnum::Float16 &&
args.filter_layout->dtype.enumv() == DTypeEnum::Float16 &&
args.dst_layout->dtype.enumv() == DTypeEnum::Float16);
RETURN_IF_FALSE(param.nonlineMode != NonlineMode::SIGMOID);
RETURN_IF_FALSE(
args.bias_layout->ndim <= 0 ||
(args.bias_layout->dtype.enumv() == DTypeEnum::Float16 &&
check_bias_share_in_channel(*args.bias_layout, param.format)));
RETURN_IF_FALSE(
args.z_layout->ndim <= 0 ||
args.z_layout->dtype.enumv() == DTypeEnum::Float16);
RETURN_IF_FALSE(param.sparse == Sparse::GROUP);
RETURN_IF_FALSE(param.mode == Mode::CROSS_CORRELATION);
// check if channelwise convolution
RETURN_IF_FALSE(fm.icpg == 1 && fm.ocpg == 1);
RETURN_IF_FALSE(param.dilate_h == 1 && param.dilate_w == 1);
const auto* op = get_cutlass_conv_op(
args, ConvOperator::kFprop, ConvType::kDepthwiseConvolution, false, false);
RETURN_IF_FALSE(op != nullptr);
return true;
#undef RETURN_IF_FALSE
}

void ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::exec(
const ExecArgs& args) const {
auto&& param = args.opr->param();
auto&& fm = args.filter_meta;
size_t n = args.src_layout->operator[](0), hi = args.src_layout->operator[](2),
wi = args.src_layout->operator[](3);
size_t ho = args.dst_layout->operator[](2), wo = args.dst_layout->operator[](3);
size_t co = fm.group;
size_t ci = co;
// check if channelwise convolution
megdnn_assert(fm.icpg == 1 && fm.ocpg == 1);
auto&& stream = cuda_stream(args.opr->handle());

float alpha = 1.f;
float beta = args.bias_layout->ndim > 0 ? 1.f : 0.f;
void* bias_ptr = args.bias_layout->ndim > 0 ? args.bias_tensor->raw_ptr() : nullptr;
float gamma = args.z_layout->ndim > 0 ? 1.f : 0.f;
void* z_ptr = args.z_layout->ndim > 0 ? args.z_tensor->raw_ptr() : nullptr;

// dummy parameters, used for quantization cases
float theta = 0.f;
float delta = 0.f;
float threshold = 0.f;

const auto* op = get_cutlass_conv_op(
args, ConvOperator::kFprop, ConvType::kDepthwiseConvolution, false, false);

UNPACK_CONV_PARAMETER(fm, param);
MARK_USED_VAR
execute_cutlass_conv_op(
op, args.src_tensor->raw_ptr(), args.filter_tensor->raw_ptr(), bias_ptr,
z_ptr, args.dst_tensor->raw_ptr(), nullptr, n, hi, wi, ci, co, fh, fw, ho,
wo, ph, pw, sh, sw, dh, dw, &alpha, &beta, &gamma, &delta, &theta,
&threshold, nullptr, stream, nullptr, fm.group);

after_kernel_launch();
}

// vim: syntax=cpp.doxygen

+99 -0  dnn/src/cuda/conv_bias/implicit_batched_gemm_float32_nchw_fma.cpp

@@ -0,0 +1,99 @@
/**
* \file dnn/src/cuda/conv_bias/implicit_batched_gemm_float32_nchw_fma.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/

#include "src/common/conv_bias.h"
#include "src/cuda/conv_bias/algo.h"
#include "src/cuda/convolution_helper/parameter.cuh"
#include "src/cuda/utils.h"

using namespace megdnn;
using namespace cuda;

bool ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_available(
const SizeArgs& args) const {
#define RETURN_IF_FALSE(stmt_) \
if (!(stmt_)) \
return false;
RETURN_IF_FALSE(is_compute_capability_required(6, 1));
RETURN_IF_FALSE(
args.src_layout->is_contiguous() && args.dst_layout->is_contiguous());
using Param = param::ConvBias;
using Format = Param::Format;
using Sparse = Param::Sparse;
using Mode = Param::Mode;
using NonlineMode = Param::NonlineMode;
auto&& param = args.opr->param();
auto&& fm = args.filter_meta;
RETURN_IF_FALSE(
param.format == Format::NCHW &&
args.src_layout->dtype.enumv() == DTypeEnum::Float32 &&
args.filter_layout->dtype.enumv() == DTypeEnum::Float32 &&
args.dst_layout->dtype.enumv() == DTypeEnum::Float32);
RETURN_IF_FALSE(param.nonlineMode != NonlineMode::SIGMOID);
RETURN_IF_FALSE(
args.bias_layout->ndim <= 0 ||
(args.bias_layout->dtype.enumv() == DTypeEnum::Float32 &&
check_bias_share_in_channel(*args.bias_layout, param.format)));
RETURN_IF_FALSE(
args.z_layout->ndim <= 0 ||
args.z_layout->dtype.enumv() == DTypeEnum::Float32);
RETURN_IF_FALSE(param.sparse == Sparse::GROUP);
RETURN_IF_FALSE(param.mode == Mode::CROSS_CORRELATION);
// check if channelwise convolution
RETURN_IF_FALSE(fm.icpg == 1 && fm.ocpg == 1);
RETURN_IF_FALSE(param.dilate_h == 1 && param.dilate_w == 1);
const auto* op = get_cutlass_conv_op(
args, ConvOperator::kFprop, ConvType::kDepthwiseConvolution, false, false);
RETURN_IF_FALSE(op != nullptr);
return true;
#undef RETURN_IF_FALSE
}

void ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::exec(
const ExecArgs& args) const {
auto&& param = args.opr->param();
auto&& fm = args.filter_meta;
size_t n = args.src_layout->operator[](0), hi = args.src_layout->operator[](2),
wi = args.src_layout->operator[](3);
size_t ho = args.dst_layout->operator[](2), wo = args.dst_layout->operator[](3);
size_t co = fm.group;
size_t ci = co;
// check if channelwise convolution
megdnn_assert(fm.icpg == 1 && fm.ocpg == 1);
auto&& stream = cuda_stream(args.opr->handle());

float alpha = 1.f;
float beta = args.bias_layout->ndim > 0 ? 1.f : 0.f;
void* bias_ptr = args.bias_layout->ndim > 0 ? args.bias_tensor->raw_ptr() : nullptr;
float gamma = args.z_layout->ndim > 0 ? 1.f : 0.f;
void* z_ptr = args.z_layout->ndim > 0 ? args.z_tensor->raw_ptr() : nullptr;

// dummy parameters, used for quantization cases
float theta = 0.f;
float delta = 0.f;
float threshold = 0.f;

const auto* op = get_cutlass_conv_op(
args, ConvOperator::kFprop, ConvType::kDepthwiseConvolution, false, false);

UNPACK_CONV_PARAMETER(fm, param);
MARK_USED_VAR
execute_cutlass_conv_op(
op, args.src_tensor->raw_ptr(), args.filter_tensor->raw_ptr(), bias_ptr,
z_ptr, args.dst_tensor->raw_ptr(), nullptr, n, hi, wi, ci, co, fh, fw, ho,
wo, ph, pw, sh, sw, dh, dw, &alpha, &beta, &gamma, &delta, &theta,
&threshold, nullptr, stream, nullptr, fm.group);

after_kernel_launch();
}

// vim: syntax=cpp.doxygen

+14 -1  dnn/src/cuda/conv_bias/opr_impl.cpp

@@ -145,9 +145,22 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic(
const bool prefer_dnn_chanwise = slow_cudnn_chanwise_impl ||
args.filter_meta.stride[0] != 1 ||
args.filter_meta.stride[1] != 1 || hw_size < 512;
//! choose for large kernel cases
size_t fh = args.filter_meta.spatial[0], fw = args.filter_meta.spatial[1];
size_t hi = src[2], wi = src[3];
const bool prefer_dnn_lk_implbmm = hi <= 2 * fh && wi <= 2 * fw;
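    //! e.g. a 32x32 feature map with a 31x31 depthwise filter (32 <= 2 * 31 in both
    //! dimensions) takes the implicit batched-gemm branch below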
//! avoid bad case in cudnn, check dnn chanwise impl first
if (is_chanwise) {
if (prefer_dnn_chanwise) {
if (prefer_dnn_lk_implbmm) {
#if CUDA_VERSION >= 10020
if (sm_algo_pack.f16_implicit_bmm[0].is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes))
return &sm_algo_pack.f16_implicit_bmm[0];
#endif
if (sm_algo_pack.f32_implicit_bmm[0].is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes))
return &sm_algo_pack.f32_implicit_bmm[0];
} else if (prefer_dnn_chanwise) {
if (sm_algo_pack.chanwise.is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes))
return &sm_algo_pack.chanwise;


+4 -0  dnn/src/cuda/conv_bias/opr_impl.h

@@ -45,6 +45,7 @@ public:
class AlgoCUDNNConvBiasActivation;
class AlgoChanwise;
class AlgoChanwiseSmall;
class AlgoDepthwiseLargeFilter;
class AlgoChanwise8x8x32;
class AlgoCUDNNConv;
class AlgoFallbackNCHWQS8;
@@ -71,6 +72,9 @@ public:
class AlgoInt4Int4NHWCIMMAImplicitGemm;
class AlgoUInt4Int4NHWCIMMAImplicitGemm;
class AlgoBFloat16;
// The following algorithms are suitable for channel wise convolution
class AlgoFloat32NCHWFMAImplicitBatchedGemm;
class AlgoFloat16NCHWHMMAImplicitBatchedGemm;

class AlgoPack;



+36 -0  dnn/src/cuda/convolution/backward_data/algo.cpp

@@ -19,10 +19,12 @@ using namespace cuda;
ConvolutionBackwardDataImpl::AlgoPack::AlgoPack() {
non_cudnn_algos.push_back(&chanwise);
non_cudnn_algos.push_back(&chanwise_small);
non_cudnn_algos.push_back(&depthwise_large_filter);
non_cudnn_algos.push_back(&matmul);

all_algos.push_back(&chanwise); // prefer chanwise
all_algos.push_back(&chanwise_small); // prefer small chanwise
all_algos.push_back(&depthwise_large_filter);

fill_cudnn_algos();
for (auto&& i : cudnn) {
@@ -41,6 +43,7 @@ ConvolutionBackwardDataImpl::AlgoPack::AlgoPack() {
all_algos.push_back(&algo);
int8_algos.push_back(&algo);
}
fill_dwconv_algos();

int8_algos.push_back(&int8_nchw_dotprod);
all_algos.push_back(&int8_nchw_dotprod);
@@ -54,6 +57,39 @@ ConvolutionBackwardDataImpl::AlgoPack::AlgoPack() {
}
}

void ConvolutionBackwardDataImpl::AlgoPack::fill_dwconv_algos() {
{
using AlgoParam = AlgoFloat32NCHWFMAImplicitBatchedGemm::AlgoParam;
/// preferred algo
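        /// AlgoParam fields: {threadblock_m, threadblock_n, threadblock_k,
        ///                    warp_m, warp_n, warp_k, stage}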
implbmm_nchw_fma.emplace_back(AlgoParam{64, 128, 8, 32, 64, 8, 2});
implbmm_nchw_fma.emplace_back(AlgoParam{128, 128, 8, 32, 64, 8, 2});
implbmm_nchw_fma.emplace_back(AlgoParam{128, 64, 8, 64, 32, 8, 2});
implbmm_nchw_fma.emplace_back(AlgoParam{128, 32, 8, 64, 32, 8, 2});
implbmm_nchw_fma.emplace_back(AlgoParam{32, 128, 8, 32, 64, 8, 2});
implbmm_nchw_fma.emplace_back(AlgoParam{64, 64, 8, 32, 64, 8, 2});
implbmm_nchw_fma.emplace_back(AlgoParam{32, 64, 8, 32, 64, 8, 2});
implbmm_nchw_fma.emplace_back(AlgoParam{32, 32, 8, 32, 32, 8, 2});
implbmm_nchw_fma.emplace_back(AlgoParam{64, 32, 8, 64, 32, 8, 2});
for (auto&& algo : implbmm_nchw_fma) {
all_algos.push_back(&algo);
}
}
#if CUDA_VERSION >= 10010
{
using AlgoParam = AlgoFloat16NCHWHMMAImplicitBatchedGemm::AlgoParam;
/// preferred algo
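        /// AlgoParam fields: {threadblock_m/n/k, warp_m/n/k,
        ///                    instruction_m/n/k, stage}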
implbmm_nchw_hmma.emplace_back(AlgoParam{64, 128, 32, 32, 32, 32, 8, 8, 4, 2});
implbmm_nchw_hmma.emplace_back(AlgoParam{128, 128, 32, 32, 32, 32, 8, 8, 4, 2});
implbmm_nchw_hmma.emplace_back(AlgoParam{128, 256, 32, 64, 64, 32, 8, 8, 4, 2});
implbmm_nchw_hmma.emplace_back(AlgoParam{128, 64, 32, 32, 32, 32, 8, 8, 4, 2});
implbmm_nchw_hmma.emplace_back(AlgoParam{64, 64, 32, 32, 32, 32, 8, 8, 4, 2});
for (auto&& algo : implbmm_nchw_hmma) {
all_algos.push_back(&algo);
}
}
#endif
}

MEGDNN_DEF_GET_ALGO_FROM_DESC(ConvolutionBackwardDataImpl)

ConvolutionBackwardDataImpl::AlgoCUDNN* ConvolutionBackwardDataImpl::AlgoPack::


+98 -1  dnn/src/cuda/convolution/backward_data/algo.h

@@ -37,11 +37,14 @@ public:
CUDA_MATMUL,
CUDA_CHANWISE,
CUDA_CHANWISE_SMALL,
CUDA_DEPTHWISE_LARGE_FILTER,
CUDA_BFLOAT16,
CUDA_GROUP_CONV_GENERAL,
CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8,
CUDA_IMPLICIT_GEMM_NCHW_DOTPROD_INT8,
CUDA_IMPLICIT_GEMM_NHWC_IMMA_INT8
CUDA_IMPLICIT_GEMM_NHWC_IMMA_INT8,
CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32,
CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16,
};
using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;

@@ -190,6 +193,20 @@ public:
}
};

class ConvolutionBackwardDataImpl::AlgoDepthwiseLargeFilter final : public AlgoBase {
public:
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& args) const override;
void exec(const ExecArgs& args) const override;

const char* name() const override { return "DEPTHWISE_LARGE_FILTER"; }
MEGDNN_DECL_ALGO_TYPE(CUDA_DEPTHWISE_LARGE_FILTER)
AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }

private:
mutable std::string m_name;
};

class ConvolutionBackwardDataImpl::AlgoBFloat16 final : public AlgoBase {
public:
bool is_available(const SizeArgs& args) const override;
@@ -315,6 +332,82 @@ private:
std::string m_name;
};

class ConvolutionBackwardDataImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm final
: public AlgoBase {
public:
struct AlgoParam {
int threadblock_m;
int threadblock_n;
int threadblock_k;
int warp_m;
int warp_n;
int warp_k;
int stage;
std::string to_string() {
return ssprintf(
"_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m, threadblock_n,
threadblock_k, warp_m, warp_n, warp_k, stage);
}
};
AlgoFloat32NCHWFMAImplicitBatchedGemm(AlgoParam algo_param)
: m_algo_param{algo_param},
m_name{ssprintf(
"FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM%s",
m_algo_param.to_string().c_str())} {}
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& args) const override { return 0; }
void exec(const ExecArgs& args) const override;
const char* name() const override { return m_name.c_str(); }
AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32)

private:
const void* get_available_op(const SizeArgs& args) const;
AlgoParam m_algo_param;
std::string m_name;
};

class ConvolutionBackwardDataImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm final
: public AlgoBase {
public:
/// add instruction shape as member of algo param, because f16 tensor core has 2
/// different matrix shapes (i.e. mma.884 and mma.1688)
struct AlgoParam {
int threadblock_m;
int threadblock_n;
int threadblock_k;
int warp_m;
int warp_n;
int warp_k;
int instruction_m;
int instruction_n;
int instruction_k;
int stage;
std::string to_string() {
return ssprintf(
"_%dX%dX%d_%dX%dX%d_mma%dX%dX%d_%dstage", threadblock_m,
threadblock_n, threadblock_k, warp_m, warp_n, warp_k, instruction_m,
instruction_n, instruction_k, stage);
}
};
AlgoFloat16NCHWHMMAImplicitBatchedGemm(AlgoParam algo_param)
: m_algo_param{algo_param},
m_name{ssprintf(
"FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM%s",
m_algo_param.to_string().c_str())} {}
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& args) const override { return 0; }
void exec(const ExecArgs& args) const override;
const char* name() const override { return m_name.c_str(); }
AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16)

private:
const void* get_available_op(const SizeArgs& args) const;
AlgoParam m_algo_param;
std::string m_name;
};

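A minimal sketch (not part of this patch) of how one of these backward-data algos is constructed and the name it reports; the tile values mirror the first entry registered by fill_dwconv_algos above:

// hypothetical standalone construction; in MegEngine these objects live inside AlgoPack
ConvolutionBackwardDataImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm algo{
        {/*threadblock*/ 64, 128, 8, /*warp*/ 32, 64, 8, /*stage*/ 2}};
// algo.name() == "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_64X128X8_32X64X8_2stage"
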
class ConvolutionBackwardDataImpl::AlgoPack : NonCopyableObj {
// defined in cudnn.cpp
void fill_cudnn_algos();
@@ -322,6 +415,7 @@ class ConvolutionBackwardDataImpl::AlgoPack : NonCopyableObj {
void fill_int8_dp4a_algos();
// defined in implicit_gemm_int8_nhwc_imma.cpp
void fill_int8_imma_algos();
void fill_dwconv_algos();

AlgoBase::Mapper m_all_algos_map;

@@ -332,11 +426,14 @@ public:
AlgoMatmul matmul;
AlgoChanwise chanwise;
AlgoChanwiseSmall chanwise_small;
AlgoDepthwiseLargeFilter depthwise_large_filter;
AlgoBFloat16 bfloat16;
AlgoGroupConvGeneral group;
std::vector<AlgoInt8NCHW4DotProdImplicitGemm> int8_nchw4_dotprod;
AlgoInt8NCHWDotProdImplicitGemm int8_nchw_dotprod;
std::vector<AlgoInt8NHWCIMMAImplicitGemm> int8_nhwc_imma;
std::vector<AlgoFloat32NCHWFMAImplicitBatchedGemm> implbmm_nchw_fma;
std::vector<AlgoFloat16NCHWHMMAImplicitBatchedGemm> implbmm_nchw_hmma;

std::vector<AlgoBase*>
//! all algorithms


+89 -0  dnn/src/cuda/convolution/backward_data/depthwise_large_filter.cpp

@@ -0,0 +1,89 @@
/**
* \file dnn/src/cuda/convolution/backward_data/depthwise_large_filter.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/

#include "src/cuda/conv_bias/chanwise/depthwise_large_filter.cuh"
#include "src/cuda/convolution/backward_data/algo.h"
#include "src/cuda/convolution/chanwise/kern.cuh"
#include "src/cuda/utils.h"

using namespace megdnn;
using namespace cuda;
using namespace convolution;

namespace {
inline bool is_available_depthwise_large_filter(const chanwise::Param& param) {
if ((param.stride_h == 1 && param.stride_w == 1) ||
(param.stride_h == 2 && param.stride_w == 2)) {
auto&& device_prop = cuda::current_device_prop();
static int const unroll_oh = 1, unroll_fh = 1;
CHECK(BWD)
}
return false;
}
} // anonymous namespace

bool ConvolutionBackwardDataImpl::AlgoDepthwiseLargeFilter::is_available(
const SizeArgs& args) const {
if (!args.grad_layout->is_contiguous() || !args.diff_layout->is_contiguous()) {
return false;
}
if (args.diff_layout->dtype != args.filter_layout->dtype &&
(args.diff_layout->dtype != dtype::Float32()
#if CUDA_VERSION >= 9000
|| args.diff_layout->dtype != dtype::Float16()
#endif
)) {
return false;
}

auto param = chanwise::Param::from_fwd_args(
{args.handle, args.diff_layout, args.filter_layout, args.filter_meta,
args.grad_layout});
auto&& fm = args.filter_meta;
return fm.group > 1 && args.filter_meta.format == Param::Format::NCHW &&
args.diff_layout->dtype.category() == DTypeCategory::FLOAT &&
args.opr->param().compute_mode == Param::ComputeMode::DEFAULT &&
fm.spatial_ndim == 2 && fm.icpg == 1 && fm.ocpg == 1 &&
fm.dilation[0] == 1 && fm.dilation[1] == 1 && !fm.should_flip &&
is_available_depthwise_large_filter(param);
}

size_t ConvolutionBackwardDataImpl::AlgoDepthwiseLargeFilter::get_workspace_in_bytes(
const SizeArgs& args) const {
return 0;
}

void ConvolutionBackwardDataImpl::AlgoDepthwiseLargeFilter::exec(
const ExecArgs& args) const {
auto kparam = chanwise::Param::from_fwd_args(
{args.handle, args.diff_layout, args.filter_layout, args.filter_meta,
args.grad_layout});
auto stream = cuda_stream(args.handle);
switch (args.diff_layout->dtype.enumv()) {
case DTypeEnum::Float32:
chanwise::run_bwd_depthwise_large_filter(
args.grad_tensor->ptr<float>(), args.diff_tensor->ptr<float>(),
args.filter_tensor->ptr<float>(), kparam, stream);
break;
#if CUDA_VERSION >= 9000
case DTypeEnum::Float16:
chanwise::run_bwd_depthwise_large_filter(
static_cast<half*>(args.grad_tensor->raw_ptr()),
static_cast<half*>(args.diff_tensor->raw_ptr()),
static_cast<half*>(args.filter_tensor->raw_ptr()), kparam, stream);
break;
#endif
default:
megdnn_assert_internal(0);
}
}

// vim: syntax=cpp.doxygen

+149 -0  dnn/src/cuda/convolution/backward_data/implicit_batched_gemm_float16_nchw_hmma.cpp

@@ -0,0 +1,149 @@
/**
* \file
* dnn/src/cuda/convolution/backward_data/implicit_batched_gemm_float16_nchw_hmma.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/

#include "src/cuda/convolution/backward_data/algo.h"
#include "src/cuda/cutlass/singleton.h"
#include "src/cuda/utils.h"

using namespace megdnn;
using namespace cuda;
using namespace cutlass::library;

const void* ConvolutionBackwardDataImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::
get_available_op(const SizeArgs& args) const {
int alignment_diff = 0;
int wo = args.diff_layout->dtype.size(args.diff_layout->operator[](3));
for (int candidate : {16, 4, 2}) {
if (wo % candidate == 0) {
alignment_diff = candidate;
break;
}
}
alignment_diff /= args.diff_layout->dtype.size(1);
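    // alignment_diff is now in elements: the largest of {16, 4, 2} bytes that divides
    // the diff tensor's W extent in bytes, divided by the element size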
NumericTypeID accumulator_dtype =
args.opr->param().compute_mode == param::Convolution::ComputeMode::DEFAULT
? NumericTypeID::kF16
: NumericTypeID::kF32;
ConvolutionKey key{
cutlass::conv::Operator::kDgrad,
NumericTypeID::kF16,
LayoutTypeID::kTensorNCHW,
NumericTypeID::kF16,
LayoutTypeID::kTensorNCHW,
NumericTypeID::kF16,
LayoutTypeID::kTensorNCHW,
NumericTypeID::kF16,
LayoutTypeID::kTensorNCHW,
accumulator_dtype,
cutlass::conv::ConvType::kDepthwiseConvolution,
m_algo_param.threadblock_m,
m_algo_param.threadblock_n,
m_algo_param.threadblock_k,
m_algo_param.warp_m,
m_algo_param.warp_n,
m_algo_param.warp_k,
m_algo_param.instruction_m,
m_algo_param.instruction_n,
m_algo_param.instruction_k,
cutlass::epilogue::EpilogueType::kBiasAddLinearCombination,
m_algo_param.stage,
cutlass::conv::SpecialOptimizeDesc::NONE,
alignment_diff,
1,
false};
return (void*)Singleton::get().operation_table.find_op(key);
}

bool ConvolutionBackwardDataImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::is_available(
const SizeArgs& args) const {
#define RETURN_IF_FALSE(stmt_) \
if (!(stmt_)) \
return false;
RETURN_IF_FALSE(is_compute_capability_required(7, 0));
RETURN_IF_FALSE(
args.diff_layout->is_contiguous() && args.grad_layout->is_contiguous());
using Param = param::Convolution;
using Format = Param::Format;
using Sparse = Param::Sparse;
using Mode = Param::Mode;
auto&& param = args.opr->param();
auto&& fm = args.filter_meta;
RETURN_IF_FALSE(
param.format == Format::NCHW &&
args.diff_layout->dtype.enumv() == DTypeEnum::Float16 &&
args.filter_layout->dtype.enumv() == DTypeEnum::Float16 &&
args.grad_layout->dtype.enumv() == DTypeEnum::Float16);
RETURN_IF_FALSE(param.sparse == Sparse::GROUP);
RETURN_IF_FALSE(param.mode == Mode::CROSS_CORRELATION);
// check if channelwise convolution
RETURN_IF_FALSE(fm.icpg == 1 && fm.ocpg == 1);
RETURN_IF_FALSE(param.dilate_h == 1 && param.dilate_w == 1);
const auto* op = get_available_op(args);
RETURN_IF_FALSE(op != nullptr);
return true;
#undef RETURN_IF_FALSE
}

void ConvolutionBackwardDataImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::exec(
const ExecArgs& args) const {
auto&& param = args.opr->param();
auto&& fm = args.filter_meta;
int n = args.diff_layout->operator[](0), ho = args.diff_layout->operator[](2),
wo = args.diff_layout->operator[](3);
int hi = args.grad_layout->operator[](2), wi = args.grad_layout->operator[](3);
int co = fm.group, ci = co, groups = co;
int fh = fm.spatial[0], fw = fm.spatial[1];
int sh = fm.stride[0], sw = fm.stride[1];
int ph = fm.padding[0], pw = fm.padding[1];
int dh = param.dilate_h, dw = param.dilate_w;

// check if channelwise convolution
megdnn_assert(fm.icpg == 1 && fm.ocpg == 1);
auto&& stream = cuda_stream(args.opr->handle());

float alpha = 1.f;
float beta = 0.f;
float gamma = 0.f;
float delta = 0.f;

const Operation* op = (const Operation*)get_available_op(args);

cutlass::conv::Conv2dProblemSize problem_size{
n, hi, wi, ci, co, fh, fw, ho,
wo, ph, pw, sh, sw, dh, dw, cutlass::conv::Mode::kCrossCorrelation,
1, // split k slices, always 1
groups, // groups
};

cutlass::library::ConvolutionArguments conv_args{
problem_size,
args.diff_tensor->raw_ptr(),
args.filter_tensor->raw_ptr(),
nullptr,
nullptr,
args.grad_tensor->raw_ptr(),
&alpha,
&beta,
&gamma,
&delta,
nullptr,
nullptr,
nullptr,
nullptr};

cutlass_check(op->run(&conv_args, nullptr, stream));

after_kernel_launch();
}

// vim: syntax=cpp.doxygen

+145 -0  dnn/src/cuda/convolution/backward_data/implicit_batched_gemm_float32_nchw_fma.cpp

@@ -0,0 +1,145 @@
/**
* \file
* dnn/src/cuda/convolution/backward_data/implicit_batched_gemm_float32_nchw_fma.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/

#include "src/cuda/convolution/backward_data/algo.h"
#include "src/cuda/cutlass/singleton.h"
#include "src/cuda/utils.h"

using namespace megdnn;
using namespace cuda;
using namespace cutlass::library;

const void* ConvolutionBackwardDataImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::
get_available_op(const SizeArgs& args) const {
int alignment_diff = 0;
int wo = args.diff_layout->dtype.size(args.diff_layout->operator[](3));
for (int candidate : {16, 4}) {
if (wo % candidate == 0) {
alignment_diff = candidate;
break;
}
}
alignment_diff /= args.diff_layout->dtype.size(1);
ConvolutionKey key{
cutlass::conv::Operator::kDgrad,
NumericTypeID::kF32,
LayoutTypeID::kTensorNCHW,
NumericTypeID::kF32,
LayoutTypeID::kTensorNCHW,
NumericTypeID::kF32,
LayoutTypeID::kTensorNCHW,
NumericTypeID::kF32,
LayoutTypeID::kTensorNCHW,
NumericTypeID::kF32,
cutlass::conv::ConvType::kDepthwiseConvolution,
m_algo_param.threadblock_m,
m_algo_param.threadblock_n,
m_algo_param.threadblock_k,
m_algo_param.warp_m,
m_algo_param.warp_n,
m_algo_param.warp_k,
1,
1,
1,
cutlass::epilogue::EpilogueType::kBiasAddLinearCombination,
m_algo_param.stage,
cutlass::conv::SpecialOptimizeDesc::NONE,
alignment_diff,
1,
false};
return (void*)Singleton::get().operation_table.find_op(key);
}

bool ConvolutionBackwardDataImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_available(
const SizeArgs& args) const {
#define RETURN_IF_FALSE(stmt_) \
if (!(stmt_)) \
return false;
RETURN_IF_FALSE(is_compute_capability_required(6, 1));
RETURN_IF_FALSE(
args.diff_layout->is_contiguous() && args.grad_layout->is_contiguous());
using Param = param::Convolution;
using Format = Param::Format;
using Sparse = Param::Sparse;
using Mode = Param::Mode;
auto&& param = args.opr->param();
auto&& fm = args.filter_meta;
RETURN_IF_FALSE(
param.format == Format::NCHW &&
args.diff_layout->dtype.enumv() == DTypeEnum::Float32 &&
args.filter_layout->dtype.enumv() == DTypeEnum::Float32 &&
args.grad_layout->dtype.enumv() == DTypeEnum::Float32);
RETURN_IF_FALSE(param.sparse == Sparse::GROUP);
RETURN_IF_FALSE(param.mode == Mode::CROSS_CORRELATION);
// check if channelwise convolution
RETURN_IF_FALSE(fm.icpg == 1 && fm.ocpg == 1);
RETURN_IF_FALSE(param.dilate_h == 1 && param.dilate_w == 1);
const auto* op = get_available_op(args);
RETURN_IF_FALSE(op != nullptr);
return true;
#undef RETURN_IF_FALSE
}

void ConvolutionBackwardDataImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::exec(
const ExecArgs& args) const {
auto&& param = args.opr->param();
auto&& fm = args.filter_meta;
int n = args.diff_layout->operator[](0), ho = args.diff_layout->operator[](2),
wo = args.diff_layout->operator[](3);
int hi = args.grad_layout->operator[](2), wi = args.grad_layout->operator[](3);
int co = fm.group, ci = co, groups = co;
int fh = fm.spatial[0], fw = fm.spatial[1];
int sh = fm.stride[0], sw = fm.stride[1];
int ph = fm.padding[0], pw = fm.padding[1];
int dh = param.dilate_h, dw = param.dilate_w;

// check if channelwise convolution
megdnn_assert(fm.icpg == 1 && fm.ocpg == 1);
auto&& stream = cuda_stream(args.opr->handle());

float alpha = 1.f;
float beta = 0.f;
float gamma = 0.f;
float delta = 0.f;

const Operation* op = (const Operation*)get_available_op(args);

cutlass::conv::Conv2dProblemSize problem_size{
n, hi, wi, ci, co, fh, fw, ho,
wo, ph, pw, sh, sw, dh, dw, cutlass::conv::Mode::kCrossCorrelation,
1, // split k slices, always 1
groups, // groups
};

cutlass::library::ConvolutionArguments conv_args{
problem_size,
args.diff_tensor->raw_ptr(),
args.filter_tensor->raw_ptr(),
nullptr,
nullptr,
args.grad_tensor->raw_ptr(),
&alpha,
&beta,
&gamma,
&delta,
nullptr,
nullptr,
nullptr,
nullptr};

cutlass_check(op->run(&conv_args, nullptr, stream));

after_kernel_launch();
}

// vim: syntax=cpp.doxygen

+ 16
- 0
dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp View File

@@ -29,6 +29,19 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::
(sh == 2 && sw == 2)
? cutlass::conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING
: cutlass::conv::SpecialOptimizeDesc::NONE;
int alignment_filter = 4;
constexpr int warp_size = 32;
int threads = warp_size * m_algo_param.threadblock_m * m_algo_param.threadblock_n *
m_algo_param.threadblock_k /
(m_algo_param.warp_m * m_algo_param.warp_n * m_algo_param.warp_k);
int threadblock_loads = args.filter_layout->dtype.size(
m_algo_param.threadblock_m * m_algo_param.threadblock_k);
int load_per_thread = threadblock_loads / threads;
if (load_per_thread >= 16)
alignment_filter = 16;
else if (load_per_thread >= 8)
alignment_filter = 8;
megdnn_assert(load_per_thread >= 4);
ConvolutionKey key{
cutlass::conv::Operator::kDgrad,
NumericTypeID::kS8,
@@ -39,6 +52,7 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::
LayoutTypeID::kTensorNC4HW4,
NumericTypeID::kS32,
LayoutTypeID::kTensorNC4HW4,
NumericTypeID::kS32,
cutlass::conv::ConvType::kConvolution,
m_algo_param.threadblock_m,
m_algo_param.threadblock_n,
@@ -52,6 +66,8 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::
cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp,
m_algo_param.stage,
special_optimization,
4,
alignment_filter,
false};
return (void*)Singleton::get().operation_table.find_op(key);
}
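
The alignment_filter heuristic in this hunk sizes the per-thread global load: the thread count of a threadblock is the warp size times the ratio of threadblock to warp tile volumes, the bytes of one int8 filter tile slice (threadblock_m x threadblock_k elements) are divided by that count, and the widest legal vector width is chosen. A worked standalone sketch with hypothetical tile shapes (not taken from any algo list in this patch):

    #include <cassert>
    #include <cstdio>

    int main() {
        // Hypothetical int8 tile configuration, for illustration only.
        int tb_m = 64, tb_n = 128, tb_k = 32;       // threadblock tile
        int warp_m = 32, warp_n = 64, warp_k = 32;  // warp tile
        constexpr int warp_size = 32;

        int threads = warp_size * tb_m * tb_n * tb_k / (warp_m * warp_n * warp_k);
        int threadblock_loads = tb_m * tb_k * 1;    // int8: 1 byte per element
        int load_per_thread = threadblock_loads / threads;

        int alignment_filter = 4;
        if (load_per_thread >= 16)
            alignment_filter = 16;
        else if (load_per_thread >= 8)
            alignment_filter = 8;
        assert(load_per_thread >= 4);

        // threads = 128, loads = 2048 B, load_per_thread = 16 -> 16-byte alignment
        std::printf("threads=%d load_per_thread=%d alignment=%d\n", threads,
                    load_per_thread, alignment_filter);
        return 0;
    }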


+ 3
- 0
dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp View File

@@ -39,6 +39,7 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::
LayoutTypeID::kTensorNC4HW4,
NumericTypeID::kS32,
LayoutTypeID::kTensorNC4HW4,
NumericTypeID::kS32,
cutlass::conv::ConvType::kConvolution,
16,
64,
@@ -52,6 +53,8 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::
cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp,
2,
special_optimization,
4,
4,
false};
return (void*)Singleton::get().operation_table.find_op(key);
}


+ 3
- 0
dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nhwc_imma.cpp View File

@@ -50,6 +50,7 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NHWCIMMAImplicitGemm::get_avail
LayoutTypeID::kTensorNHWC,
NumericTypeID::kS32,
LayoutTypeID::kTensorNHWC,
NumericTypeID::kS32,
cutlass::conv::ConvType::kConvolution,
m_algo_param.threadblock_m,
m_algo_param.threadblock_n,
@@ -63,6 +64,8 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NHWCIMMAImplicitGemm::get_avail
cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp,
m_algo_param.stage,
special_optimization,
m_algo_param.access_size,
m_algo_param.access_size,
false};
return (void*)Singleton::get().operation_table.find_op(key);
}


+ 34
- 0
dnn/src/cuda/convolution/backward_filter/algo.cpp View File

@@ -25,6 +25,7 @@ ConvolutionBackwardFilterImpl::AlgoPack::AlgoPack() {
for (auto&& i : cudnn) {
all_algos.push_back(&i);
}
fill_dwconv_algos();
all_algos.push_back(&matmul);
all_algos.push_back(&group);

@@ -48,6 +49,39 @@ ConvolutionBackwardFilterImpl::AlgoCUDNN* ConvolutionBackwardFilterImpl::AlgoPac
"can not find cudnn bwd_filter algorithm %d", static_cast<int>(algo)));
}

void ConvolutionBackwardFilterImpl::AlgoPack::fill_dwconv_algos() {
{
using AlgoParam = AlgoFloat32NCHWFMAImplicitBatchedGemm::AlgoParam;
/// preferred algo
implbmm_nchw_fma.emplace_back(AlgoParam{64, 128, 8, 32, 64, 8, 2});
implbmm_nchw_fma.emplace_back(AlgoParam{128, 128, 8, 32, 64, 8, 2});
implbmm_nchw_fma.emplace_back(AlgoParam{128, 64, 8, 64, 32, 8, 2});
implbmm_nchw_fma.emplace_back(AlgoParam{128, 32, 8, 64, 32, 8, 2});
implbmm_nchw_fma.emplace_back(AlgoParam{32, 128, 8, 32, 64, 8, 2});
implbmm_nchw_fma.emplace_back(AlgoParam{64, 64, 8, 32, 64, 8, 2});
implbmm_nchw_fma.emplace_back(AlgoParam{32, 64, 8, 32, 64, 8, 2});
implbmm_nchw_fma.emplace_back(AlgoParam{32, 32, 8, 32, 32, 8, 2});
implbmm_nchw_fma.emplace_back(AlgoParam{64, 32, 8, 64, 32, 8, 2});
for (auto&& algo : implbmm_nchw_fma) {
all_algos.push_back(&algo);
}
}
#if CUDA_VERSION >= 10010
{
using AlgoParam = AlgoFloat16NCHWHMMAImplicitBatchedGemm::AlgoParam;
/// preferred algo
implbmm_nchw_hmma.emplace_back(AlgoParam{64, 128, 32, 32, 32, 32, 8, 8, 4, 2});
implbmm_nchw_hmma.emplace_back(AlgoParam{128, 128, 32, 32, 32, 32, 8, 8, 4, 2});
implbmm_nchw_hmma.emplace_back(AlgoParam{128, 256, 32, 64, 64, 32, 8, 8, 4, 2});
implbmm_nchw_hmma.emplace_back(AlgoParam{128, 64, 32, 32, 32, 32, 8, 8, 4, 2});
implbmm_nchw_hmma.emplace_back(AlgoParam{64, 64, 32, 32, 32, 32, 8, 8, 4, 2});
for (auto&& algo : implbmm_nchw_hmma) {
all_algos.push_back(&algo);
}
}
#endif
}

ConvolutionBackwardFilterImpl::AlgoPack ConvolutionBackwardFilterImpl::sm_algo_pack;

ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs::SizeArgs(


+ 81
- 0
dnn/src/cuda/convolution/backward_filter/algo.h View File

@@ -37,6 +37,8 @@ public:
CUDA_CHANWISE,
CUDA_BFLOAT16,
CUDA_GROUP_CONV_GENERAL,
CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32,
CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16,
};
using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;

@@ -210,9 +212,86 @@ private:
WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
};

class ConvolutionBackwardFilterImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm final
: public AlgoBase {
public:
struct AlgoParam {
int threadblock_m;
int threadblock_n;
int threadblock_k;
int warp_m;
int warp_n;
int warp_k;
int stage;
std::string to_string() {
return ssprintf(
"_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m, threadblock_n,
threadblock_k, warp_m, warp_n, warp_k, stage);
}
};
AlgoFloat32NCHWFMAImplicitBatchedGemm(AlgoParam algo_param)
: m_algo_param{algo_param},
m_name{ssprintf(
"FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM%s",
m_algo_param.to_string().c_str())} {}
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& args) const override { return 0; }
void exec(const ExecArgs& args) const override;
const char* name() const override { return m_name.c_str(); }
AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32)

private:
const void* get_available_op(const SizeArgs& args) const;
AlgoParam m_algo_param;
std::string m_name;
};

class ConvolutionBackwardFilterImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm final
: public AlgoBase {
public:
/// add the instruction shape as a member of the algo param, because the f16 tensor
/// core supports 2 different mma shapes (i.e. mma.884 and mma.1688)
struct AlgoParam {
int threadblock_m;
int threadblock_n;
int threadblock_k;
int warp_m;
int warp_n;
int warp_k;
int instruction_m;
int instruction_n;
int instruction_k;
int stage;
std::string to_string() {
return ssprintf(
"_%dX%dX%d_%dX%dX%d_mma%dX%dX%d_%dstage", threadblock_m,
threadblock_n, threadblock_k, warp_m, warp_n, warp_k, instruction_m,
instruction_n, instruction_k, stage);
}
};
AlgoFloat16NCHWHMMAImplicitBatchedGemm(AlgoParam algo_param)
: m_algo_param{algo_param},
m_name{ssprintf(
"FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM%s",
m_algo_param.to_string().c_str())} {}
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& args) const override;
void exec(const ExecArgs& args) const override;
const char* name() const override { return m_name.c_str(); }
AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16)

private:
const void* get_available_op(const SizeArgs& args) const;
AlgoParam m_algo_param;
std::string m_name;
};
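
The name strings produced by to_string above are what the heuristic dispatcher and the chanwise tests later in this patch look up. As a sanity check, a standalone sketch (using std::snprintf in place of megdnn's ssprintf) that reproduces the name of the first preferred HMMA tile registered in fill_dwconv_algos, {64, 128, 32, 32, 32, 32, 8, 8, 4, 2}:

    #include <cstdio>

    int main() {
        int tb_m = 64, tb_n = 128, tb_k = 32;      // threadblock shape
        int warp_m = 32, warp_n = 32, warp_k = 32; // warp shape
        int inst_m = 8, inst_n = 8, inst_k = 4;    // mma.884 instruction shape
        int stage = 2;

        char name[128];
        // Same prefix and format string as the constructor and to_string above.
        std::snprintf(
                name, sizeof(name),
                "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_%dX%dX%d_%dX%dX%d_mma%dX%dX%d_%dstage",
                tb_m, tb_n, tb_k, warp_m, warp_n, warp_k, inst_m, inst_n, inst_k,
                stage);
        // prints FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_64X128X32_32X32X32_mma8X8X4_2stage
        std::puts(name);
        return 0;
    }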

class ConvolutionBackwardFilterImpl::AlgoPack : NonCopyableObj {
// defined in cudnn.cpp
void fill_cudnn_algos();
void fill_dwconv_algos();

AlgoBase::Mapper m_all_algos_map;

@@ -224,6 +303,8 @@ public:
AlgoChanwise chanwise;
AlgoGroupConvGeneral group;
AlgoBFloat16 bfloat16;
std::vector<AlgoFloat32NCHWFMAImplicitBatchedGemm> implbmm_nchw_fma;
std::vector<AlgoFloat16NCHWHMMAImplicitBatchedGemm> implbmm_nchw_hmma;

std::vector<AlgoBase*>
//! all algorithms


+ 172
- 0
dnn/src/cuda/convolution/backward_filter/implicit_batched_gemm_float16_nchw_hmma.cpp View File

@@ -0,0 +1,172 @@
/**
* \file
* dnn/src/cuda/convolution/backward_filter/implicit_batched_gemm_float16_nchw_hmma.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/

#include "src/cuda/convolution/backward_filter/algo.h"
#include "src/cuda/cutlass/singleton.h"
#include "src/cuda/utils.h"

using namespace megdnn;
using namespace cuda;
using namespace cutlass::library;

const void* ConvolutionBackwardFilterImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::
get_available_op(const SizeArgs& args) const {
auto get_alignment = [](const TensorLayout& layout) {
int alignment = 0;
int width = layout.dtype.size(layout[3]);
for (int candidate : {16, 4, 2}) {
if (width % candidate == 0) {
alignment = candidate;
break;
}
}
alignment /= layout.dtype.size(1);
return alignment;
};
int alignment_src = get_alignment(*args.src_layout);
int alignment_diff = get_alignment(*args.diff_layout);
megdnn_assert(alignment_src >= 1 && alignment_diff >= 1);
NumericTypeID accumulator_dtype =
args.opr->param().compute_mode == param::Convolution::ComputeMode::DEFAULT
? NumericTypeID::kF16
: NumericTypeID::kF32;
ConvolutionKey key{
cutlass::conv::Operator::kWgrad,
NumericTypeID::kF16, // src tensor data type
LayoutTypeID::kTensorNCHW, // src tensor layout
NumericTypeID::kF16, // diff tensor data type
LayoutTypeID::kTensorNCHW, // diff tensor layout
NumericTypeID::kF32, // grad tensor data type
LayoutTypeID::kTensorNCHW, // grad tensor layout
NumericTypeID::kF32, // dummy argument, not used.
LayoutTypeID::kTensorNCHW, // dummy argument, not used
accumulator_dtype,
cutlass::conv::ConvType::kDepthwiseConvolution,
m_algo_param.threadblock_m,
m_algo_param.threadblock_n,
m_algo_param.threadblock_k,
m_algo_param.warp_m,
m_algo_param.warp_n,
m_algo_param.warp_k,
m_algo_param.instruction_m,
m_algo_param.instruction_n,
m_algo_param.instruction_k,
cutlass::epilogue::EpilogueType::kLinearCombination, // no bias
m_algo_param.stage,
cutlass::conv::SpecialOptimizeDesc::NONE,
alignment_src,
alignment_diff,
true};
return (void*)Singleton::get().operation_table.find_op(key);
}

bool ConvolutionBackwardFilterImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::
is_available(const SizeArgs& args) const {
#define RETURN_IF_FALSE(stmt_) \
if (!(stmt_)) \
return false;
RETURN_IF_FALSE(is_compute_capability_required(7, 0));
RETURN_IF_FALSE(
args.src_layout->is_contiguous() && args.diff_layout->is_contiguous() &&
args.grad_layout->is_contiguous());
using Param = param::Convolution;
using Format = Param::Format;
using Sparse = Param::Sparse;
using Mode = Param::Mode;
using ComputeMode = Param::ComputeMode;
auto&& param = args.opr->param();
auto&& fm = args.grad_filter_meta;
RETURN_IF_FALSE(param.compute_mode == ComputeMode::FLOAT32);
RETURN_IF_FALSE(
param.format == Format::NCHW &&
args.src_layout->dtype.enumv() == DTypeEnum::Float16 &&
args.diff_layout->dtype.enumv() == DTypeEnum::Float16 &&
args.grad_layout->dtype.enumv() == DTypeEnum::Float16);
RETURN_IF_FALSE(param.sparse == Sparse::GROUP);
RETURN_IF_FALSE(param.mode == Mode::CROSS_CORRELATION);
// check if channelwise convolution
RETURN_IF_FALSE(fm.icpg == 1 && fm.ocpg == 1);
RETURN_IF_FALSE(param.dilate_h == 1 && param.dilate_w == 1);
const auto* op = get_available_op(args);
RETURN_IF_FALSE(op != nullptr);
return true;
#undef RETURN_IF_FALSE
}

size_t ConvolutionBackwardFilterImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::
get_workspace_in_bytes(const SizeArgs& args) const {
auto layout = *args.grad_layout;
// modify data type
layout.modify_dtype_inplace(dtype::Float32());
return layout.span().dist_byte();
}

void ConvolutionBackwardFilterImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::exec(
const ExecArgs& args) const {
auto&& param = args.opr->param();
auto&& fm = args.grad_filter_meta;
int hi = args.src_layout->operator[](2), wi = args.src_layout->operator[](3);
int n = args.diff_layout->operator[](0), ho = args.diff_layout->operator[](2),
wo = args.diff_layout->operator[](3);
int co = fm.group, ci = co, groups = co;
int fh = fm.spatial[0], fw = fm.spatial[1];
int sh = fm.stride[0], sw = fm.stride[1];
int ph = fm.padding[0], pw = fm.padding[1];
int dh = param.dilate_h, dw = param.dilate_w;

// check if channelwise convolution
megdnn_assert(fm.icpg == 1 && fm.ocpg == 1);
auto&& stream = cuda_stream(args.opr->handle());

float alpha = 1.f;
float beta = 0.f;

const Operation* op = (const Operation*)get_available_op(args);

cutlass::conv::Conv2dProblemSize problem_size{
n, hi, wi, ci, co, fh, fw, ho,
wo, ph, pw, sh, sw, dh, dw, cutlass::conv::Mode::kCrossCorrelation,
1, // split k slices, always 1
groups, // groups
};

cutlass::library::ConvolutionArguments conv_args{
problem_size,
args.src_tensor->raw_ptr(),
args.diff_tensor->raw_ptr(),
nullptr,
nullptr,
args.workspace.raw_ptr,
&alpha,
&beta,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr};

cutlass_check(op->run(&conv_args, nullptr, stream));

after_kernel_launch();

auto&& typecvt = args.opr->handle()->create_operator<TypeCvt>();
auto f32_grad_layout = *args.grad_layout;
// modify data type
f32_grad_layout.modify_dtype_inplace(dtype::Float32());
TensorND src{args.workspace.raw_ptr, f32_grad_layout},
dst{args.grad_tensor->raw_ptr(), *args.grad_layout};
typecvt->exec(src, dst);
}
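
The workspace requested by get_workspace_in_bytes above is simply the grad layout re-typed to float32: the wgrad kernel accumulates and writes the filter gradient in fp32 into the workspace, and the TypeCvt at the end of exec converts it into the user's fp16 grad tensor. A standalone sketch of the size computation for a contiguous channelwise filter grad of shape (group, 1, 1, fh, fw); the helper below is illustrative, not the MegDNN API:

    #include <cstddef>
    #include <cstdio>

    // Bytes of a contiguous (group, 1, 1, fh, fw) filter grad re-typed to fp32,
    // mirroring modify_dtype_inplace(dtype::Float32()) + span().dist_byte().
    std::size_t fp32_wgrad_workspace_bytes(
            std::size_t group, std::size_t fh, std::size_t fw) {
        return group * 1 * 1 * fh * fw * sizeof(float);
    }

    int main() {
        // e.g. 384 channels with a 31x31 depthwise filter -> 1476096 bytes (~1.4 MiB)
        std::printf("%zu\n", fp32_wgrad_workspace_bytes(384, 31, 31));
        return 0;
    }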

// vim: syntax=cpp.doxygen

+ 135
- 0
dnn/src/cuda/convolution/backward_filter/implicit_batched_gemm_float32_nchw_fma.cpp View File

@@ -0,0 +1,135 @@
/**
* \file
* dnn/src/cuda/convolution/backward_filter/implicit_batched_gemm_float32_nchw_fma.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/

#include "src/cuda/convolution/backward_filter/algo.h"
#include "src/cuda/cutlass/singleton.h"
#include "src/cuda/utils.h"

using namespace megdnn;
using namespace cuda;
using namespace cutlass::library;

const void* ConvolutionBackwardFilterImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::
get_available_op(const SizeArgs& args) const {
ConvolutionKey key{
cutlass::conv::Operator::kWgrad,
NumericTypeID::kF32, // src tensor data type
LayoutTypeID::kTensorNCHW, // src tensor layout
NumericTypeID::kF32, // diff tensor data type
LayoutTypeID::kTensorNCHW, // diff tensor layout
NumericTypeID::kF32, // grad tensor data type
LayoutTypeID::kTensorNCHW, // grad tensor layout
NumericTypeID::kF32, // dummy argument, not used.
LayoutTypeID::kTensorNCHW, // dummy argument, not used
NumericTypeID::kF32,
cutlass::conv::ConvType::kDepthwiseConvolution,
m_algo_param.threadblock_m,
m_algo_param.threadblock_n,
m_algo_param.threadblock_k,
m_algo_param.warp_m,
m_algo_param.warp_n,
m_algo_param.warp_k,
1,
1,
1,
cutlass::epilogue::EpilogueType::kLinearCombination, // no bias
m_algo_param.stage,
cutlass::conv::SpecialOptimizeDesc::NONE,
1,
1,
true};
return (void*)Singleton::get().operation_table.find_op(key);
}

bool ConvolutionBackwardFilterImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_available(
const SizeArgs& args) const {
#define RETURN_IF_FALSE(stmt_) \
if (!(stmt_)) \
return false;
RETURN_IF_FALSE(is_compute_capability_required(6, 1));
RETURN_IF_FALSE(
args.src_layout->is_contiguous() && args.diff_layout->is_contiguous() &&
args.grad_layout->is_contiguous());
using Param = param::Convolution;
using Format = Param::Format;
using Sparse = Param::Sparse;
using Mode = Param::Mode;
auto&& param = args.opr->param();
auto&& fm = args.grad_filter_meta;
RETURN_IF_FALSE(
param.format == Format::NCHW &&
args.src_layout->dtype.enumv() == DTypeEnum::Float32 &&
args.diff_layout->dtype.enumv() == DTypeEnum::Float32 &&
args.grad_layout->dtype.enumv() == DTypeEnum::Float32);
RETURN_IF_FALSE(param.sparse == Sparse::GROUP);
RETURN_IF_FALSE(param.mode == Mode::CROSS_CORRELATION);
// check if channelwise convolution
RETURN_IF_FALSE(fm.icpg == 1 && fm.ocpg == 1);
RETURN_IF_FALSE(param.dilate_h == 1 && param.dilate_w == 1);
const auto* op = get_available_op(args);
RETURN_IF_FALSE(op != nullptr);
return true;
#undef RETURN_IF_FALSE
}

void ConvolutionBackwardFilterImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::exec(
const ExecArgs& args) const {
auto&& param = args.opr->param();
auto&& fm = args.grad_filter_meta;
int hi = args.src_layout->operator[](2), wi = args.src_layout->operator[](3);
int n = args.diff_layout->operator[](0), ho = args.diff_layout->operator[](2),
wo = args.diff_layout->operator[](3);
int co = fm.group, ci = co, groups = co;
int fh = fm.spatial[0], fw = fm.spatial[1];
int sh = fm.stride[0], sw = fm.stride[1];
int ph = fm.padding[0], pw = fm.padding[1];
int dh = param.dilate_h, dw = param.dilate_w;

// check if channelwise convolution
megdnn_assert(fm.icpg == 1 && fm.ocpg == 1);
auto&& stream = cuda_stream(args.opr->handle());

float alpha = 1.f;
float beta = 0.f;

const Operation* op = (const Operation*)get_available_op(args);

cutlass::conv::Conv2dProblemSize problem_size{
n, hi, wi, ci, co, fh, fw, ho,
wo, ph, pw, sh, sw, dh, dw, cutlass::conv::Mode::kCrossCorrelation,
1, // split k slices, always 1
groups, // groups
};

cutlass::library::ConvolutionArguments conv_args{
problem_size,
args.src_tensor->raw_ptr(),
args.diff_tensor->raw_ptr(),
nullptr,
nullptr,
args.grad_tensor->raw_ptr(),
&alpha,
&beta,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr};

cutlass_check(op->run(&conv_args, nullptr, stream));

after_kernel_launch();
}

// vim: syntax=cpp.doxygen

+ 54
- 0
dnn/src/cuda/convolution/chanwise/bwd_large_filter.cu View File

@@ -0,0 +1,54 @@
/**
* \file dnn/src/cuda/conv_bias/chanwise/bwd_large_filter.cu
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/

#include "./kern.cuh"
#include "./kern_helper.cuh"
#include "cuda.h"
#include "cuda_fp16.h"
#include "src/cuda/convolution/chanwise/launch_config.cuh"
#include "src/cuda/fp16_help.cuh"

using namespace megdnn;
using namespace cuda;
using namespace convolution;
using namespace chanwise;

#include "src/cuda/conv_bias/chanwise/depthwise_large_filter_algo.cuh"

namespace megdnn {
namespace cuda {
namespace convolution {
namespace chanwise {

// =====================================bwd=====================================

template <>
void run_bwd_depthwise_large_filter(
float* dst, const float* src, const float* flt, const Param& param,
cudaStream_t stream) {
INSTANCE(float, float2, DepthwiseConv2dDirection::DIRECTION_BACKWARD)
}

#if CUDA_VERSION >= 9000
template <>
void run_bwd_depthwise_large_filter(
__half* dst, const __half* src, const __half* flt, const Param& param,
cudaStream_t stream) {
INSTANCE(__half, __half2, DepthwiseConv2dDirection::DIRECTION_BACKWARD)
}
#endif

} // namespace chanwise
} // namespace convolution
} // namespace cuda
} // namespace megdnn

// vim: syntax=cuda.doxygen

+ 13
- 6
dnn/src/cuda/convolution/chanwise/kern.cuh View File

@@ -27,8 +27,10 @@ namespace chanwise {
struct Param {
uint32_t batch, src_chl, src_h, src_w, chl_mul, flt_h, flt_w, out_h, out_w, pad_h,
pad_w, stride_h, stride_w, dilation_h, dilation_w;
bool is_compute_deafult;
#if MEGDNN_CC_HOST
static Param from_fwd_args(const ForwardSizeArgs& args) {
static Param from_fwd_args(
const ForwardSizeArgs& args, bool is_compute_deafult_ = true) {
#define U(v) static_cast<uint32_t>(v)
auto&& src = args.src_layout->shape;
auto&& dst = args.dst_layout->shape;
@@ -42,11 +44,12 @@ struct Param {
hw_pos = 1;
}
return {
U(src[0]), U(src[c_pos]), U(src[hw_pos]),
U(src[hw_pos + 1]), U(fm.ocpg), U(fm.spatial[0]),
U(fm.spatial[1]), U(dst[hw_pos]), U(dst[hw_pos + 1]),
U(fm.padding[0]), U(fm.padding[1]), U(fm.stride[0]),
U(fm.stride[1]), U(fm.dilation[0]), U(fm.dilation[1]),
U(src[0]), U(src[c_pos]), U(src[hw_pos]),
U(src[hw_pos + 1]), U(fm.ocpg), U(fm.spatial[0]),
U(fm.spatial[1]), U(dst[hw_pos]), U(dst[hw_pos + 1]),
U(fm.padding[0]), U(fm.padding[1]), U(fm.stride[0]),
U(fm.stride[1]), U(fm.dilation[0]), U(fm.dilation[1]),
is_compute_deafult_,
};
#undef U
}
@@ -64,6 +67,10 @@ void run_bwd_data(
cudaStream_t stream);

template <typename T>
void run_bwd_depthwise_large_filter(
T* dst, const T* src, const T* flt, const Param& param, cudaStream_t stream);

template <typename T>
void run_bwd_filter(
T* filter_grad, const T* src, const T* dst_grad, const Param& param,
cudaStream_t stream);


+ 33
- 0
dnn/src/cuda/convolution/opr_impl.cpp View File

@@ -115,6 +115,22 @@ ConvolutionBackwardDataImpl::Algorithm* ConvolutionBackwardDataImpl::
const AlgoAttribute& negative_attr) {
AlgoBase::SizeArgs args(this, filter, diff, grad);

//! choose for large kernel cases
size_t fh = args.filter_meta.spatial[0], fw = args.filter_meta.spatial[1];
size_t ho = diff[2], wo = diff[3];
const bool prefer_dnn_lk_implbmm = args.filter_meta.format == Param::Format::NCHW &&
ho <= 2 * fh && wo <= 2 * fw;
if (prefer_dnn_lk_implbmm) {
#if CUDA_VERSION >= 10020
if (sm_algo_pack.implbmm_nchw_hmma[0].is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes))
return &sm_algo_pack.implbmm_nchw_hmma[0];
#endif
if (sm_algo_pack.implbmm_nchw_fma[0].is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes))
return &sm_algo_pack.implbmm_nchw_fma[0];
}

if (args.filter_meta.group > 1 &&
sm_algo_pack.chanwise.is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
@@ -242,6 +258,23 @@ ConvolutionBackwardFilterImpl::Algorithm* ConvolutionBackwardFilterImpl::
const AlgoAttribute& negative_attr) {
AlgoBase::SizeArgs args(this, src, diff, grad);

//! choose for large kernel cases
size_t fh = args.grad_filter_meta.spatial[0], fw = args.grad_filter_meta.spatial[1];
size_t ho = diff[2], wo = diff[3];
const bool prefer_dnn_lk_implbmm =
args.grad_filter_meta.format == Param::Format::NCHW && ho <= 2 * fh &&
wo <= 2 * fw;
if (prefer_dnn_lk_implbmm) {
#if CUDA_VERSION >= 10020
if (sm_algo_pack.implbmm_nchw_hmma[0].is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes))
return &sm_algo_pack.implbmm_nchw_hmma[0];
#endif
if (sm_algo_pack.implbmm_nchw_fma[0].is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes))
return &sm_algo_pack.implbmm_nchw_fma[0];
}
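
The heuristic added in both hunks of this file routes a case to the implicit batched GEMM (large-kernel depthwise) algorithms when the output feature map is small relative to the filter, i.e. ho <= 2 * fh and wo <= 2 * fw in NCHW. A standalone restatement of the predicate, not part of the patch:

    #include <cstddef>
    #include <cstdio>

    // True when the depthwise large-kernel implicit-batched-GEMM path is preferred:
    // NCHW format and an output no larger than twice the filter in each spatial dim.
    bool prefer_large_kernel_implbmm(
            bool is_nchw, std::size_t ho, std::size_t wo, std::size_t fh,
            std::size_t fw) {
        return is_nchw && ho <= 2 * fh && wo <= 2 * fw;
    }

    int main() {
        // 32x32 output with a 31x31 filter -> take the large-kernel path
        std::printf("%d\n", prefer_large_kernel_implbmm(true, 32, 32, 31, 31));
        // 64x64 output with a 7x7 filter -> fall through to the existing algorithms
        std::printf("%d\n", prefer_large_kernel_implbmm(true, 64, 64, 7, 7));
        return 0;
    }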

if (args.grad_filter_meta.group > 1 &&
sm_algo_pack.chanwise.is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes)) {


+ 5
- 0
dnn/src/cuda/convolution/opr_impl.h View File

@@ -97,11 +97,14 @@ public:
class AlgoMatmul;
class AlgoChanwise;
class AlgoChanwiseSmall;
class AlgoDepthwiseLargeFilter;
class AlgoGroupConvGeneral;
class AlgoBFloat16;
class AlgoInt8NCHW4DotProdImplicitGemm;
class AlgoInt8NCHWDotProdImplicitGemm;
class AlgoInt8NHWCIMMAImplicitGemm;
class AlgoFloat32NCHWFMAImplicitBatchedGemm;
class AlgoFloat16NCHWHMMAImplicitBatchedGemm;

class AlgoPack;

@@ -154,6 +157,8 @@ public:
class AlgoChanwise;
class AlgoGroupConvGeneral;
class AlgoBFloat16;
class AlgoFloat32NCHWFMAImplicitBatchedGemm;
class AlgoFloat16NCHWHMMAImplicitBatchedGemm;

class AlgoPack;



+ 162
- 0
dnn/src/cuda/cutlass/convolution_operation.h View File

@@ -136,6 +136,15 @@ template <typename EpilogueOp, epilogue::EpilogueType type>
struct init_epilogue_param_;

template <typename EpilogueOp>
struct init_epilogue_param_<EpilogueOp, epilogue::EpilogueType::kLinearCombination> {
using ElementCompute = typename EpilogueOp::ElementCompute;
typename EpilogueOp::Params get(ConvolutionArguments const* conv_args) {
return {*static_cast<ElementCompute const*>(conv_args->alpha),
*static_cast<ElementCompute const*>(conv_args->beta)};
}
};
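
This new specialization just forwards alpha and beta to CUTLASS's LinearCombination epilogue, which computes D = alpha * accumulator + beta * C; with alpha = 1 and beta = 0, as the wgrad exec functions in this patch set them, the epilogue reduces to storing the accumulator. A scalar sketch of that combination (plain C++, no CUTLASS types):

    #include <cstdio>

    // Scalar model of a LinearCombination epilogue: d = alpha * accum + beta * c.
    float linear_combination(float alpha, float accum, float beta, float c) {
        return alpha * accum + beta * c;
    }

    int main() {
        // alpha = 1, beta = 0: the epilogue simply writes the accumulator out.
        std::printf("%f\n", linear_combination(1.f, 3.5f, 0.f, 123.f));  // 3.500000
        return 0;
    }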

template <typename EpilogueOp>
struct init_epilogue_param_<
EpilogueOp, epilogue::EpilogueType::kBiasAddLinearCombination> {
using ElementCompute = typename EpilogueOp::ElementCompute;
@@ -290,6 +299,159 @@ public:

///////////////////////////////////////////////////////////////////////////////////////////////////

/// We add a new template class to handle the convolution backward filter operation,
/// because the device-level convolution operator of backward filter is different from
/// the others (convolution forward and convolution backward data).
/// The description object is reused in this wrapper of convolution backward filter,
/// because we do not want to introduce another unnecessary structure.
/// TODO: Maybe the device-level operators in cutlass for convolution forward, backward
/// data and backward filter should be combined.
template <typename Operator_>
class ConvolutionBackwardFilterOperationBase : public Operation {
public:
using Operator = Operator_;
using ElementSrc = typename Operator::ElementSrc;
using LayoutSrc = typename Operator::LayoutSrc;
using ElementDiff = typename Operator::ElementDiff;
using LayoutDiff = typename Operator::LayoutDiff;
using ElementGrad = typename Operator::ElementGrad;
using LayoutGrad = typename Operator::LayoutGrad;
using ElementAccumulator = typename Operator::ElementAccumulator;

ConvolutionBackwardFilterOperationBase(char const* name = "unknown_convolution") {
m_description.name = name;
m_description.provider = Provider::kCUTLASS;
m_description.kind = OperationKind::kConvolution;
m_description.conv_op = Operator::kConvolutionalOperator;

m_description.tile_description.threadblock_shape = make_Coord(
Operator::ThreadblockShape::kM, Operator::ThreadblockShape::kN,
Operator::ThreadblockShape::kK);

m_description.tile_description.threadblock_stages = Operator::kStages;

m_description.tile_description.warp_count = make_Coord(
Operator::ConvolutionKernel::WarpCount::kM,
Operator::ConvolutionKernel::WarpCount::kN,
Operator::ConvolutionKernel::WarpCount::kK);

m_description.tile_description.math_instruction.instruction_shape = make_Coord(
Operator::InstructionShape::kM, Operator::InstructionShape::kN,
Operator::InstructionShape::kK);

m_description.tile_description.math_instruction.element_accumulator =
NumericTypeMap<ElementAccumulator>::kId;

m_description.tile_description.math_instruction.opcode_class =
OpcodeClassMap<typename Operator::OperatorClass>::kId;

m_description.tile_description.math_instruction.math_operation =
MathOperationMap<typename Operator::Operator>::kId;

m_description.tile_description.minimum_compute_capability =
ArchMap<typename Operator::ArchTag,
typename Operator::OperatorClass>::kMin;

m_description.tile_description.maximum_compute_capability =
ArchMap<typename Operator::ArchTag,
typename Operator::OperatorClass>::kMax;

/// src in description -> src in C++ template
m_description.src =
make_TensorDescription<ElementSrc, LayoutSrc>(Operator::kAlignmentSrc);
/// filter in description -> diff in C++ template
m_description.filter = make_TensorDescription<ElementDiff, LayoutDiff>(
Operator::kAlignmentDiff);
/// dst in description -> grad in C++ template
m_description.dst = make_TensorDescription<ElementGrad, LayoutGrad>(
Operator::kAlignmentGrad);
/// because the bias tensor is not used in the ConvolutionBackwardFilter operation,
/// the following tensor description is a dummy argument
m_description.bias = make_TensorDescription<ElementGrad, LayoutGrad>(
Operator::kAlignmentGrad);

m_description.convolution_type = Operator::kConvolutionType;
m_description.arch_tag = ArchTagMap<typename Operator::ArchTag>::kId;

m_description.epilogue_type = Operator::EpilogueOutputOp::kType;
m_description.epilogue_count = Operator::EpilogueOutputOp::kCount;

m_description.threadblock_swizzle =
ThreadblockSwizzleMap<typename Operator::ThreadblockSwizzle>::kId;

m_description.special_optimization = Operator::kSpecialOpt;
m_description.gemm_mode = Operator::kGemmMode;
/// ConvolutionBackwardFilter operation is only used for depthwise convolution,
/// so the option without_shared_load is always true
m_description.without_shared_load = true;
}

virtual OperationDescription const& description() const { return m_description; }

protected:
ConvolutionDescription m_description;
};

///////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Operator_>
class ConvolutionBackwardFilterOperation
: public ConvolutionBackwardFilterOperationBase<Operator_> {
public:
using Operator = Operator_;
using ElementSrc = typename Operator::ElementSrc;
using LayoutSrc = typename Operator::LayoutSrc;
using ElementDiff = typename Operator::ElementDiff;
using LayoutDiff = typename Operator::LayoutDiff;
using ElementGrad = typename Operator::ElementGrad;
using LayoutGrad = typename Operator::LayoutGrad;
using ElementAccumulator = typename Operator::ElementAccumulator;
using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute;

using OperatorArguments = typename Operator::Arguments;

ConvolutionBackwardFilterOperation(char const* name = "unknown_gemm")
: ConvolutionBackwardFilterOperationBase<Operator_>(name) {}

virtual Status run(
void const* arguments_ptr, void* device_workspace = nullptr,
cudaStream_t stream = nullptr) const {
cutlass::conv::Operator conv_op = this->m_description.conv_op;
ConvolutionArguments const* conv_args =
reinterpret_cast<ConvolutionArguments const*>(arguments_ptr);
const auto& ps = conv_args->problem_size;

OperatorArguments args;
args.problem_size = ps;
/// src in convolution arguments -> ref_src
args.ref_src = {
static_cast<ElementSrc*>(const_cast<void*>(conv_args->src)),
LayoutSrc::packed(implicit_gemm_tensor_b_extent(conv_op, ps))};
/// filter in convolution arguments -> ref_diff
args.ref_diff = {
static_cast<ElementDiff*>(const_cast<void*>(conv_args->filter)),
LayoutDiff::packed(implicit_gemm_tensor_a_extent(conv_op, ps))};
/// dst in convolution arguments -> ref_grad
args.ref_grad = {
static_cast<ElementGrad*>(conv_args->dst),
LayoutGrad::packed(implicit_gemm_tensor_c_extent(conv_op, ps))};

args.output_op = init_epilogue_param<typename Operator::EpilogueOutputOp>().get(
conv_args);

Operator op;
Status status = op.initialize(args, device_workspace);

if (status != Status::kSuccess) {
return status;
}

return op.run(stream);
}
};

///////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace library
} // namespace cutlass



+ 23
- 2
dnn/src/cuda/cutlass/initialize_all.cu View File

@@ -45,6 +45,11 @@ namespace library {

/////////////////////////////////////////////////////////////////////////////////////////////////
#if ((__CUDACC_VER_MAJOR__ > 10) || \
(__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))
#define CUTLASS_ARCH_MMA_SM70_SUPPORTED 1
#endif

#if ((__CUDACC_VER_MAJOR__ > 10) || \
(__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))
#define CUTLASS_ARCH_MMA_SM75_SUPPORTED 1
#endif
@@ -54,8 +59,16 @@ namespace library {
void initialize_all_gemm_simt_operations(Manifest& manifest);
void initialize_all_conv2d_simt_operations(Manifest& manifest);
void initialize_all_deconv_simt_operations(Manifest& manifest);
#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED
void initialize_all_dwconv2d_fprop_simt_operations(Manifest& manifest);
void initialize_all_dwconv2d_dgrad_simt_operations(Manifest& manifest);
void initialize_all_dwconv2d_wgrad_simt_operations(Manifest& manifest);
#if defined(CUTLASS_ARCH_MMA_SM70_SUPPORTED) && CUTLASS_ARCH_MMA_SM70_SUPPORTED
void initialize_all_gemm_tensorop884_operations(Manifest& manifest);
void initialize_all_dwconv2d_fprop_tensorop884_operations(Manifest& manifest);
void initialize_all_dwconv2d_dgrad_tensorop884_operations(Manifest& manifest);
void initialize_all_dwconv2d_wgrad_tensorop884_operations(Manifest& manifest);
#endif
#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED
void initialize_all_gemm_tensorop1688_operations(Manifest& manifest);
void initialize_all_conv2d_tensorop8816_operations(Manifest& manifest);
void initialize_all_conv2d_tensorop8832_operations(Manifest& manifest);
@@ -66,8 +79,16 @@ void initialize_all(Manifest& manifest) {
initialize_all_gemm_simt_operations(manifest);
initialize_all_conv2d_simt_operations(manifest);
initialize_all_deconv_simt_operations(manifest);
#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED
initialize_all_dwconv2d_fprop_simt_operations(manifest);
initialize_all_dwconv2d_dgrad_simt_operations(manifest);
initialize_all_dwconv2d_wgrad_simt_operations(manifest);
#if defined(CUTLASS_ARCH_MMA_SM70_SUPPORTED) && CUTLASS_ARCH_MMA_SM70_SUPPORTED
initialize_all_gemm_tensorop884_operations(manifest);
initialize_all_dwconv2d_fprop_tensorop884_operations(manifest);
initialize_all_dwconv2d_dgrad_tensorop884_operations(manifest);
initialize_all_dwconv2d_wgrad_tensorop884_operations(manifest);
#endif
#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED
initialize_all_gemm_tensorop1688_operations(manifest);
initialize_all_conv2d_tensorop8816_operations(manifest);
initialize_all_conv2d_tensorop8832_operations(manifest);


+ 3
- 0
dnn/src/cuda/cutlass/library.h View File

@@ -223,6 +223,9 @@ enum class ThreadblockSwizzleID {
kConvolutionFpropTrans,
kConvolutionDgradNCxHWx,
kConvolutionDgradTrans,
kDepthwiseConvolutionFprop,
kDepthwiseConvolutionDgrad,
kDepthwiseConvolutionWgrad,
kInvalid
};



+ 21
- 0
dnn/src/cuda/cutlass/library_internal.h View File

@@ -570,6 +570,27 @@ struct ThreadblockSwizzleMap<
ThreadblockSwizzleID::kConvolutionDgradTrans;
};

template <>
struct ThreadblockSwizzleMap<
conv::threadblock::DepthwiseConvolutionFpropThreadblockSwizzle> {
static ThreadblockSwizzleID const kId =
ThreadblockSwizzleID::kDepthwiseConvolutionFprop;
};

template <>
struct ThreadblockSwizzleMap<
conv::threadblock::DepthwiseConvolutionDgradThreadblockSwizzle> {
static ThreadblockSwizzleID const kId =
ThreadblockSwizzleID::kDepthwiseConvolutionDgrad;
};

template <>
struct ThreadblockSwizzleMap<
conv::threadblock::DepthwiseConvolutionWgradThreadblockSwizzle> {
static ThreadblockSwizzleID const kId =
ThreadblockSwizzleID::kDepthwiseConvolutionWgrad;
};

/////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Element, typename Layout>


+ 4
- 0
dnn/src/cuda/cutlass/operation_table.cpp View File

@@ -99,6 +99,8 @@ ConvolutionKey get_convolution_key_from_desc(const ConvolutionDescription& desc)
key.layout_dst = desc.dst.layout;
key.element_bias = desc.bias.element;
key.layout_bias = desc.bias.layout;
key.element_accumulator =
desc.tile_description.math_instruction.element_accumulator;

key.convolution_type = desc.convolution_type;

@@ -124,6 +126,8 @@ ConvolutionKey get_convolution_key_from_desc(const ConvolutionDescription& desc)

key.stages = desc.tile_description.threadblock_stages;
key.special_optimization = desc.special_optimization;
key.alignment_src = desc.src.alignment;
key.alignment_filter = desc.filter.alignment;
key.without_shared_load = desc.without_shared_load;

return key;


+ 14
- 1
dnn/src/cuda/cutlass/operation_table.h View File

@@ -188,6 +188,7 @@ struct ConvolutionKey {
library::LayoutTypeID layout_dst;
library::NumericTypeID element_bias;
library::LayoutTypeID layout_bias;
NumericTypeID element_accumulator;

conv::ConvType convolution_type;

@@ -206,6 +207,10 @@ struct ConvolutionKey {
epilogue::EpilogueType epilogue_type;
int stages;
conv::SpecialOptimizeDesc special_optimization;

int alignment_src;
int alignment_filter;

bool without_shared_load;

inline bool operator==(ConvolutionKey const& rhs) const {
@@ -215,6 +220,7 @@ struct ConvolutionKey {
(layout_filter == rhs.layout_filter) &&
(element_dst == rhs.element_dst) && (layout_dst == rhs.layout_dst) &&
(element_bias == rhs.element_bias) && (layout_bias == rhs.layout_bias) &&
(element_accumulator == rhs.element_accumulator) &&
(convolution_type == rhs.convolution_type) &&
(threadblock_shape_m == rhs.threadblock_shape_m) &&
(threadblock_shape_n == rhs.threadblock_shape_n) &&
@@ -227,6 +233,8 @@ struct ConvolutionKey {
(instruction_shape_k == rhs.instruction_shape_k) &&
(epilogue_type == rhs.epilogue_type) && (stages == rhs.stages) &&
(special_optimization == rhs.special_optimization) &&
(alignment_src == rhs.alignment_src) &&
(alignment_filter == rhs.alignment_filter) &&
(without_shared_load == rhs.without_shared_load);
}

@@ -254,6 +262,7 @@ struct ConvolutionKey {
"\n layout_dst: " + to_string(layout_dst) +
"\n element_bias: " + to_string(element_bias) +
"\n layout_bias: " + to_string(layout_bias) +
"\n element_accumulator: " + to_string(element_accumulator) +
"\n convolution_type: " + to_string(convolution_type) +
"\n threadblock_shape: " + threadblock_shape_str +
"\n warp_shape: " + warp_shape_str +
@@ -261,6 +270,8 @@ struct ConvolutionKey {
"\n epilogue_type: " + to_string(epilogue_type) +
"\n stages: " + std::to_string(stages) +
"\n special_optimization: " + to_string(special_optimization) +
"\n alignment_src: " + std::to_string(alignment_src) +
"\n alignment_filter: " + std::to_string(alignment_filter) +
"\n without_shared_load: " + to_string(without_shared_load) + "\n}";
}
};
@@ -269,7 +280,6 @@ struct ConvolutionKeyHasher {
inline size_t operator()(ConvolutionKey const& key) const {
return Hash()
.update(&key.conv_op, sizeof(key.conv_op))
.update(&key.conv_op, sizeof(key.conv_op))
.update(&key.element_src, sizeof(key.element_src))
.update(&key.layout_src, sizeof(key.layout_src))
.update(&key.element_filter, sizeof(key.element_filter))
@@ -278,6 +288,7 @@ struct ConvolutionKeyHasher {
.update(&key.layout_dst, sizeof(key.layout_dst))
.update(&key.element_bias, sizeof(key.element_bias))
.update(&key.layout_bias, sizeof(key.layout_bias))
.update(&key.element_accumulator, sizeof(key.element_accumulator))
.update(&key.convolution_type, sizeof(key.convolution_type))
.update(&key.threadblock_shape_m, sizeof(key.threadblock_shape_m))
.update(&key.threadblock_shape_n, sizeof(key.threadblock_shape_n))
@@ -291,6 +302,8 @@ struct ConvolutionKeyHasher {
.update(&key.epilogue_type, sizeof(key.epilogue_type))
.update(&key.stages, sizeof(key.stages))
.update(&key.special_optimization, sizeof(key.special_optimization))
.update(&key.alignment_src, sizeof(key.alignment_src))
.update(&key.alignment_filter, sizeof(key.alignment_filter))
.update(&key.without_shared_load, sizeof(key.without_shared_load))
.digest();
}


+ 2
- 0
dnn/src/cuda/cutlass/util.cu View File

@@ -1322,6 +1322,8 @@ static struct {
{"batch_convolution", "BatchConvolution", conv::ConvType::kBatchConvolution},
{"local", "Local", conv::ConvType::kLocal},
{"local_share", "LocalShare", conv::ConvType::kLocalShare},
{"depthwise_convolution", "DepthwiseConvolution",
conv::ConvType::kDepthwiseConvolution},
};

/// Converts a ConvType enumerant to a string


+ 15
- 0
dnn/src/cuda/fp16_help.cuh View File

@@ -45,6 +45,21 @@ fma2(const __half2 a, const __half2 b, const __half2 c) {
#endif
}

__device__ __forceinline__ __half2 hadd2(const __half2 a, const __half2 b) {
#if __CUDA_ARCH__ >= 530
return __hadd2(a, b);
#else
return {__float2half(__half2float(a.x) + __half2float(b.x)),
__float2half(__half2float(a.y) + __half2float(b.y))};
#endif
}

__device__ __forceinline__ float2
fma2(const __half2 a, const __half2 b, const float2 c) {
return {__half2float(a.x) * __half2float(b.x) + c.x,
__half2float(a.y) * __half2float(b.y) + c.y};
}

#endif // CUDA_VERSION >= 9000

} // namespace cuda


+ 14
- 9
dnn/src/cuda/matrix_mul/algos.cpp View File

@@ -44,7 +44,7 @@ MatrixMulForwardImpl::AlgoPack::AlgoPack() {
for (auto&& algo : simt_float32_gemv_batched_strided) {
all_algos.push_back(&algo);
}
#if CUDA_VERSION >= 10020
#if CUDA_VERSION >= 10010
for (auto&& algo : tensorop_float16) {
all_algos.push_back(&algo);
}
@@ -113,21 +113,26 @@ void MatrixMulForwardImpl::AlgoPack::fill_cutlass_algos() {
simt_float32_gemv_batched_strided.emplace_back(128);
simt_float32_gemv_batched_strided.emplace_back(64);
simt_float32_gemv_batched_strided.emplace_back(32);
#define FOREACH_CUTLASS_MATMUL_F16_SHAPES(cb) \
cb(256, 128, 32, 64, 64, 32, 8, 8, 4); \
cb(128, 256, 32, 64, 64, 32, 8, 8, 4); \
cb(128, 128, 32, 64, 64, 32, 8, 8, 4); \
cb(256, 128, 32, 64, 64, 32, 16, 8, 8); \
cb(128, 256, 32, 64, 64, 32, 16, 8, 8); \
#define FOREACH_CUTLASS_MATMUL_MMA_SM70_SHAPES(cb) \
cb(256, 128, 32, 64, 64, 32, 8, 8, 4); \
cb(128, 256, 32, 64, 64, 32, 8, 8, 4); \
cb(128, 128, 32, 64, 64, 32, 8, 8, 4);
#define FOREACH_CUTLASS_MATMUL_MMA_SM75_SHAPES(cb) \
cb(256, 128, 32, 64, 64, 32, 16, 8, 8); \
cb(128, 256, 32, 64, 64, 32, 16, 8, 8); \
cb(128, 128, 32, 64, 64, 32, 16, 8, 8);
#define cb(...) \
tensorop_float16.emplace_back(AlgoParam{__VA_ARGS__}); \
tensorop_float16_split_k.emplace_back(AlgoParam{__VA_ARGS__});
#if CUDA_VERSION >= 10010
FOREACH_CUTLASS_MATMUL_MMA_SM70_SHAPES(cb)
#endif
#if CUDA_VERSION >= 10020
FOREACH_CUTLASS_MATMUL_F16_SHAPES(cb)
FOREACH_CUTLASS_MATMUL_MMA_SM75_SHAPES(cb)
#endif
#undef cb
#undef FOREACH_CUTLASS_MATMUL_F16_SHAPES
#undef FOREACH_CUTLASS_MATMUL_MMA_SM70_SHAPES
#undef FOREACH_CUTLASS_MATMUL_MMA_SM75_SHAPES
}
#endif



+ 2
- 2
dnn/src/cuda/matrix_mul/algos.h View File

@@ -350,7 +350,7 @@ private:
std::string m_name;
};

#if CUDA_VERSION >= 10020
#if CUDA_VERSION >= 10010
class MatrixMulForwardImpl::AlgoFloat16TensorOp final
: public AlgoCutlassMatrixMulBase {
public:
@@ -418,7 +418,7 @@ public:
std::vector<AlgoFloat32SIMT> simt_float32;
std::vector<AlgoFloat32SIMTSplitK> simt_float32_split_k;
std::vector<AlgoFloat32SIMTGemvBatchedStrided> simt_float32_gemv_batched_strided;
#if CUDA_VERSION >= 10020
#if CUDA_VERSION >= 10010
std::vector<AlgoFloat16TensorOp> tensorop_float16;
std::vector<AlgoFloat16TensorOpSplitK> tensorop_float16_split_k;
#endif


+ 1
- 1
dnn/src/cuda/matrix_mul/cutlass_float16_tensorop.cpp View File

@@ -15,7 +15,7 @@
#include "src/cuda/matrix_mul/algos.h"
#include "src/cuda/utils.h"

#if CUDA_VERSION >= 10020
#if CUDA_VERSION >= 10010
using namespace megdnn;
using namespace cuda;



+ 1
- 1
dnn/src/cuda/matrix_mul/cutlass_float16_tensorop_split_k.cpp View File

@@ -15,7 +15,7 @@
#include "src/cuda/matrix_mul/algos.h"
#include "src/cuda/utils.h"

#if CUDA_VERSION >= 10020
#if CUDA_VERSION >= 10010
using namespace megdnn;
using namespace cuda;



+ 2
- 0
dnn/src/cuda/matrix_mul/opr_impl.h View File

@@ -46,9 +46,11 @@ public:
class AlgoFloat32SIMT;
class AlgoFloat32SIMTSplitK;
class AlgoFloat32SIMTGemvBatchedStrided;
#if CUDA_VERSION >= 10010
class AlgoFloat16TensorOp;
class AlgoFloat16TensorOpSplitK;
#endif
#endif
class AlgoPack;

static const AlgoPack& algo_pack() { return sm_algo_pack; }


+ 1
- 0
dnn/src/cuda/padding/opr_impl.cpp View File

@@ -35,6 +35,7 @@ void PaddingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
param().padding_val, stream); \
}
MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
MEGDNN_FOREACH_QUANTIZED_DTYPE(cb)
#undef cb
}



+ 3
- 1
dnn/src/cuda/padding/padding.cu View File

@@ -60,7 +60,8 @@ __global__ void paddingConst_kernel(
params.src_stride[dim].divisor();
*/
}
dst[out_index] = in_src_valid_area ? src[in_index] : padding_val;
dst[out_index] =
in_src_valid_area ? src[in_index] : static_cast<T>(padding_val);
}
}

@@ -256,6 +257,7 @@ void padding_backward_proxy(
const float_t padding_val, cudaStream_t stream);
#define cb(DType) INST(typename DTypeTrait<DType>::ctype)
MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
MEGDNN_FOREACH_QUANTIZED_DTYPE(cb)
#undef cb
#undef INST



+ 6
- 3
dnn/src/naive/padding/opr_impl.cpp View File

@@ -171,7 +171,7 @@ void PaddingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
switch (param().padding_mode) {
case param::Padding::PaddingMode::CONSTANT:
#define cb(DType) \
if (src.layout.dtype == DType()) { \
if (src.layout.dtype.enumv() == DTypeTrait<DType>::enumv) { \
using T = typename DTypeTrait<DType>::ctype; \
MEGDNN_DISPATCH_CPU_KERN_OPR(exec_const_internal<T>( \
src.layout.ndim, n, src.ptr<T>(), dst.ptr<T>(), params, \
@@ -179,28 +179,31 @@ void PaddingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
return; \
}
MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
MEGDNN_FOREACH_QUANTIZED_DTYPE(cb)
#undef cb
break;
case param::Padding::PaddingMode::REPLICATE:
#define cb(DType) \
if (src.layout.dtype == DType()) { \
if (src.layout.dtype.enumv() == DTypeTrait<DType>::enumv) { \
using T = typename DTypeTrait<DType>::ctype; \
MEGDNN_DISPATCH_CPU_KERN_OPR(exec_replicate_internal<T>( \
src.layout.ndim, n, src.ptr<T>(), dst.ptr<T>(), params)); \
return; \
}
MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
MEGDNN_FOREACH_QUANTIZED_DTYPE(cb)
#undef cb
break;
case param::Padding::PaddingMode::REFLECT:
#define cb(DType) \
if (src.layout.dtype == DType()) { \
if (src.layout.dtype.enumv() == DTypeTrait<DType>::enumv) { \
using T = typename DTypeTrait<DType>::ctype; \
MEGDNN_DISPATCH_CPU_KERN_OPR(exec_reflect_internal<T>( \
src.layout.ndim, n, src.ptr<T>(), dst.ptr<T>(), params)); \
return; \
}
MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
MEGDNN_FOREACH_QUANTIZED_DTYPE(cb)
#undef cb
break;
default:
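
The dispatch condition in the three cases above changes from comparing full DType objects to comparing their enum values, presumably so that quantized dtypes (which carry scale/zero-point parameters) still match the kernel selected for their kind. A minimal sketch of the difference with a hypothetical parameterized dtype, not the MegDNN DType class:

    #include <cstdio>

    enum class DTypeEnum { Float32, QuantizedS8 };

    // Hypothetical parameterized dtype: full equality also compares the parameters,
    // while the enum identifies only the kind.
    struct DType {
        DTypeEnum enumv;
        float scale;  // only meaningful for quantized kinds
        bool operator==(const DType& rhs) const {
            return enumv == rhs.enumv && scale == rhs.scale;
        }
    };

    int main() {
        DType src{DTypeEnum::QuantizedS8, 0.25f};  // tensor's dtype
        DType probe{DTypeEnum::QuantizedS8, 0.f};  // dispatch probe

        std::printf("full compare: %d\n", src == probe);              // 0: misses
        std::printf("enum compare: %d\n", src.enumv == probe.enumv);  // 1: hits
        return 0;
    }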


+ 1
- 0
dnn/test/common/checker.h View File

@@ -569,6 +569,7 @@ public:
});
return ret;
}
megdnn_assert(false, "Expected algo not found: %s\n", policy_name.name.c_str());
return ret;
}



+ 429
- 1
dnn/test/cuda/chanwise_convolution.cpp View File

@@ -20,6 +20,7 @@
#include "test/common/workspace_wrapper.h"
#include "test/cuda/benchmark.h"
#include "test/cuda/fixture.h"
#include "test/cuda/utils.h"

#include <cuda_profiler_api.h>
#include <cuda_runtime_api.h>
@@ -38,8 +39,10 @@ bool check_need_full_bench() {
}
#endif

Convolution::Param gconv_param(Convolution::Param p) {
Convolution::Param gconv_param(Convolution::Param p, bool io16xc32 = false) {
p.sparse = Convolution::Param::Sparse::GROUP;
if (io16xc32)
p.compute_mode = Convolution::Param::ComputeMode::FLOAT32;
return p;
}

@@ -421,6 +424,209 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_FILTER) {
}
}

namespace {
template <typename Op>
struct AlgoCheckerMaker {
static auto make(const char* name, bool* require_algo) {
return AlgoChecker<Op>(name, require_algo);
}
};

template <>
struct AlgoCheckerMaker<ConvolutionForward> {
static auto make(const char* name, bool* require_algo) {
return AlgoChecker<ConvolutionForward>(
ExecutionPolicyAlgoName{
"DEFAULT",
{{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
name, {})
.c_str(),
{}}}},
require_algo);
}
};

template <typename Op>
void check_chanwise(DType io_type, DType comp_type, Handle* handle, const char* name) {
Checker<Op> checker(handle);
bool require_algo = false;
checker.set_before_exec_callback(AlgoCheckerMaker<Op>::make(name, &require_algo));
checker.set_dtype(0, io_type).set_dtype(1, io_type).set_dtype(2, io_type);
bool io16xc32 = false;
if (io_type == dtype::Float16()) {
if (comp_type == dtype::Float16()) {
checker.set_epsilon(1e-1);
} else {
io16xc32 = true;
}
}
// dispatch testcase by operation
if (std::is_same<Op, ConvolutionForward>::value) {
// align 8
checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
.execs({{8, 2, 16, 16}, {2, 1, 1, 15, 15}, {}});
// align 1
checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
.execs({{8, 2, 15, 15}, {2, 1, 1, 15, 15}, {}});
// align 2
checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
.execs({{8, 2, 14, 14}, {2, 1, 1, 15, 15}, {}});
// custom padding
checker.set_param(gconv_param({M, 3, 3, 1, 1}, io16xc32))
.execs({{8, 2, 16, 16}, {2, 1, 1, 15, 15}, {}});
// custom stride
checker.set_param(gconv_param({M, 7, 7, 2, 2}, io16xc32))
.execs({{8, 2, 16, 16}, {2, 1, 1, 15, 15}, {}});
} else if (std::is_same<Op, ConvolutionBackwardData>::value) {
// align 8
checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
.execs({{2, 1, 1, 15, 15}, {8, 2, 16, 16}, {8, 2, 16, 16}});
// align 1
checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
.execs({{2, 1, 1, 15, 15}, {8, 2, 15, 15}, {8, 2, 15, 15}});
// align 2
checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
.execs({{2, 1, 1, 15, 15}, {8, 2, 14, 14}, {8, 2, 14, 14}});
// custom padding
checker.set_param(gconv_param({M, 3, 3, 1, 1}, io16xc32))
.execs({{2, 1, 1, 15, 15}, {8, 2, 8, 8}, {8, 2, 16, 16}});
// custom stride
checker.set_param(gconv_param({M, 7, 7, 2, 2}, io16xc32))
.execs({{2, 1, 1, 15, 15}, {8, 2, 7, 7}, {8, 2, 14, 14}});
} else if (std::is_same<Op, ConvolutionBackwardFilter>::value) {
// align 8
checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
.execs({{8, 2, 16, 16}, {8, 2, 16, 16}, {2, 1, 1, 15, 15}});
// align 1
checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
.execs({{8, 2, 15, 15}, {8, 2, 15, 15}, {2, 1, 1, 15, 15}});
// align 2
checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
.execs({{8, 2, 14, 14}, {8, 2, 14, 14}, {2, 1, 1, 15, 15}});
// custom padding
checker.set_param(gconv_param({M, 3, 3, 1, 1}, io16xc32))
.execs({{8, 2, 16, 16}, {8, 2, 8, 8}, {2, 1, 1, 15, 15}});
// custom stride
checker.set_param(gconv_param({M, 7, 7, 2, 2}, io16xc32))
.execs({{8, 2, 14, 14}, {8, 2, 7, 7}, {2, 1, 1, 15, 15}});
}
}
} // namespace

#define MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb) \
cb(1, 128, 128, 8, 32, 64, 8); \
cb(2, 128, 64, 8, 64, 32, 8); \
cb(3, 128, 32, 8, 64, 32, 8); \
cb(4, 64, 128, 8, 32, 64, 8); \
cb(5, 32, 128, 8, 32, 64, 8); \
cb(6, 64, 64, 8, 32, 64, 8); \
cb(7, 32, 64, 8, 32, 64, 8); \
cb(8, 32, 32, 8, 32, 32, 8); \
cb(9, 64, 32, 8, 64, 32, 8);

#define cb(tag, tbm, tbn, tbk, wm, wn, wk) \
TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_FMA_##tag) { \
require_compute_capability(6, 1); \
check_chanwise<ConvolutionForward>( \
dtype::Float32(), dtype::Float32(), handle_cuda(), \
"FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
"_" #wm "X" #wn "X" #wk "_2stage"); \
}

MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb)

#undef cb

#define cb(tag, tbm, tbn, tbk, wm, wn, wk) \
TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_CUTLASS_FMA_##tag) { \
require_compute_capability(6, 1); \
check_chanwise<ConvolutionBackwardData>( \
dtype::Float32(), dtype::Float32(), handle_cuda(), \
"FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
"_" #wm "X" #wn "X" #wk "_2stage"); \
}

MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb)

#undef cb

#define cb(tag, tbm, tbn, tbk, wm, wn, wk) \
TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_FILTER_CUTLASS_FMA_##tag) { \
require_compute_capability(6, 1); \
check_chanwise<ConvolutionBackwardFilter>( \
dtype::Float32(), dtype::Float32(), handle_cuda(), \
"FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
"_" #wm "X" #wn "X" #wk "_2stage"); \
}

MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb)

#undef cb

#undef MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL

#if CUDA_VERSION >= 10010
#define MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb) \
cb(1, 128, 128, 32, 32, 32, 32); \
cb(2, 128, 256, 32, 64, 64, 32); \
cb(3, 128, 64, 32, 32, 32, 32); \
cb(4, 64, 128, 32, 32, 32, 32); \
cb(5, 64, 64, 32, 32, 32, 32);
#else
// hmma instructions need cuda version >= 10.1; disable hmma testcases in this path
#define MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb)
#endif

// check both io16c16 (pure fp16) and io16xc32 (fp16 io with fp32 compute)
#define cb(tag, tbm, tbn, tbk, wm, wn, wk) \
TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_HMMA_##tag) { \
require_compute_capability(7, 0); \
check_chanwise<ConvolutionForward>( \
dtype::Float16(), dtype::Float16(), handle_cuda(), \
"FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
"_" #wm "X" #wn "X" #wk "_2stage"); \
check_chanwise<ConvolutionForward>( \
dtype::Float16(), dtype::Float32(), handle_cuda(), \
"FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
"_" #wm "X" #wn "X" #wk "_2stage"); \
}

MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb)

#undef cb

#define cb(tag, tbm, tbn, tbk, wm, wn, wk) \
TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_CUTLASS_HMMA_##tag) { \
require_compute_capability(7, 0); \
check_chanwise<ConvolutionBackwardData>( \
dtype::Float16(), dtype::Float16(), handle_cuda(), \
"FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
"_" #wm "X" #wn "X" #wk "_mma8X8X4_2stage"); \
check_chanwise<ConvolutionBackwardData>( \
dtype::Float16(), dtype::Float32(), handle_cuda(), \
"FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
"_" #wm "X" #wn "X" #wk "_mma8X8X4_2stage"); \
}

MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb)

#undef cb

#define cb(tag, tbm, tbn, tbk, wm, wn, wk) \
TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_FILTER_CUTLASS_HMMA_##tag) { \
require_compute_capability(7, 0); \
check_chanwise<ConvolutionBackwardFilter>(                              \
dtype::Float16(), dtype::Float32(), handle_cuda(), \
"FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
"_" #wm "X" #wn "X" #wk "_mma8X8X4_2stage"); \
}

MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb)

#undef cb

#undef MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL

#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_BENCH_CHECK) {
auto handle = handle_cuda();
@@ -1123,6 +1329,228 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BWD_FILTER) {
// clang-format on
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_LARGE_KERNEL) {
CUBenchmarker<ConvolutionForward> bencher(handle_cuda());
size_t RUNS = 100;
bencher.set_display(false).set_times(RUNS);
std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
new OprProxy<ConvolutionForward>{true}};
bencher.set_proxy(proxy);

Convolution::Param param;
param.format = ConvBias::Param::Format::NCHW;
param.sparse = Convolution::Param::Sparse::GROUP;
NormalRNG rng;

auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) {
param.pad_h = f / 2;
param.pad_w = f / 2;
param.stride_h = s;
param.stride_w = s;
param.compute_mode = param::Convolution::ComputeMode::DEFAULT;

TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};

TensorLayout dst_layout;
auto opr = handle_cuda()->create_operator<Convolution>();
opr->param() = param;
opr->deduce_layout(
{src, dtype::Float32()}, {filter, dtype::Float32()}, dst_layout);
float bandwidth = static_cast<float>(
src.total_nr_elems() + filter.total_nr_elems() +
dst_layout.total_nr_elems()) /
(1024 * 1024 * 1024) * 1e3;

bencher.set_param(param)
.set_dtype(0, dtype::Float32())
.set_dtype(1, dtype::Float32())
.set_dtype(2, dtype::Float32())
.set_rng(0, &rng)
.set_rng(1, &rng);
bencher.proxy()->target_execution_policy = {};
auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS;

bencher.set_param(param)
.set_dtype(0, dtype::Float16())
.set_dtype(1, dtype::Float16())
.set_dtype(2, dtype::Float16())
.set_rng(0, &rng)
.set_rng(1, &rng);
bencher.proxy()->target_execution_policy = {};
auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS;

bencher.proxy()->target_execution_policy.algo.reset();
param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
bencher.set_param(param);
auto time_in_ms_pseudo_fp16 = bencher.execs({src, filter, {}}) / RUNS;

printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
"float16: %.2fms %.2fGB/s "
"pseudo float16: %.2fms %.2fGB/s "
"speedup: "
"%0.2f (fp16/fp32) %.2f (fp16/pseudo fp16)\n",
s, src.to_string().c_str(), filter.to_string().c_str(), time_in_ms_fp32,
bandwidth * 4 / time_in_ms_fp32, time_in_ms_fp16,
bandwidth * 2 / time_in_ms_fp16, time_in_ms_pseudo_fp16,
bandwidth * 2 / time_in_ms_pseudo_fp16, time_in_ms_fp32 / time_in_ms_fp16,
time_in_ms_pseudo_fp16 / time_in_ms_fp16);
};

// clang-format off
for (size_t b : {32, 64})
for (size_t f : {3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}) {
run(b, 384, 32, 32, f, 1);
run(b, 384, 64, 64, f, 1);
}
// clang-format on
}
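// How the GB/s figures printed above are derived, as a sketch: `bandwidth` holds
// (src + filter + dst element counts) / 2^30 * 1e3, so
//
//     effective GiB/s = bandwidth * bytes_per_element / time_in_ms
//
// which is why the fp32 numbers are scaled by 4 and the (pseudo) fp16 numbers by 2 before
// dividing by the measured time.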

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BACKWARD_DATA_LARGE_KERNEL) {
CUBenchmarker<ConvolutionBackwardData> bencher(handle_cuda());
size_t RUNS = 100;
bencher.set_display(false).set_times(RUNS);
std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
new OprProxy<ConvolutionBackwardData>{true}};
bencher.set_proxy(proxy);

Convolution::Param param;
param.format = ConvBias::Param::Format::NCHW;
param.sparse = Convolution::Param::Sparse::GROUP;
NormalRNG rng;

auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) {
param.pad_h = f / 2;
param.pad_w = f / 2;
param.stride_h = s;
param.stride_w = s;
param.compute_mode = param::Convolution::ComputeMode::DEFAULT;

TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};

TensorLayout dst_layout;
auto opr = handle_cuda()->create_operator<Convolution>();
opr->param() = param;
opr->deduce_layout(
{src, dtype::Float32()}, {filter, dtype::Float32()}, dst_layout);
float bandwidth = static_cast<float>(
src.total_nr_elems() + filter.total_nr_elems() +
dst_layout.total_nr_elems()) /
(1024 * 1024 * 1024) * 1e3;

bencher.set_param(param)
.set_dtype(0, dtype::Float32())
.set_dtype(1, dtype::Float32())
.set_dtype(2, dtype::Float32())
.set_rng(0, &rng)
.set_rng(1, &rng);
bencher.proxy()->target_execution_policy = {};
auto time_in_ms_fp32 = bencher.execs({filter, src, src}) / RUNS;

bencher.set_param(param)
.set_dtype(0, dtype::Float16())
.set_dtype(1, dtype::Float16())
.set_dtype(2, dtype::Float16())
.set_rng(0, &rng)
.set_rng(1, &rng);
bencher.proxy()->target_execution_policy = {};
auto time_in_ms_fp16 = bencher.execs({filter, src, src}) / RUNS;

bencher.proxy()->target_execution_policy.algo.reset();
param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
bencher.set_param(param);
auto time_in_ms_pseudo_fp16 = bencher.execs({filter, src, src}) / RUNS;

printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
"float16: %.2fms %.2fGB/s "
"pseudo float16: %.2fms %.2fGB/s "
"speedup: "
"%0.2f (fp16/fp32) %.2f (fp16/pseudo fp16)\n",
s, src.to_string().c_str(), filter.to_string().c_str(), time_in_ms_fp32,
bandwidth * 4 / time_in_ms_fp32, time_in_ms_fp16,
bandwidth * 2 / time_in_ms_fp16, time_in_ms_pseudo_fp16,
bandwidth * 2 / time_in_ms_pseudo_fp16, time_in_ms_fp32 / time_in_ms_fp16,
time_in_ms_pseudo_fp16 / time_in_ms_fp16);
};

// clang-format off
for (size_t b : {32, 64})
for (size_t f : {3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}) {
run(b, 384, 32, 32, f, 1);
run(b, 384, 64, 64, f, 1);
}
// clang-format on
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BACKWARD_FILTER_LARGE_KERNEL) {
CUBenchmarker<ConvolutionBackwardFilter> bencher(handle_cuda());
size_t RUNS = 100;
bencher.set_display(false).set_times(RUNS);
std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
new OprProxy<ConvolutionBackwardFilter>{true}};
bencher.set_proxy(proxy);

Convolution::Param param;
param.format = ConvBias::Param::Format::NCHW;
param.sparse = Convolution::Param::Sparse::GROUP;
NormalRNG rng;

auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) {
param.pad_h = f / 2;
param.pad_w = f / 2;
param.stride_h = s;
param.stride_w = s;
param.compute_mode = param::Convolution::ComputeMode::DEFAULT;

TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};

TensorLayout dst_layout;
auto opr = handle_cuda()->create_operator<Convolution>();
opr->param() = param;
opr->deduce_layout(
{src, dtype::Float32()}, {filter, dtype::Float32()}, dst_layout);
float bandwidth = static_cast<float>(
src.total_nr_elems() + filter.total_nr_elems() +
dst_layout.total_nr_elems()) /
(1024 * 1024 * 1024) * 1e3;

bencher.set_param(param)
.set_dtype(0, dtype::Float32())
.set_dtype(1, dtype::Float32())
.set_dtype(2, dtype::Float32())
.set_rng(0, &rng)
.set_rng(1, &rng);
bencher.proxy()->target_execution_policy = {};
auto time_in_ms_fp32 = bencher.execs({src, src, filter}) / RUNS;

bencher.set_param(param)
.set_dtype(0, dtype::Float16())
.set_dtype(1, dtype::Float16())
.set_dtype(2, dtype::Float16())
.set_rng(0, &rng)
.set_rng(1, &rng);
bencher.proxy()->target_execution_policy = {};
param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
bencher.set_param(param);
auto time_in_ms_pseudo_fp16 = bencher.execs({src, src, filter}) / RUNS;

printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
"pseudo float16: %.2fms %.2fGB/s "
"speedup: "
"%0.2f (fp16/fp32) \n",
s, src.to_string().c_str(), filter.to_string().c_str(), time_in_ms_fp32,
bandwidth * 4 / time_in_ms_fp32, time_in_ms_pseudo_fp16,
bandwidth * 2 / time_in_ms_pseudo_fp16,
time_in_ms_fp32 / time_in_ms_pseudo_fp16);
};

// clang-format off
for (size_t b : {32, 64})
for (size_t f : {3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}) {
run(b, 384, 32, 32, f, 1);
run(b, 384, 64, 64, f, 1);
}
// clang-format on
}
#endif

// vim: syntax=cpp.doxygen

+ 202
- 1
dnn/test/cuda/conv_bias.cpp View File

@@ -695,6 +695,78 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_CHANWISE_SMALL) {
}
}

TEST_F(CUDA, CONV_BIAS_FORWARD_DEPTHWISE_LARGE_FILTER) {
Checker<ConvBiasForward> checker(handle_cuda());
checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>(
ConvBiasForward::algo_name<ConvBias::DirectParam>(
"DEPTHWISE_LARGE_FILTER", {})
.c_str()));
for (auto dtype : std::vector<DType> {
dtype::Float32(),
#if CUDA_VERSION >= 9000
dtype::Float16()
#endif
}) {
auto run = [&checker, &dtype](
size_t n, size_t g, size_t h, size_t fh, size_t padding,
size_t stride) {
param::ConvBias cur_param;
cur_param.mode = param::ConvBias::Mode::CROSS_CORRELATION;
cur_param.sparse = ConvBias::Param::Sparse::GROUP;
checker.set_dtype(0, dtype)
.set_dtype(1, dtype)
.set_dtype(2, dtype)
.set_dtype(3, dtype)
.set_dtype(4, dtype);
float scale = 64.f / sqrt(fh * fh);
UniformFloatRNG rng(scale, 2 * scale);
checker.set_rng(0, &rng)
.set_rng(1, &rng)
.set_rng(2, &rng)
.set_rng(3, &rng)
.set_rng(4, &rng);
if (dtype.enumv() == DTypeEnum::Float16) {
checker.set_epsilon(1e-1);
}

cur_param.pad_h = cur_param.pad_w = padding;
cur_param.stride_h = cur_param.stride_w = stride;
checker.set_param(cur_param).execs(
{{n, g, h, h}, {g, 1, 1, fh, fh}, {}, {}, {}});
};
run(4, 8, 32, 5, 5 / 2, 1);
run(4, 8, 32, 7, 7 / 2, 1);
run(4, 8, 32, 9, 9 / 2, 1);
run(4, 8, 32, 11, 11 / 2, 1);
run(4, 8, 32, 13, 13 / 2, 1);
run(4, 8, 32, 15, 15 / 2, 1);
run(4, 8, 32, 17, 17 / 2, 1);
run(4, 8, 32, 19, 19 / 2, 1);
run(4, 8, 32, 21, 21 / 2, 1);
run(4, 8, 32, 23, 23 / 2, 1);
run(4, 8, 32, 25, 25 / 2, 1);
run(4, 8, 32, 27, 27 / 2, 1);
run(4, 8, 32, 29, 29 / 2, 1);
run(4, 8, 32, 31, 31 / 2, 1);
run(4, 8, 64, 5, 5 / 3, 2);
run(4, 8, 64, 7, 7 / 3, 2);
run(4, 8, 64, 9, 9 / 3, 2);
run(4, 8, 64, 11, 11 / 3, 2);
run(4, 8, 64, 13, 13 / 3, 2);
run(4, 8, 64, 15, 15 / 3, 2);
run(4, 8, 64, 17, 17 / 3, 2);
run(4, 8, 64, 19, 19 / 3, 2);
run(4, 8, 64, 21, 21 / 3, 2);
run(4, 8, 64, 23, 23 / 3, 2);
run(4, 8, 64, 25, 25 / 3, 2);
run(4, 8, 64, 27, 27 / 3, 2);
run(4, 8, 64, 29, 29 / 3, 2);
run(4, 8, 64, 31, 31 / 3, 2);
run(1, 2, 128, 31, 10, 2);
run(1, 2, 256, 31, 10, 2);
}
}

TEST_F(CUDA, CONV_BIAS_FORWARD_CHANWISE_8x8x32) {
require_compute_capability(6, 1);
Checker<ConvBiasForward> checker(handle_cuda());
@@ -1033,7 +1105,7 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_GROUP) {
ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
"CUDA:GROUP_CONV", {})
.c_str(),
{{"CUDNN", {}}}}));
{{"DEFAULT:CUDNN", {}}}}));
ConvBias::Param param;
param.sparse = ConvBias::Param::Sparse::GROUP;
param.nonlineMode = mode;
@@ -1093,8 +1165,11 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_GROUP) {
run(2, 32, 7, 7, 3, 3, 64, 1, 1, 1, 1, 1, 1, 4, nlmode);
// strided case
run(2, 32, 7, 7, 3, 3, 64, 0, 0, 2, 2, 1, 1, 8, nlmode);
// dilated conv is supported in CUDNN since version 7.5.0
#if CUDNN_VERSION >= 7500
// dilated case
run(2, 32, 7, 7, 3, 3, 64, 0, 0, 1, 1, 2, 2, 8, nlmode);
#endif
}
}

@@ -1471,6 +1546,132 @@ TEST_F(CUDA, BENCHMARK_CONV_BIAS_FORWARD_TENSORCORE_INT8) {
run_bench(256, 512, 7, 7, 512, 3, 3, 1, 1, 1000);
run_bench(256, 512, 7, 7, 2048, 1, 1, 1, 1, 1000);
}

TEST_F(CUDA, BENCHMARK_CONV_BIAS_FORWARD_DEPTHWISE_LARGE_FILTER_FP16) {
require_compute_capability(7, 5);
Benchmarker<ConvBiasForward> bencher(handle_cuda());
bencher.set_display(false);
bencher.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
"DEPTHWISE_LARGE_FILTER", {})
.c_str()));

ConvBias::Param param;
param.format = ConvBias::Param::Format::NCHW;

using NonlineMode = ConvBias::Param::NonlineMode;
param.nonlineMode = NonlineMode::IDENTITY;
param.sparse = ConvBias::Param::Sparse::GROUP;
auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
size_t fw, size_t sh, size_t sw, size_t nr_times) {
param.pad_h = fh / 2;
param.pad_w = fw / 2;
param.stride_h = sh;
param.stride_w = sw;

bencher.set_param(param)
.set_dtype(0, dtype::Float16())
.set_dtype(1, dtype::Float16())
.set_dtype(2, dtype::Float16())
.set_dtype(4, dtype::Float16());
bencher.set_times(nr_times);
size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
TensorShape inp{batch, g, hi, wi}, kern{g, 1, 1, fh, fw}, out{batch, g, ho, wo};

float bandwidth = static_cast<float>(
inp.total_nr_elems() + kern.total_nr_elems() +
out.total_nr_elems()) /
(1024 * 1024 * 1024) * 1e3;

auto time_in_ms = bencher.execs({inp, kern, {}, {}, out}) / nr_times;
auto ops = 2.0 * batch * g * ho * wo * fh * fw / (time_in_ms * 1e-3) * 1e-12;
printf("chanwise_depthwise_large_filter: inp=%s, kern=%s, out=%s, time: "
"%.2fms, "
"perf: %.2f Tops bandwidth: %.2fGB/s.\n",
inp.to_string().c_str(), kern.to_string().c_str(),
out.to_string().c_str(), time_in_ms, ops, bandwidth * 2 / time_in_ms);
};

run_bench(64, 384, 32, 32, 3, 3, 1, 1, 10);
run_bench(64, 384, 32, 32, 5, 5, 1, 1, 10);
run_bench(64, 384, 32, 32, 7, 7, 1, 1, 10);
run_bench(64, 384, 32, 32, 9, 9, 1, 1, 10);
run_bench(64, 384, 32, 32, 11, 11, 1, 1, 10);
run_bench(64, 384, 32, 32, 13, 13, 1, 1, 10);
run_bench(64, 384, 32, 32, 15, 15, 1, 1, 10);
run_bench(64, 384, 32, 32, 17, 17, 1, 1, 10);
run_bench(64, 384, 32, 32, 19, 19, 1, 1, 10);
run_bench(64, 384, 32, 32, 21, 21, 1, 1, 10);
run_bench(64, 384, 32, 32, 23, 23, 1, 1, 10);
run_bench(64, 384, 32, 32, 25, 25, 1, 1, 10);
run_bench(64, 384, 32, 32, 27, 27, 1, 1, 10);
run_bench(64, 384, 32, 32, 29, 29, 1, 1, 10);
run_bench(64, 384, 32, 32, 31, 31, 1, 1, 10);
}
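// For reference, a sketch of the relation infer_conv_shape is assumed to implement here
// (the usual convolution output-size formula; treat the exact form as an assumption):
//
//     ho = (hi + 2 * pad_h - fh) / sh + 1;
//     wo = (wi + 2 * pad_w - fw) / sw + 1;
//
// With pad = f / 2 and stride 1, the odd filter sizes benchmarked above keep ho == hi, and
// the printed "Tops" value is 2 * batch * g * ho * wo * fh * fw operations per second
// scaled by 1e-12, matching the `ops` expression in run_bench.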

TEST_F(CUDA, BENCHMARK_CONV_BIAS_FORWARD_DEPTHWISE_LARGE_FILTER_FP32) {
require_compute_capability(7, 5);
Benchmarker<ConvBiasForward> bencher(handle_cuda());
bencher.set_display(false);
bencher.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
"DEPTHWISE_LARGE_FILTER", {})
.c_str()));

ConvBias::Param param;
param.format = ConvBias::Param::Format::NCHW;

using NonlineMode = ConvBias::Param::NonlineMode;
param.nonlineMode = NonlineMode::IDENTITY;
param.sparse = ConvBias::Param::Sparse::GROUP;
auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
size_t fw, size_t sh, size_t sw, size_t nr_times) {
param.pad_h = fh / 2;
param.pad_w = fw / 2;
param.stride_h = sh;
param.stride_w = sw;

bencher.set_param(param)
.set_dtype(0, dtype::Float32())
.set_dtype(1, dtype::Float32())
.set_dtype(2, dtype::Float32())
.set_dtype(4, dtype::Float32());
bencher.set_times(nr_times);
size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
TensorShape inp{batch, g, hi, wi}, kern{g, 1, 1, fh, fw}, out{batch, g, ho, wo};

float bandwidth = static_cast<float>(
inp.total_nr_elems() + kern.total_nr_elems() +
out.total_nr_elems()) /
(1024 * 1024 * 1024) * 1e3;

auto time_in_ms = bencher.execs({inp, kern, {}, {}, out}) / nr_times;
auto ops = 2.0 * batch * g * ho * wo * fh * fw / (time_in_ms * 1e-3) * 1e-12;
printf("chanwise_depthwise_large_filter: inp=%s, kern=%s, out=%s, time: "
"%.2fms, "
"perf: %.2f Tops bandwidth: %.2fGB/s.\n",
inp.to_string().c_str(), kern.to_string().c_str(),
out.to_string().c_str(), time_in_ms, ops, bandwidth * 4 / time_in_ms);
};

run_bench(64, 384, 32, 32, 3, 3, 1, 1, 10);
run_bench(64, 384, 32, 32, 5, 5, 1, 1, 10);
run_bench(64, 384, 32, 32, 7, 7, 1, 1, 10);
run_bench(64, 384, 32, 32, 9, 9, 1, 1, 10);
run_bench(64, 384, 32, 32, 11, 11, 1, 1, 10);
run_bench(64, 384, 32, 32, 13, 13, 1, 1, 10);
run_bench(64, 384, 32, 32, 15, 15, 1, 1, 10);
run_bench(64, 384, 32, 32, 17, 17, 1, 1, 10);
run_bench(64, 384, 32, 32, 19, 19, 1, 1, 10);
run_bench(64, 384, 32, 32, 21, 21, 1, 1, 10);
run_bench(64, 384, 32, 32, 23, 23, 1, 1, 10);
run_bench(64, 384, 32, 32, 25, 25, 1, 1, 10);
run_bench(64, 384, 32, 32, 27, 27, 1, 1, 10);
run_bench(64, 384, 32, 32, 29, 29, 1, 1, 10);
run_bench(64, 384, 32, 32, 31, 31, 1, 1, 10);
}
#endif
#endif



+ 202
- 0
dnn/test/cuda/convolution.cpp View File

@@ -724,6 +724,70 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_1) {
TensorLayoutArray{filter, dst, src});
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DEPTHWISE_LARGE_FILTER) {
Checker<ConvolutionBackwardData> checker(handle_cuda());
checker.set_before_exec_callback(
AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));
for (auto dtype : std::vector<DType> {
dtype::Float32(),
#if CUDA_VERSION >= 9000
dtype::Float16()
#endif
}) {
auto run = [&checker, &dtype](
size_t n, size_t g, size_t h, size_t fh, size_t padding,
size_t stride) {
param::Convolution param;
param.stride_h = param.stride_w = stride;
param.pad_h = param.pad_w = padding;
param.mode = Convolution::Mode::CROSS_CORRELATION;
param.sparse = param::Convolution::Sparse::GROUP;
checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
float scale = 64.f / sqrt(fh * fh);
UniformFloatRNG rng(scale, scale * 2);
checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &rng);
if (dtype.enumv() == DTypeEnum::Float16)
checker.set_epsilon(1e-1);

checker.set_param(param).execs(
{{g, 1, 1, fh, fh},
{n, g, (h + 2 * padding - fh + 1) / stride,
(h + 2 * padding - fh + 1) / stride},
{n, g, h, h}});
};
run(4, 8, 32, 5, 5 / 2, 1);
run(4, 8, 32, 7, 7 / 2, 1);
run(4, 8, 32, 9, 9 / 2, 1);
run(4, 8, 32, 11, 11 / 2, 1);
run(4, 8, 32, 13, 13 / 2, 1);
run(4, 8, 32, 15, 15 / 2, 1);
run(4, 8, 32, 17, 17 / 2, 1);
run(4, 8, 32, 19, 19 / 2, 1);
run(4, 8, 32, 21, 21 / 2, 1);
run(4, 8, 32, 23, 23 / 2, 1);
run(4, 8, 32, 25, 25 / 2, 1);
run(4, 8, 32, 27, 27 / 2, 1);
run(4, 8, 32, 29, 29 / 2, 1);
run(4, 8, 32, 31, 31 / 2, 1);
run(4, 8, 64, 5, 5 / 2, 2);
run(4, 8, 64, 7, 7 / 3, 2);
run(4, 8, 64, 9, 9 / 3, 2);
run(4, 8, 64, 11, 11 / 3, 2);
run(4, 8, 64, 13, 13 / 3, 2);
run(4, 8, 64, 15, 15 / 3, 2);
run(4, 8, 64, 17, 17 / 3, 2);
run(4, 8, 64, 19, 19 / 3, 2);
run(4, 8, 64, 21, 21 / 3, 2);
run(4, 8, 64, 23, 23 / 3, 2);
run(4, 8, 64, 25, 25 / 3, 2);
run(4, 8, 64, 27, 27 / 3, 2);
run(4, 8, 64, 29, 29 / 3, 2);
run(4, 8, 64, 31, 31 / 3, 2);
run(1, 2, 128, 31, 31 / 3, 2);
run(1, 2, 256, 31, 31 / 3, 2);
}
}
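// Operand-order note, based on the calls visible in this file: for ConvolutionBackwardData
// the checker/benchmarker arguments are passed as (filter, output gradient, input gradient),
// so the execs call above is effectively
//
//     checker.execs({filter_shape, diff_shape, grad_shape});  // placeholder names
//
// where diff has the deduced output spatial size and grad matches the original input.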

#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, CONV_FWD_BENCHMARK) {
auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t SH = 1,
@@ -901,6 +965,104 @@ TEST_F(CUDA, CONVOLUTION_BWD_DATA_BENCHMARK) {
run(32, 64, 64, 56, 56, 1, 1, 0);
}

TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_DEPTHWISE_LARGE_FILTER_FP32) {
CUBenchmarker<ConvolutionBackwardData> bencher{handle_cuda()};
bencher.set_display(false);
bencher.set_before_exec_callback(
AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));

auto run = [&](size_t N, size_t OC, size_t g, size_t IH, size_t IW, size_t FH,
size_t SH, size_t nr_times) {
bencher.set_dtype(0, dtype::Float32())
.set_dtype(1, dtype::Float32())
.set_dtype(2, dtype::Float32());
param::Convolution param;
param.stride_h = param.stride_w = SH;
param.pad_h = param.pad_w = FH / 2;
param.sparse = param::Convolution::Sparse::GROUP;
bencher.set_param(param);
bencher.set_times(nr_times);
TensorLayout src{{N, g, IH, IW}, dtype::Float32()},
filter{{g, 1, 1, FH, FH}, dtype::Float32()};
TensorLayout dst;
{
auto&& opr = handle_cuda()->create_operator<Convolution>();
opr->param() = param;
opr->deduce_layout(src, filter, dst);
}
auto time_ms_fp32 = bencher.execl({filter, dst, src}) / nr_times;
float flo = 2.0 * N * g * dst[2] * dst[3] * FH * FH;
printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
filter.to_string().c_str(), dst.to_string().c_str());
printf("time_fp32=%.2fms, flops=%.3fTFLOPS\n", time_ms_fp32,
(flo / (time_ms_fp32 * 1e9)));
};
run(64, 384, 384, 32, 32, 3, 1, 10);
run(64, 384, 384, 32, 32, 5, 1, 10);
run(64, 384, 384, 32, 32, 7, 1, 10);
run(64, 384, 384, 32, 32, 9, 1, 10);
run(64, 384, 384, 32, 32, 11, 1, 10);
run(64, 384, 384, 32, 32, 13, 1, 10);
run(64, 384, 384, 32, 32, 15, 1, 10);
run(64, 384, 384, 32, 32, 17, 1, 10);
run(64, 384, 384, 32, 32, 19, 1, 10);
run(64, 384, 384, 32, 32, 21, 1, 10);
run(64, 384, 384, 32, 32, 23, 1, 10);
run(64, 384, 384, 32, 32, 25, 1, 10);
run(64, 384, 384, 32, 32, 27, 1, 10);
run(64, 384, 384, 32, 32, 29, 1, 10);
run(64, 384, 384, 32, 32, 31, 1, 10);
}

TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_DEPTHWISE_LARGE_FILTER_FP16) {
CUBenchmarker<ConvolutionBackwardData> bencher{handle_cuda()};
bencher.set_display(false);
bencher.set_before_exec_callback(
AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));

auto run = [&](size_t N, size_t OC, size_t g, size_t IH, size_t IW, size_t FH,
size_t SH, size_t nr_times) {
bencher.set_dtype(0, dtype::Float16())
.set_dtype(1, dtype::Float16())
.set_dtype(2, dtype::Float16());
param::Convolution param;
param.stride_h = param.stride_w = SH;
param.pad_h = param.pad_w = FH / 2;
param.sparse = param::Convolution::Sparse::GROUP;
bencher.set_param(param);
bencher.set_times(nr_times);
TensorLayout src{{N, g, IH, IW}, dtype::Float16()},
filter{{g, 1, 1, FH, FH}, dtype::Float16()};
TensorLayout dst;
{
auto&& opr = handle_cuda()->create_operator<Convolution>();
opr->param() = param;
opr->deduce_layout(src, filter, dst);
}
auto time_ms_fp16 = bencher.execl({filter, dst, src}) / nr_times;
float flo = 2.0 * N * g * dst[2] * dst[3] * FH * FH;
printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
filter.to_string().c_str(), dst.to_string().c_str());
printf("time_fp16=%.2fms, flops=%.3fTFLOPS\n", time_ms_fp16,
(flo / (time_ms_fp16 * 1e9)));
};
run(64, 384, 384, 32, 32, 3, 1, 10);
run(64, 384, 384, 32, 32, 5, 1, 10);
run(64, 384, 384, 32, 32, 7, 1, 10);
run(64, 384, 384, 32, 32, 9, 1, 10);
run(64, 384, 384, 32, 32, 11, 1, 10);
run(64, 384, 384, 32, 32, 13, 1, 10);
run(64, 384, 384, 32, 32, 15, 1, 10);
run(64, 384, 384, 32, 32, 17, 1, 10);
run(64, 384, 384, 32, 32, 19, 1, 10);
run(64, 384, 384, 32, 32, 21, 1, 10);
run(64, 384, 384, 32, 32, 23, 1, 10);
run(64, 384, 384, 32, 32, 25, 1, 10);
run(64, 384, 384, 32, 32, 27, 1, 10);
run(64, 384, 384, 32, 32, 29, 1, 10);
run(64, 384, 384, 32, 32, 31, 1, 10);
}

TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_BF16) {
CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
@@ -1065,6 +1227,46 @@ TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) {
run(32, 512, 1024, 14, 14, 1, 2, 0);
run(32, 64, 64, 56, 56, 1, 1, 0);
}

TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_FILTER_DEPTHWISE_LARGE_FILTER) {
CUBenchmarker<ConvolutionBackwardFilter> bench{handle_cuda()};
std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
new OprProxy<ConvolutionBackwardFilter>{true}};
size_t RUNS = 10;
bench.set_proxy(proxy).set_times(RUNS);

bench.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
"CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFTv7.6.3"));

auto run = [&](size_t N, size_t OC, size_t g, size_t IH, size_t IW, size_t FH,
size_t SH, size_t PH) {
bench.set_dtype(0, dtype::Float32())
.set_dtype(1, dtype::Float32())
.set_dtype(2, dtype::Float32());
param::Convolution param;
param.stride_h = param.stride_w = SH;
param.pad_h = param.pad_w = FH / 2;
param.sparse = param::Convolution::Sparse::GROUP;
bench.set_param(param);
bench.proxy()->target_execution_policy.algo.reset();
TensorLayout src{{N, g, IH, IW}, dtype::Float32()},
filter{{g, 1, 1, FH, FH}, dtype::Float32()};
TensorLayout dst;
{
auto&& opr = handle_cuda()->create_operator<Convolution>();
opr->param() = param;
opr->deduce_layout(src, filter, dst);
}
auto time_ms_fp32 = bench.execl({src, dst, filter}) / RUNS;
float flo = 2.0 * N * g * dst[2] * dst[3] * FH * FH;
printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
filter.to_string().c_str(), dst.to_string().c_str());
printf("time_fp32=%.2fms, flops=%.3fTFLOPS\n", time_ms_fp32,
(flo / (time_ms_fp32 * 1e9)));
};
run(64, 384, 384, 32, 32, 31, 1, 15);
}

#endif

#undef CUDNN_VERSION_STRING


+ 5
- 3
dnn/test/cuda/cutlass_matmul.cpp View File

@@ -213,7 +213,7 @@ std::vector<BenchArgs> get_feat_model_args() {
return args;
}

#if CUDA_VERSION >= 10020
#if CUDA_VERSION >= 10010
std::vector<BenchArgs> get_f16_feat_model_args() {
std::vector<BenchArgs> args;
args.emplace_back(BenchArgs{128, 9216, 9216});
@@ -367,7 +367,7 @@ MEGDNN_FOREACH_CUTLASS_KERNEL(cb)
#undef cb
#undef MEGDNN_FOREACH_CUTLASS_KERNEL

#if CUDA_VERSION >= 10020
#if CUDA_VERSION >= 10010
#define MEGDNN_FOREACH_CUTLASS_KERNEL(cb) \
cb(1, 256, 128, 32, 64, 64, 32, 8, 8, 4); \
cb(2, 128, 256, 32, 64, 64, 32, 8, 8, 4); \
@@ -403,7 +403,9 @@ MEGDNN_FOREACH_CUTLASS_KERNEL(cb)
#undef cb

#undef MEGDNN_FOREACH_CUTLASS_KERNEL
#endif

#if CUDA_VERSION >= 10020
#define MEGDNN_FOREACH_CUTLASS_KERNEL(cb) \
cb(1, 256, 128, 32, 64, 64, 32, 16, 8, 8); \
cb(2, 128, 256, 32, 64, 64, 32, 16, 8, 8); \
@@ -454,7 +456,7 @@ TEST_F(CUDA, BENCHMARK_CUTLASS_MATMUL_FEAT) {
dtype::Float32(), "CUTLASS_FLOAT32_SIMT");
}

#if CUDA_VERSION >= 10020
#if CUDA_VERSION >= 10010
TEST_F(CUDA, BENCHMARK_CUTLASS_F16_MATMUL_FEAT) {
benchmark_matrix_mul(
handle_cuda(), get_f16_feat_model_args(), dtype::Float16(),


+ 30
- 0
dnn/test/cuda/padding.cpp View File

@@ -101,6 +101,36 @@ TEST_F(CUDA, PADDING_REFLECT2) {
4, 1, 6, 3, 6, 1, 6, 3})});
}

TEST_F(CUDA, PADDING_REFLECT2_QUANTIZED) {
Checker<Padding> checker(handle_cuda(), false);
param::Padding param;
param.padding_mode = param::Padding::PaddingMode::REFLECT;
param.front_offset_dim0 = 2;
param.front_offset_dim1 = 1;
param.front_offset_dim2 = 0;
param.front_offset_dim3 = 0;
param.front_offset_dim4 = 0;
param.front_offset_dim5 = 0;
param.front_offset_dim6 = 0;
param.back_offset_dim0 = 0;
param.back_offset_dim1 = 2;
param.back_offset_dim2 = 0;
param.back_offset_dim3 = 0;
param.back_offset_dim4 = 0;
param.back_offset_dim5 = 0;
param.back_offset_dim6 = 0;
checker.set_param(param).exect(
Testcase{
TensorValue(
{3, 3}, dtype::QuantizedS8(), {1, 2, 3, 4, 5, 6, 7, 8, 9}),
{}},
Testcase{{}, TensorValue({5, 6}, dtype::QuantizedS8(), {8, 7, 8, 9, 8, 7, 5,
4, 5, 6, 5, 4, 2, 1,
2, 3, 2, 1, 5, 4, 5,
6, 5, 4, 8, 7, 8, 9,
8, 7})});
}

TEST_F(CUDA, PADDING_REPLICATE) {
Checker<Padding> checker(handle_cuda(), false);
param::Padding param;


+ 30
- 0
dnn/test/naive/padding.cpp View File

@@ -83,6 +83,36 @@ TEST_F(NAIVE, PADDING_REFLECT) {
{10}, dtype::Float32(), {3, 2, 1, 2, 3, 4, 5, 4, 3, 2})});
}

TEST_F(NAIVE, PADDING_REFLECT2) {
Checker<Padding> checker(handle(), false);
param::Padding param;
param.padding_mode = param::Padding::PaddingMode::REFLECT;
param.front_offset_dim0 = 2;
param.front_offset_dim1 = 1;
param.front_offset_dim2 = 0;
param.front_offset_dim3 = 0;
param.front_offset_dim4 = 0;
param.front_offset_dim5 = 0;
param.front_offset_dim6 = 0;
param.back_offset_dim0 = 0;
param.back_offset_dim1 = 2;
param.back_offset_dim2 = 0;
param.back_offset_dim3 = 0;
param.back_offset_dim4 = 0;
param.back_offset_dim5 = 0;
param.back_offset_dim6 = 0;
checker.set_param(param).exect(
Testcase{
TensorValue(
{3, 3}, dtype::QuantizedS8(), {1, 2, 3, 4, 5, 6, 7, 8, 9}),
{}},
Testcase{{}, TensorValue({5, 6}, dtype::QuantizedS8(), {8, 7, 8, 9, 8, 7, 5,
4, 5, 6, 5, 4, 2, 1,
2, 3, 2, 1, 5, 4, 5,
6, 5, 4, 8, 7, 8, 9,
8, 7})});
}

TEST_F(NAIVE, PADDING_REPLICATE) {
Checker<Padding> checker(handle(), false);
param::Padding param;


+ 9
- 5
imperative/python/megengine/core/tensor/indexing.py View File

@@ -119,12 +119,16 @@ def unpack_getitem(inp, tuple_val, *, allow_newaxis=True):
else 1
)
else:
if ndim_indexed > inp.ndim:
raise IndexError(
"too many indices for tensor: tensor is {}-dimensional, but {} were indexed".format(
inp.ndim, len(tuple_val)
try:
if ndim_indexed > inp.ndim:
raise IndexError(
"too many indices for tensor: tensor is {}-dimensional, but {} were indexed".format(
inp.ndim, len(tuple_val)
)
)
)
except ValueError:
# ignore
pass

tuple_val = remove_ellipsis(inp, tuple_val)
use_subtensor = True


+ 56
- 2
imperative/python/megengine/module/conv.py View File

@@ -18,6 +18,7 @@ from ..functional import (
conv_transpose3d,
deformable_conv2d,
local_conv2d,
pad,
relu,
)
from ..tensor import Parameter
@@ -126,7 +127,7 @@ class Conv1d(_ConvNd):
kernel_size: size of weight on spatial dimensions.
stride: stride of the 1D convolution operation.
padding: size of the paddings added to the input on both sides of its
spatial dimensions. Only zero-padding is supported. Default: 0
spatial dimensions. Default: 0
dilation: dilation of the 1D convolution operation. Default: 1
groups: number of groups to divide input and output channels into,
so as to perform a "grouped convolution". When ``groups`` is not 1,
@@ -139,6 +140,8 @@ class Conv1d(_ConvNd):
placed on the precision of intermediate results. When set to "float32",
"float32" would be used for accumulator and intermediate result, but only
effective when input and output are of float16 dtype.
padding_mode: "zeros", "reflect" or "replicate". Default: "zeros".
Refer to :class:`~.module.padding.Pad` for more information.

Note:
* ``weight`` usually has shape ``(out_channels, in_channels, kernel_size)`` ,
@@ -177,6 +180,7 @@ class Conv1d(_ConvNd):
bias: bool = True,
conv_mode: str = "cross_correlation",
compute_mode: str = "default",
padding_mode: str = "zeros",
**kwargs
):
kernel_size = kernel_size
@@ -185,6 +189,7 @@ class Conv1d(_ConvNd):
dilation = dilation
self.conv_mode = conv_mode
self.compute_mode = compute_mode
self.padding_mode = padding_mode
super().__init__(
in_channels,
out_channels,
@@ -223,7 +228,27 @@ class Conv1d(_ConvNd):
# Assume format is NCH(W=1)
return (1, self.out_channels, 1)

def get_pad_witdth(self):
return ((0, 0), (0, 0), (self.padding, self.padding))

def calc_conv(self, inp, weight, bias):
assert self.padding_mode in [
"zeros",
"reflect",
"replicate",
]
if self.padding_mode != "zeros":
return conv1d(
pad(inp, self.get_pad_witdth(), self.padding_mode),
weight,
bias,
self.stride,
0,
self.dilation,
self.groups,
self.conv_mode,
self.compute_mode,
)
return conv1d(
inp,
weight,
@@ -287,7 +312,7 @@ class Conv2d(_ConvNd):
``(kernel_size, kernel_size)``.
stride: stride of the 2D convolution operation. Default: 1
padding: size of the paddings added to the input on both sides of its
spatial dimensions. Only zero-padding is supported. Default: 0
spatial dimensions. Default: 0
dilation: dilation of the 2D convolution operation. Default: 1
groups: number of groups into which the input and output channels are divided,
so as to perform a ``grouped convolution``. When ``groups`` is not 1,
@@ -300,6 +325,8 @@ class Conv2d(_ConvNd):
placed on the precision of intermediate results. When set to "float32",
"float32" would be used for accumulator and intermediate result, but only
effective when input and output are of float16 dtype.
padding_mode: "zeros", "reflect" or "replicate". Default: "zeros".
Refer to :class:`~.module.padding.Pad` for more information.

Note:
* ``weight`` usually has shape ``(out_channels, in_channels, height, width)`` ,
@@ -338,6 +365,7 @@ class Conv2d(_ConvNd):
bias: bool = True,
conv_mode: str = "cross_correlation",
compute_mode: str = "default",
padding_mode: str = "zeros",
**kwargs
):
kernel_size = _pair_nonzero(kernel_size)
@@ -346,6 +374,7 @@ class Conv2d(_ConvNd):
dilation = _pair_nonzero(dilation)
self.conv_mode = conv_mode
self.compute_mode = compute_mode
self.padding_mode = padding_mode
super().__init__(
in_channels,
out_channels,
@@ -384,7 +413,32 @@ class Conv2d(_ConvNd):
# Assume format is NCHW
return (1, self.out_channels, 1, 1)

def get_pad_witdth(self):
return (
(0, 0),
(0, 0),
(self.padding[0], self.padding[0]),
(self.padding[1], self.padding[1]),
)

def calc_conv(self, inp, weight, bias):
assert self.padding_mode in [
"zeros",
"reflect",
"replicate",
]
if self.padding_mode != "zeros":
return conv2d(
pad(inp, self.get_pad_witdth(), self.padding_mode),
weight,
bias,
self.stride,
0,
self.dilation,
self.groups,
self.conv_mode,
self.compute_mode,
)
return conv2d(
inp,
weight,


+ 2
- 0
imperative/python/megengine/module/conv_bn.py View File

@@ -30,6 +30,7 @@ class _ConvBnActivation2d(Module):
momentum=0.9,
affine=True,
track_running_stats=True,
padding_mode: str = "zeros",
**kwargs
):
super().__init__(**kwargs)
@@ -44,6 +45,7 @@ class _ConvBnActivation2d(Module):
bias,
conv_mode,
compute_mode,
padding_mode,
**kwargs,
)
self.bn = BatchNorm2d(out_channels, eps, momentum, affine, track_running_stats)


+ 7
- 5
imperative/python/megengine/module/module.py View File

@@ -138,11 +138,7 @@ class Module(metaclass=ABCMeta):
return HookHandler(self._forward_hooks, hook)

def __call__(self, *inputs, **kwargs):
AutoNaming.push_scope(
self.name
if self.name is not None
else (self._short_name if hasattr(self, "_short_name") else self._name)
)
AutoNaming.push_scope(self.name if self.name is not None else self._short_name)
for hook in self._forward_pre_hooks.values():
modified_inputs = hook(self, inputs)
if modified_inputs is not None:
@@ -685,6 +681,12 @@ class Module(metaclass=ABCMeta):
set_name(self, prefix, k, v)
super().__setattr__(name, value)

def __setstate__(self, state):
if "_short_name" not in state:
state["_short_name"] = state["_name"]
state["_name"] = None
self.__dict__.update(state)

def __delattr__(self, name: str):
if name in self.__dict__ and _is_module(self.__dict__[name]):
modules = self.__dict__.get("_modules")


+ 1
- 0
imperative/python/megengine/module/qat/conv.py View File

@@ -38,6 +38,7 @@ class Conv2d(Float.Conv2d, QATModule):
float_module.bias is not None,
float_module.conv_mode,
float_module.compute_mode,
float_module.padding_mode,
name=float_module.name,
)
qat_module.weight = float_module.weight


+ 1
- 0
imperative/python/megengine/module/qat/conv_bn.py View File

@@ -147,6 +147,7 @@ class _ConvBnActivation2d(Float._ConvBnActivation2d, QATModule):
float_module.conv.bias is not None,
float_module.conv.conv_mode,
float_module.conv.compute_mode,
padding_mode=float_module.conv.padding_mode,
name=float_module.name,
)
qat_module.conv.weight = float_module.conv.weight


+ 23
- 1
imperative/python/megengine/module/quantized/conv.py View File

@@ -11,7 +11,7 @@ import numpy as np

from ... import module as Float
from ...core.tensor import dtype
from ...functional.nn import conv_bias_activation
from ...functional.nn import conv_bias_activation, pad
from ...functional.quantized import conv_transpose2d
from ...tensor import Parameter
from ..qat import conv as QAT
@@ -38,6 +38,7 @@ class Conv2d(Float.Conv2d, QuantizedModule):
conv_mode: str = "cross_correlation",
compute_mode: str = "default",
dtype=None,
padding_mode: str = "zeros",
**kwargs
):
super().__init__(
@@ -51,13 +52,33 @@ class Conv2d(Float.Conv2d, QuantizedModule):
True,
conv_mode,
compute_mode,
padding_mode,
)
self.output_dtype = dtype

def calc_conv_quantized(self, inp, nonlinear_mode="identity"):
assert self.padding_mode in [
"zeros",
"reflect",
"replicate",
]
inp_scale = dtype.get_scale(inp.dtype)
w_scale = dtype.get_scale(self.weight.dtype)
bias_scale = inp_scale * w_scale
if self.padding_mode != "zeros":
return conv_bias_activation(
pad(inp, self.get_pad_witdth(), self.padding_mode),
self.weight,
self.bias.astype(dtype.qint32(bias_scale)),
self.output_dtype,
self.stride,
0,
self.dilation,
self.groups,
conv_mode=self.conv_mode,
compute_mode=self.compute_mode,
nonlinear_mode=nonlinear_mode,
)
return conv_bias_activation(
inp,
self.weight,
@@ -88,6 +109,7 @@ class Conv2d(Float.Conv2d, QuantizedModule):
qat_module.dilation,
qat_module.groups,
dtype=output_dtype,
padding_mode=qat_module.padding_mode,
name=qat_module.name,
)
weight = qat_module.weight.astype(qat_module.get_weight_dtype())


+ 1
- 0
imperative/python/megengine/module/quantized/conv_bn.py View File

@@ -31,6 +31,7 @@ class _ConvBnActivation2d(Conv2d):
qat_module.conv.groups,
dtype=output_dtype,
name=qat_module.name,
padding_mode=qat_module.conv.padding_mode,
)
w_fold, b_fold = qat_module.fold_weight_bias(
qat_module.bn.running_mean, qat_module.bn.running_var


+ 36
- 0
imperative/python/megengine/traced_module/compat.py View File

@@ -126,6 +126,9 @@ def convbn2d_module_loader(expr):
module = expr.inputs[0].owner
if not hasattr(module.bn, "param_dim"):
module.bn.param_dim = "dim_1c11"
module = expr.inputs[0].owner
if not hasattr(module.conv, "padding_mode"):
module.conv.padding_mode = "zeros"


@register_opdef_loader(BatchNorm)
@@ -162,3 +165,36 @@ def tensor_gen_func_loader(expr):
else:
device = None
expr.set_args_kwargs(shape, dtype=dtype, device=device)


@register_functional_loader(("megengine.functional.nn", "pad"))
def pad_func_loader(expr):
if "pad_witdth" in expr.kwargs:
kwargs = expr.kwargs
kwargs["pad_width"] = kwargs.pop("pad_witdth")
expr.set_args_kwargs(*expr.args, **kwargs)


@register_module_loader(
("megengine.module.conv", "Conv1d"),
("megengine.module.conv", "Conv2d"),
("megengine.module.conv", "ConvRelu2d"),
("megengine.module.qat.conv", "Conv2d"),
("megengine.module.qat.conv", "ConvRelu2d"),
("megengine.module.quantized.conv", "Conv2d"),
("megengine.module.quantized.conv", "ConvRelu2d"),
)
def conv2d_module_loader(expr):
module = expr.inputs[0].owner
if not hasattr(module, "padding_mode"):
module.padding_mode = "zeros"


@register_module_loader(
("megengine.module.quantized.conv_bn", "ConvBn2d"),
("megengine.module.quantized.conv_bn", "ConvBnRelu2d"),
)
def quantized_convbn2d_module_loader(expr):
module = expr.inputs[0].owner
if not hasattr(module, "padding_mode"):
module.padding_mode = "zeros"

+ 1
- 1
imperative/python/megengine/traced_module/serialization.py View File

@@ -50,7 +50,7 @@ class _ModuleState:
if self.obj is None:
typem = getattr(import_module(self.module[0]), self.module[1])
m_obj = typem.__new__(typem)
m_obj.__dict__.update(self.state)
m_obj.__setstate__(self.state)
self.obj = m_obj
return self.obj



+ 6
- 4
imperative/python/megengine/traced_module/traced_module.py View File

@@ -1681,11 +1681,13 @@ class TracedModuleBuilder(NodeMixin):

if isinstance(wrapped, TracedModuleBuilder):
if not isinstance(mod_attr, (List, Dict, QATModule)):
assert mod_attr is wrapped._mod
else:
assert (
mod_attr is wrapped._mod
), "TracedModule do not support modify module attributes, please check your code."
if isinstance(wrapped, RawTensor):
assert (
mod_attr is wrapped
), "TracedModule do not support modify attributes, please check your code."
), "TracedModule do not support modify tensor attributes, please check your code."

if isinstance(wrapped, (NodeMixin, RawTensor)):
NodeMixin.wrap(
@@ -2296,7 +2298,7 @@ class TracedModule(Module):
for k, v in state.items():
if isinstance(v, _ModuleState):
state[k] = v.to_module()
self.__dict__.update(state)
super().__setstate__(state)
self._update_ref()

for _, graph in self.argdef_graph_map.items():


+ 3
- 7
imperative/python/src/tensor.cpp View File

@@ -272,16 +272,12 @@ PyObject* TensorWrapper::device() {

PyObject* TensorWrapper::numpy() {
auto hv = m_tensor->numpy();
// if (!hv) {
// PyErr_SetString(PyExc_ValueError, "tensor invalid");
// return nullptr;
// }
auto arr = py::reinterpret_steal<py::array>(
npy::ndarray_from_tensor(hv->as_nd(true), npy::ShareType::TRY_SHARE));
if (!arr) {
if (!hv) {
PyErr_SetString(PyExc_ValueError, "tensor invalid");
return nullptr;
}
auto arr = py::reinterpret_steal<py::array>(
npy::ndarray_from_tensor(hv->as_nd(true), npy::ShareType::TRY_SHARE));
if (hv->shape().is_scalar()) {
mgb_assert(PyArray_Check(arr.ptr()));
return PyArray_Squeeze(reinterpret_cast<PyArrayObject*>(arr.ptr()));


+ 27
- 0
imperative/python/test/unit/core/test_indexing_op.py View File

@@ -7,6 +7,8 @@
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import collections
import platform
from tempfile import NamedTemporaryFile

import numpy as np
import pytest
@@ -16,6 +18,8 @@ import megengine
import megengine.core.tensor.megbrain_graph as G
import megengine.functional as F
import megengine.jit as jit
import megengine.random as rand
import megengine.utils.comp_graph_tools as cgtools
from megengine.core._imperative_rt.core2 import apply
from megengine.core._trace_option import use_symbolic_shape
from megengine.core.ops import builtin
@@ -724,3 +728,26 @@ def test_nd_int_indexing(symbolic):
np.testing.assert_equal(out.numpy(), npy_out)

run_test([inp, idx], lambda inp, idx: inp[idx])


@pytest.mark.skipif(
platform.system() == "Windows", reason="windows temp file issue, fixme later"
)
def test_subtensor_when_shape_invalid():
@jit.trace(symbolic=True, capture_as_const=True)
def fun(inp):
shape = inp.shape
H = shape[-1]
NH = H * 8 + 4
arr = F.arange(4, NH, 8)
arr_shape = arr.shape
return arr_shape[0]

inp = rand.uniform(size=[1, 3, 224, 224])
fun(inp)

with NamedTemporaryFile() as f:
fun.dump(f.name, arg_names=["data"], optimize_for_inference=True)
inp = rand.uniform(size=[1, 3, 512, 512])
net = cgtools.GraphInference(f.name)
net.run(inp_dict={"data": inp})

+ 14
- 0
imperative/python/test/unit/core/test_serialization.py View File

@@ -87,3 +87,17 @@ def test_compatibility():

test_old_tensor("tensor_v1_1.mge")
test_old_tensor("tensor_v1_2.mge")

t = mge.tensor([1])
getattr(t, "qparams")
new_args = t.__getnewargs__()
assert (
len(new_args) == 3
and isinstance(new_args[0], np.ndarray)
and new_args[1] == np.int32
and isinstance(new_args[2], str)
), "Modify Tensor __getnewargs__ may break pickle serialization compatible"
state = t.__getstate__()
assert set(state.keys()) == set(
["qparams"]
), "Modify Tensor __getstate__ may break pickle serialization compatible"

+ 24
- 0
imperative/python/test/unit/module/test_module.py View File

@@ -681,3 +681,27 @@ def test_repr_module_reset_attr():
m1 = ResetAttrModule(False)
output = [m0.__repr__(), m1.__repr__()]
assert output == ground_truth


def test_module_compatible():
class Empty(Module):
def forward(self):
pass

empty_module = Empty()
old_attributes = set(
[
"_modules",
"name",
"training",
"quantize_disabled",
"_forward_pre_hooks",
"_forward_hooks",
"_name",
"_short_name",
]
)
current_attributes = set(empty_module.__dict__.keys())
assert (
old_attributes == current_attributes
), "Add or delete attributes in Module class may break compatibility of pickle serialization"

+ 19
- 2
imperative/python/test/unit/module/test_qat.py View File

@@ -60,7 +60,18 @@ def test_qat_convbn2d():
)


def test_qat_conv():
@pytest.mark.parametrize(
"padding, padding_mode",
[
(0, "zeros"),
((1, 2), "zeros"),
(3, "reflect"),
((1, 2), "reflect"),
(4, "replicate"),
((1, 2), "replicate"),
],
)
def test_qat_conv(padding, padding_mode):

in_channels = 32
out_channels = 64
@@ -72,7 +83,13 @@ def test_qat_conv():
self.quant = QuantStub()
self.dequant = DequantStub()
self.conv = Conv2d(
in_channels, out_channels, kernel_size, groups=groups, bias=bias
in_channels,
out_channels,
kernel_size,
groups=groups,
bias=bias,
padding=padding,
padding_mode=padding_mode,
)
self.conv_relu = ConvRelu2d(
out_channels, in_channels, kernel_size, groups=groups, bias=bias


+ 8
- 3
imperative/python/test/unit/quantization/test_module.py View File

@@ -236,11 +236,16 @@ def test_linear():


@pytest.mark.parametrize("module", ["Conv2d", "ConvBn2d", "ConvBnRelu2d"])
def test_conv(module):
normal_net = getattr(Float, module)(3, 3, 3, 1, 1, 1, bias=True)
@pytest.mark.parametrize("padding_mode", ["zeros", "reflect", "replicate"])
def test_conv(module, padding_mode):
normal_net = getattr(Float, module)(
3, 3, 3, 1, 1, 1, bias=True, padding_mode=padding_mode
)
normal_net.eval()

qat_net = getattr(QAT, module)(3, 3, 3, 1, 1, 1, bias=True)
qat_net = getattr(QAT, module)(
3, 3, 3, 1, 1, 1, bias=True, padding_mode=padding_mode
)
qat_net.eval()
disable_observer(qat_net)



+ 14
- 7
imperative/src/impl/transformations/scalar.cpp View File

@@ -51,6 +51,7 @@ bool is_scalar_shape(ValueRef shape) {
if (shape.is<ScalarValue>()) {
return false;
}
// may have performance issue
auto shape_of_shape = shape.shape();
if (!shape_of_shape) {
// assume not scalar
@@ -211,14 +212,21 @@ std::vector<ValueRef> subtensor_rule(
const Subtensor& subtensor, Span<ValueRef> inputs) {
mgb_assert(inputs.size() >= 1);
auto input = inputs[0];
size_t ndim = input.is<ScalarValue>() ? 0 : input.shape()->ndim;
for (auto&& [axis, begin, end, step, idx] : subtensor.items) {
if (idx) {
ndim--;
bool is_scalar;
mgb_assert(!input.is<ScalarValue>(), "subtensor shouldn't have scalar input");
if (auto shape = input.shape()) {
size_t ndim = input.shape()->ndim;
for (auto&& [axis, begin, end, step, idx] : subtensor.items) {
if (idx) {
ndim--;
}
}
is_scalar = ndim == 0;
} else {
is_scalar = false;
}
auto output = imperative::apply(subtensor, unwrap_inputs(inputs))[0];
if (!ndim) {
if (is_scalar) {
return {ScalarValue::make(output)};
} else {
return {output};
@@ -261,8 +269,7 @@ std::vector<ValueRef> fastpath_copy_rule(

std::vector<ValueRef> reshape_rule(const Reshape& reshape, Span<ValueRef> inputs) {
mgb_assert(inputs.size() == 2);
bool is_scalar =
(!inputs[1].is<ScalarValue>()) && *inputs[1].shape() == ValueShape{0};
bool is_scalar = is_scalar_shape(inputs[1]);
auto unwrapped_input = inputs[0].is<ScalarValue>()
? inputs[0].cast<ScalarValue>().value()
: inputs[0];


+ 1
- 1
src/core/include/megbrain/version.h View File

@@ -15,7 +15,7 @@

#define MGE_MAJOR 1
#define MGE_MINOR 8
#define MGE_PATCH 0
#define MGE_PATCH 2

// for rc version, could be like "rc1", "rc2", etc
#define MGE_EXTRA_NAME ""


+ 1
- 0
src/megbrain_build_config.h.in View File

@@ -57,6 +57,7 @@
#cmakedefine01 MEGDNN_64_BIT
#cmakedefine01 MEGDNN_THREADS_512
#cmakedefine01 MEGDNN_ENABLE_MULTI_THREADS
#cmakedefine01 MEGDNN_WITH_BENCHMARK

// whether atlas is available
#ifndef MGB_ATLAS


+ 1
- 1
third_party/cutlass

@@ -1 +1 @@
Subproject commit 31798848e40c2752d4b3db193491a63b77455029
Subproject commit 5a639171ed8489d877a56bfd28cfd9e6df8c5403
