GitOrigin-RevId: 0da9b3bca8
release-1.11.1
@@ -19,6 +19,7 @@ genrule(
CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_dgrad --type tensorop884 $(@D)
CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_wgrad --type simt $(@D)
CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_wgrad --type tensorop884 $(@D)
+CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations rrconv2d_wgrad --type simt $(@D)
""",
    tools = ["//brain/megbrain/dnn/scripts/cutlass_generator:generator.py"],
    visibility = ["//visibility:public"],
@@ -35,6 +35,8 @@ class Conv2dOperation:
        without_shared_load=False,
        required_cuda_ver_major=9,
        required_cuda_ver_minor=2,
+        rin=None,
+        rout=None,
    ):
        self.operation_kind = OperationKind.Conv2d
@@ -54,6 +56,8 @@ class Conv2dOperation:
        self.without_shared_load = without_shared_load
        self.required_cuda_ver_major = required_cuda_ver_major
        self.required_cuda_ver_minor = required_cuda_ver_minor
+        self.rin = rin
+        self.rout = rout

    #
    def accumulator_type(self):
@@ -95,6 +99,8 @@ class Conv2dOperation:
        conv_type_name = ""
        if self.conv_type == ConvType.DepthwiseConvolution:
            conv_type_name = "dw"
+        elif self.conv_type == ConvType.RegionRestrictedConvolution:
+            conv_type_name = "rr"
        return "%s%s%s%s%s%s%s_%s" % (
            ShortDataTypeNames[self.accumulator_type()],
@@ -125,6 +131,9 @@ class Conv2dOperation:
        elif self.src.element == self.flt.element:
            extended_name = "${core_name}_${element_src}"
+        if self.rin != None:
+            extended_name += "_${element_rin}"
        extended_name = SubstituteTemplate(
            extended_name,
            {
@@ -132,6 +141,7 @@ class Conv2dOperation:
                "element_flt": DataTypeNames[self.flt.element],
                "element_dst": DataTypeNames[self.dst.element],
                "core_name": self.core_name(),
+                "element_rin": DataTypeNames[self.rin.element],
            },
        )
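
For readers skimming the generator changes: `SubstituteTemplate` is the generator's placeholder-filling helper. A minimal stand-in for its assumed behavior (fill `${key}` slots from a dict; the real helper lives in the generator library):

```python
import re

def substitute_template(template: str, values: dict) -> str:
    """Stand-in for the generator's SubstituteTemplate: fill ${key} slots."""
    return re.sub(r"\$\{(\w+)\}", lambda m: values[m.group(1)], template)

# hypothetical values, showing how the "_${element_rin}" suffix lands in a name
name = substitute_template(
    "${core_name}_${element_src}_${element_rin}",
    {"core_name": "simt_wgrad", "element_src": "f32", "element_rin": "s32"},
)
assert name == "simt_wgrad_f32_s32"
```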
@@ -512,6 +522,115 @@ using Convolution_${operation_name} =
        return SubstituteTemplate(self.template, values)

+class EmitRegionRestrictedConvolutionBackwardFilterInstance:
+    def __init__(self):
+        self.template = """
+// kernel instance "${operation_name}" generated by cutlass generator
+using Convolution_${operation_name} =
+    typename cutlass::conv::device::RegionRestrictedConvolutionBackwardFilter<
+        ${element_src},
+        ${layout_src},
+        ${element_diff},
+        ${layout_diff},
+        ${element_src_mask},
+        ${layout_src_mask},
+        ${element_output_mask},
+        ${layout_output_mask},
+        ${element_grad},
+        ${layout_grad},
+        ${element_accumulator},
+        ${conv_type},
+        ${opcode_class},
+        ${arch},
+        cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+        cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
+        cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+        ${epilogue_functor}<
+            ${element_grad},
+            ${epilogue_vector_length},
+            ${element_accumulator},
+            ${element_epilogue}
+        >,
+        ${swizzling_functor},
+        ${stages},
+        ${alignment_src},
+        ${alignment_diff},
+        ${alignment_src_mask},
+        ${alignment_output_mask},
+        ${special_optimization},
+        ${math_operator},
+        ${implicit_gemm_mode}>;
+"""
+
+    def emit(self, operation):
+        warp_shape = [
+            int(
+                operation.tile_description.threadblock_shape[idx]
+                / operation.tile_description.warp_count[idx]
+            )
+            for idx in range(3)
+        ]
+        epilogue_vector_length = int(
+            min(operation.dst.alignment * DataTypeSize[operation.dst.element], 128)
+            / DataTypeSize[operation.dst.element]
+        )
+        values = {
+            "operation_name": operation.procedural_name(),
+            "conv_type": ConvTypeTag[operation.conv_type],
+            "element_src": DataTypeTag[operation.src.element],
+            "layout_src": LayoutTag[operation.src.layout],
+            "element_diff": DataTypeTag[operation.flt.element],
+            "layout_diff": LayoutTag[operation.flt.layout],
+            "element_src_mask": DataTypeTag[operation.rin.element],
+            "layout_src_mask": LayoutTag[operation.rin.layout],
+            "element_output_mask": DataTypeTag[operation.rout.element],
+            "layout_output_mask": LayoutTag[operation.rout.layout],
+            "element_grad": DataTypeTag[operation.dst.element],
+            "layout_grad": LayoutTag[operation.dst.layout],
+            "element_accumulator": DataTypeTag[operation.accumulator_type()],
+            "opcode_class": OpcodeClassTag[
+                operation.tile_description.math_instruction.opcode_class
+            ],
+            "arch": "cutlass::arch::Sm%d" % operation.arch,
+            "threadblock_shape_m": str(operation.tile_description.threadblock_shape[0]),
+            "threadblock_shape_n": str(operation.tile_description.threadblock_shape[1]),
+            "threadblock_shape_k": str(operation.tile_description.threadblock_shape[2]),
+            "warp_shape_m": str(warp_shape[0]),
+            "warp_shape_n": str(warp_shape[1]),
+            "warp_shape_k": str(warp_shape[2]),
+            "instruction_shape_m": str(
+                operation.tile_description.math_instruction.instruction_shape[0]
+            ),
+            "instruction_shape_n": str(
+                operation.tile_description.math_instruction.instruction_shape[1]
+            ),
+            "instruction_shape_k": str(
+                operation.tile_description.math_instruction.instruction_shape[2]
+            ),
+            "epilogue_vector_length": str(epilogue_vector_length),
+            "epilogue_functor": EpilogueFunctorTag[operation.epilogue_functor],
+            "element_epilogue": str(DataTypeTag[operation.element_epilogue]),
+            "swizzling_functor": SwizzlingFunctorTag[operation.swizzling_functor],
+            "stages": str(operation.tile_description.stages),
+            "alignment_src": str(operation.src.alignment),
+            "alignment_diff": str(operation.flt.alignment),
+            "alignment_src_mask": str(operation.rin.alignment),
+            "alignment_output_mask": str(operation.rout.alignment),
+            "special_optimization": SpecialOptimizeDescTag[
+                operation.special_optimization
+            ],
+            "math_operator": MathOperationTag[
+                operation.tile_description.math_instruction.math_operation
+            ],
+            "implicit_gemm_mode": ImplicitGemmModeTag[operation.implicit_gemm_mode],
+        }
+        return SubstituteTemplate(self.template, values)

###################################################################################################
#
# Generator functions for all layouts
@@ -540,7 +659,10 @@ def GenerateConv2d(
    operations = []
    element_epilogue = DataType.f32
-    if conv_type == ConvType.DepthwiseConvolution:
+    if (
+        conv_type == ConvType.DepthwiseConvolution
+        or conv_type == ConvType.RegionRestrictedConvolution
+    ):
        if conv_kind == ConvKind.Fprop:
            swizzling_functor = SwizzlingFunctor.DepthwiseConvolutionFprop
        elif conv_kind == ConvKind.Dgrad:
@@ -680,6 +802,16 @@ def GenerateConv2d(
            flt_layout,
            int(flt_align / DataTypeSize[tile.math_instruction.element_a]),
        )
+        rin = TensorDescription(
+            tile.math_instruction.element_rin,
+            src_layout,
+            int(src_align / DataTypeSize[tile.math_instruction.element_rin]),
+        )
+        rout = TensorDescription(
+            tile.math_instruction.element_rout,
+            dst_layout,
+            int(dst_align / DataTypeSize[tile.math_instruction.element_rout]),
+        )
        bias = TensorDescription(
            bias_type, dst_layout, max(1, int(32 / DataTypeSize[bias_type]))
        )
@@ -704,6 +836,8 @@ def GenerateConv2d(
            without_shared_load,
            required_cuda_ver_major,
            required_cuda_ver_minor,
+            rin,
+            rout,
        )
        operations.append(new_operation)
        if use_special_optimization != SpecialOptimizeDesc.NoneSpecialOpt:
@@ -724,6 +858,8 @@ def GenerateConv2d(
            without_shared_load,
            required_cuda_ver_major,
            required_cuda_ver_minor,
+            rin,
+            rout,
        )
        operations.append(new_operation)
    return operations
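
The mask descriptors convert a bit-alignment budget into an element alignment via `DataTypeSize`; a quick check of that arithmetic with the values the SIMT generator passes (a 32-bit budget, s32 or s8 masks):

```python
# bits per element, a subset of the generator's DataTypeSize table
DataTypeSize = {"s32": 32, "s8": 8}

src_align = 32  # bit budget passed by the SIMT generator below
assert int(src_align / DataTypeSize["s32"]) == 1  # int32 masks: 1-element alignment
assert int(src_align / DataTypeSize["s8"]) == 4   # int8 masks: 4-element alignment
```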
@@ -955,5 +1091,89 @@ void initialize_${operation_name}(Manifest &manifest) {
        self.kernel_file.close()

+class EmitRegionRestrictedConvSingleKernelWrapper:
+    def __init__(self, kernel_path, operation, short_path=False):
+        self.kernel_path = kernel_path
+        self.operation = operation
+        self.short_path = short_path
+        # Now only support wgrad
+        assert self.operation.conv_kind == ConvKind.Wgrad
+        self.instance_emitter = EmitRegionRestrictedConvolutionBackwardFilterInstance()
+        self.convolution_name = "RegionRestrictedConvolutionBackwardFilterOperation"
+
+        self.header_template = """
+#if __CUDACC_VER_MAJOR__ > ${required_cuda_ver_major} || (__CUDACC_VER_MAJOR__ == ${required_cuda_ver_major} && __CUDACC_VER_MINOR__ >= ${required_cuda_ver_minor})
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#include "cutlass/convolution/device/convolution.h"
+#include "src/cuda/cutlass/manifest.h"
+#include "src/cuda/cutlass/convolution_operation.h"
+"""
+        self.instance_template = """
+${operation_instance}
+"""
+        self.manifest_template = """
+namespace cutlass {
+namespace library {
+
+void initialize_${operation_name}(Manifest &manifest) {
+    manifest.append(new ${convolution_name}<Convolution_${operation_name}>(
+        "${operation_name}"
+    ));
+}
+
+}  // namespace library
+}  // namespace cutlass
+"""
+        self.epilogue_template = """
+#pragma GCC diagnostic pop
+#endif
+"""
+
+    #
+    def __enter__(self):
+        if self.short_path:
+            self.kernel_path = os.path.join(self.kernel_path, "%s.cu" % GlobalCnt.cnt)
+            GlobalCnt.cnt += 1
+        else:
+            self.kernel_path = os.path.join(
+                self.kernel_path, "%s.cu" % self.operation.procedural_name()
+            )
+        self.kernel_file = open(self.kernel_path, "w")
+        return self
+
+    #
+    def emit(self):
+        self.kernel_file.write(
+            SubstituteTemplate(
+                self.instance_template,
+                {"operation_instance": self.instance_emitter.emit(self.operation)},
+            )
+        )
+        # emit manifest helper
+        manifest = SubstituteTemplate(
+            self.manifest_template,
+            {
+                "operation_name": self.operation.procedural_name(),
+                "convolution_name": self.convolution_name,
+            },
+        )
+        self.kernel_file.write(manifest)
+
+    #
+    def __exit__(self, exception_type, exception_value, traceback):
+        self.kernel_file.close()
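
The wrapper is used as a context manager, mirroring the existing dwconv2d wrappers; a hypothetical driver loop (matching the `rrconv2d_wgrad` branch added to `__main__` below):

```python
import os

out_dir = "generated_kernels"  # hypothetical output directory
os.makedirs(out_dir, exist_ok=True)
operations = []  # fill via GenerateRegionRestrictedconv2dWgradOperations(args)
for operation in operations:
    with EmitRegionRestrictedConvSingleKernelWrapper(
        out_dir, operation, short_path=True
    ) as emitter:
        emitter.emit()  # writes <n>.cu containing the instance + manifest helper
```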
###################################################################################################
###################################################################################################
@@ -64,4 +64,5 @@ if __name__ == "__main__":
        write_merge_file_name(f, "dwconv2d_dgrad", "tensorop884", 4)
        write_merge_file_name(f, "dwconv2d_wgrad", "simt", 2)
        write_merge_file_name(f, "dwconv2d_wgrad", "tensorop884", 4)
+       write_merge_file_name(f, "rrconv2d_wgrad", "simt", 2)
        f.write("]")
@@ -1260,6 +1260,218 @@ def GenerateDwconv2d_Simt(args, conv_kind):
    return operations

+def GenerateRegionRestrictedconv2d_Simt(args, conv_kind):
+    ################################################################################
+    # warps per threadblock
+    ################################################################################
+    warpsPerThreadblocks = []
+    for warpsPerThreadblock0 in warpsPerThreadblockEdge:
+        for warpsPerThreadblock1 in warpsPerThreadblockEdge:
+            if (
+                warpsPerThreadblock0 / warpsPerThreadblock1 <= warpsPerThreadblockRatio
+                and warpsPerThreadblock1 / warpsPerThreadblock0
+                <= warpsPerThreadblockRatio
+                and warpsPerThreadblock0 * warpsPerThreadblock1
+                <= warpsPerThreadblockMax
+            ):
+                warpsPerThreadblocks.append(
+                    [warpsPerThreadblock0, warpsPerThreadblock1]
+                )
+
+    ################################################################################
+    # warp shapes
+    ################################################################################
+    warpNumThreads = 32
+    warpShapes = []
+    for warp0 in warpShapeEdges:
+        for warp1 in warpShapeEdges:
+            if (
+                warp0 / warp1 <= warpShapeRatio
+                and warp1 / warp0 <= warpShapeRatio
+                and warp0 * warp1 <= warpShapeMax
+                and warp0 * warp1 > warpShapeMin
+            ):
+                warpShapes.append([warp0, warp1])
+
+    # sgemm
+    (
+        precisionType,
+        precisionBits,
+        threadblockMaxElements,
+        threadblockTilesL0,
+    ) = precisions["s"]
+
+    layouts = [(LayoutType.TensorNCHW, LayoutType.TensorNCHW)]
+
+    math_instructions = [
+        MathInstruction(
+            [1, 1, 1],
+            DataType.f32,
+            DataType.f32,
+            DataType.f32,
+            OpcodeClass.Simt,
+            MathOperation.multiply_add,
+            DataType.s32,
+            DataType.s32,
+        ),
+        MathInstruction(
+            [1, 1, 1],
+            DataType.f32,
+            DataType.f32,
+            DataType.f32,
+            OpcodeClass.Simt,
+            MathOperation.multiply_add,
+            DataType.s8,
+            DataType.s8,
+        ),
+    ]
+
+    min_cc = 50
+    max_cc = 1024
+
+    dst_layouts = [LayoutType.TensorNCHW]
+    dst_types = [DataType.f32]
+
+    if conv_kind == ConvKind.Wgrad:
+        alignment_constraints = [32]
+    else:
+        alignment_constraints = [128, 32]
+
+    operations = []
+    for math_inst in math_instructions:
+        tile_descriptions = [
+            TileDescription([128, 128, 8], 1, [4, 2, 1], math_inst, min_cc, max_cc),
+            TileDescription([128, 64, 8], 1, [2, 2, 1], math_inst, min_cc, max_cc),
+            TileDescription([64, 128, 8], 1, [2, 2, 1], math_inst, min_cc, max_cc),
+            TileDescription([128, 32, 8], 1, [2, 1, 1], math_inst, min_cc, max_cc),
+            TileDescription([32, 128, 8], 1, [1, 2, 1], math_inst, min_cc, max_cc),
+            TileDescription([64, 64, 8], 1, [2, 1, 1], math_inst, min_cc, max_cc),
+            TileDescription([32, 64, 8], 1, [1, 1, 1], math_inst, min_cc, max_cc),
+            TileDescription([64, 32, 8], 1, [1, 1, 1], math_inst, min_cc, max_cc),
+            TileDescription([32, 32, 8], 1, [1, 1, 1], math_inst, min_cc, max_cc),
+        ]
+        for warpsPerThreadblock in warpsPerThreadblocks:
+            for warpShape in warpShapes:
+                warpThreadsM = 0
+                if warpShape[0] > warpShape[1]:
+                    warpThreadsM = 8
+                else:
+                    warpThreadsM = 4
+                warpThreadsN = warpNumThreads / warpThreadsM
+
+                # skip shapes with conflicting rectangularity
+                # they are unlikely to be fastest
+                blockG = warpsPerThreadblock[0] > warpsPerThreadblock[1]
+                blockL = warpsPerThreadblock[0] < warpsPerThreadblock[1]
+                warpG = warpShape[0] > warpShape[1]
+                warpL = warpShape[0] < warpShape[1]
+                blockG2 = warpsPerThreadblock[0] > warpsPerThreadblock[1] * 2
+                blockL2 = warpsPerThreadblock[0] * 2 < warpsPerThreadblock[1]
+                warpG2 = warpShape[0] > warpShape[1] * 2
+                warpL2 = warpShape[0] * 2 < warpShape[1]
+                if blockG2 and warpL:
+                    continue
+                if blockL2 and warpG:
+                    continue
+                if warpG2 and blockL:
+                    continue
+                if warpL2 and blockG:
+                    continue
+
+                # check threadblock ratios and max
+                threadblockTile = [
+                    warpShape[0] * warpsPerThreadblock[0],
+                    warpShape[1] * warpsPerThreadblock[1],
+                ]
+                if threadblockTile[0] * threadblockTile[1] > threadblockMaxElements:
+                    continue
+                if threadblockTile[0] > threadblockEdgeMax:
+                    continue
+                if threadblockTile[1] > threadblockEdgeMax:
+                    continue
+                totalThreads = (
+                    warpNumThreads * warpsPerThreadblock[0] * warpsPerThreadblock[1]
+                )
+
+                # calculate unroll
+                # ensure that every iteration at least a full load of A,B are done
+                unrollMin = 8
+                unrollMin0 = totalThreads // threadblockTile[0]
+                unrollMin1 = totalThreads // threadblockTile[1]
+                unroll = max(unrollMin, unrollMin0, unrollMin1)
+
+                threadTileM = warpShape[0] // warpThreadsM
+                threadTileN = warpShape[1] // warpThreadsN
+                if threadTileM < 2 or threadTileN < 2:
+                    continue
+                if threadTileM * threadTileN * precisionBits > 8 * 8 * 32:
+                    continue
+
+                # epilogue currently only supports N < WarpNumThreads
+                if threadblockTile[1] < warpNumThreads:
+                    continue
+
+                # limit smem
+                smemBitsA = threadblockTile[0] * unroll * 2 * precisionBits
+                smemBitsB = threadblockTile[1] * unroll * 2 * precisionBits
+                smemKBytes = (smemBitsA + smemBitsB) / 8 / 1024
+                if smemKBytes > 48:
+                    continue
+
+                tile = TileDescription(
+                    [threadblockTile[0], threadblockTile[1], unroll],
+                    1,
+                    [
+                        threadblockTile[0] // warpShape[0],
+                        threadblockTile[1] // warpShape[1],
+                        1,
+                    ],
+                    math_inst,
+                    min_cc,
+                    max_cc,
+                )
+
+                def filter(t: TileDescription) -> bool:
+                    nonlocal tile
+                    return (
+                        t.threadblock_shape[0] == tile.threadblock_shape[0]
+                        and t.threadblock_shape[1] == tile.threadblock_shape[1]
+                        and t.threadblock_shape[2] == tile.threadblock_shape[2]
+                        and t.warp_count[0] == tile.warp_count[0]
+                        and t.warp_count[1] == tile.warp_count[1]
+                        and t.warp_count[2] == tile.warp_count[2]
+                        and t.stages == tile.stages
+                    )
+
+                if not any(t for t in tile_descriptions if filter(t)):
+                    continue
+
+                for layout in layouts:
+                    for dst_type, dst_layout in zip(dst_types, dst_layouts):
+                        for alignment_src in alignment_constraints:
+                            operations += GenerateConv2d(
+                                ConvType.RegionRestrictedConvolution,
+                                conv_kind,
+                                [tile],
+                                layout[0],
+                                layout[1],
+                                dst_layout,
+                                dst_type,
+                                min_cc,
+                                alignment_src,
+                                32,
+                                32,
+                                SpecialOptimizeDesc.NoneSpecialOpt,
+                                ImplicitGemmMode.GemmNT
+                                if conv_kind == ConvKind.Wgrad
+                                else ImplicitGemmMode.GemmTN,
+                            )
+    return operations
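
The pruning in `GenerateRegionRestrictedconv2d_Simt` follows the classic CUTLASS SIMT heuristics; a worked pass over one surviving candidate makes the arithmetic concrete (values chosen to match the [128, 128, 8] tile above):

```python
# one surviving candidate: warpsPerThreadblock = [4, 2], warpShape = [32, 64]
warpNumThreads = 32
threadblockTile = [32 * 4, 64 * 2]       # -> [128, 128]
totalThreads = warpNumThreads * 4 * 2    # -> 256
# unroll: every iteration should at least fully load the A and B tiles
unroll = max(8, 256 // 128, 256 // 128)  # -> 8, matching the [128, 128, 8] tile
# double-buffered (2x) shared memory for fp32 (32-bit) operands
smemKBytes = (128 * 8 * 2 * 32 + 128 * 8 * 2 * 32) / 8 / 1024
assert smemKBytes == 16.0 and smemKBytes <= 48  # fits the 48 KB smem budget
```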
#
def GenerateDwconv2d_TensorOp_884(args, conv_kind):
    layouts = [(LayoutType.TensorNCHW, LayoutType.TensorNCHW)]
@@ -1644,6 +1856,14 @@ def GenerateDwconv2dWgradOperations(args):
    return GenerateDwconv2d_TensorOp_884(args, ConvKind.Wgrad)

+def GenerateRegionRestrictedconv2dWgradOperations(args):
+    assert args.type == "simt", (
+        "operation RegionRestrictedconv2d wgrad only supports "
+        "simt (got: {})".format(args.type)
+    )
+    return GenerateRegionRestrictedconv2d_Simt(args, ConvKind.Wgrad)
+
def GenerateGemmOperations(args):
    if args.type == "tensorop884":
        return GeneratesGemm_TensorOp_884(args)
@@ -1698,6 +1918,8 @@ def ConcatFile(
        sub_string_1 = sub_string_2 = "simt"
    if "dwconv2d_" in operations:
        filtered_operations = operations[:2] + operations[9:]
+    if "rrconv2d_" in operations:
+        filtered_operations = operations[:2] + operations[9:]
    elif ("conv2d" in operations) or ("deconv" in operations):
        filtered_operations = "cutlass"
    else:
@@ -1893,6 +2115,7 @@ if __name__ == "__main__":
            "dwconv2d_fprop",
            "dwconv2d_dgrad",
            "dwconv2d_wgrad",
+           "rrconv2d_wgrad",
        ],
        required=True,
        help="Specifies the operation to generate (gemm, gemv, conv2d, deconv, dwconv2d_fprop, dwconv2d_dgrad, dwconv2d_wgrad)",
@@ -1928,9 +2151,11 @@ if __name__ == "__main__":
        operations = GenerateDwconv2dFpropOperations(args)
    elif args.operations == "dwconv2d_dgrad":
        operations = GenerateDwconv2dDgradOperations(args)
-    else:
-        assert args.operations == "dwconv2d_wgrad", "invalid operation"
+    elif args.operations == "dwconv2d_wgrad":
        operations = GenerateDwconv2dWgradOperations(args)
+    else:
+        assert args.operations == "rrconv2d_wgrad", "invalid operation"
+        operations = GenerateRegionRestrictedconv2dWgradOperations(args)

    if (
        args.operations == "conv2d"
required_cuda_ver_minor, | required_cuda_ver_minor, | ||||
epilogue, | epilogue, | ||||
) | ) | ||||
elif args.operations == "rrconv2d_wgrad": | |||||
for operation in operations: | |||||
with EmitRegionRestrictedConvSingleKernelWrapper( | |||||
args.output, operation, short_path | |||||
) as emitter: | |||||
emitter.emit() | |||||
head = EmitRegionRestrictedConvSingleKernelWrapper( | |||||
args.output, operations[0], short_path | |||||
).header_template | |||||
required_cuda_ver_major = operations[0].required_cuda_ver_major | |||||
required_cuda_ver_minor = operations[0].required_cuda_ver_minor | |||||
epilogue = EmitRegionRestrictedConvSingleKernelWrapper( | |||||
args.output, operations[0], short_path | |||||
).epilogue_template | |||||
if "tensorop" in args.type: | |||||
ConcatFile( | |||||
4, | |||||
args.output, | |||||
args.operations, | |||||
args.type, | |||||
head, | |||||
required_cuda_ver_major, | |||||
required_cuda_ver_minor, | |||||
epilogue, | |||||
) | |||||
else: | |||||
ConcatFile( | |||||
2, | |||||
args.output, | |||||
args.operations, | |||||
args.type, | |||||
head, | |||||
required_cuda_ver_major, | |||||
required_cuda_ver_minor, | |||||
epilogue, | |||||
) | |||||
elif args.operations == "gemm": | elif args.operations == "gemm": | ||||
for operation in operations: | for operation in operations: | ||||
with EmitGemmSingleKernelWrapper( | with EmitGemmSingleKernelWrapper( | ||||
@@ -532,6 +532,7 @@ class ConvType(enum.Enum):
    Local = enum_auto()
    LocalShare = enum_auto()
    DepthwiseConvolution = enum_auto()
+    RegionRestrictedConvolution = enum_auto()

ConvTypeTag = {
@@ -540,6 +541,8 @@ ConvTypeTag = {
    ConvType.Local: "cutlass::conv::ConvType::kLocal",
    ConvType.LocalShare: "cutlass::conv::ConvType::kLocalShare",
    ConvType.DepthwiseConvolution: "cutlass::conv::ConvType::kDepthwiseConvolution",
+    # RegionRestrictedConvolution uses the same conv type tag as Depthwise
+    ConvType.RegionRestrictedConvolution: "cutlass::conv::ConvType::kDepthwiseConvolution",
}

#
@@ -640,6 +643,8 @@ class MathInstruction:
        element_accumulator,
        opcode_class,
        math_operation=MathOperation.multiply_add,
+        element_rin=DataType.s32,
+        element_rout=DataType.s32,
    ):
        self.instruction_shape = instruction_shape
        self.element_a = element_a
@@ -647,6 +652,8 @@ class MathInstruction:
        self.element_accumulator = element_accumulator
        self.opcode_class = opcode_class
        self.math_operation = math_operation
+        self.element_rin = element_rin
+        self.element_rout = element_rout

#
@@ -85,4 +85,7 @@ cutlass_gen_list = [
    "dwconv2d_wgrad_tensorop884_2.cu",
    "dwconv2d_wgrad_tensorop884_3.cu",
    "all_dwconv2d_wgrad_tensorop884_operations.cu",
+    "rrconv2d_wgrad_simt_0.cu",
+    "rrconv2d_wgrad_simt_1.cu",
+    "all_rrconv2d_wgrad_simt_operations.cu",
]
@@ -188,6 +188,7 @@ if(MGE_WITH_CUDA)
    gen_cutlass_kimpl(dwconv2d_dgrad tensorop884 CUTLASS_SOURCES)
    gen_cutlass_kimpl(dwconv2d_wgrad simt CUTLASS_SOURCES)
    gen_cutlass_kimpl(dwconv2d_wgrad tensorop884 CUTLASS_SOURCES)
+   gen_cutlass_kimpl(rrconv2d_wgrad simt CUTLASS_SOURCES)
    list(PREPEND CUSOURCES ${CUTLASS_SOURCES})

    # Compile the following file first, the priority_compile_opr.txt is generated by
@@ -452,6 +452,86 @@ public:

///////////////////////////////////////////////////////////////////////////////////////////////////

+template <typename Operator_>
+class RegionRestrictedConvolutionBackwardFilterOperation
+        : public ConvolutionBackwardFilterOperationBase<Operator_> {
+public:
+    using Operator = Operator_;
+    using ElementSrc = typename Operator::ElementSrc;
+    using LayoutSrc = typename Operator::LayoutSrc;
+    using ElementDiff = typename Operator::ElementDiff;
+    using LayoutDiff = typename Operator::LayoutDiff;
+    using ElementGrad = typename Operator::ElementGrad;
+    using LayoutGrad = typename Operator::LayoutGrad;
+    using ElementAccumulator = typename Operator::ElementAccumulator;
+    using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute;
+    using OperatorArguments = typename Operator::Arguments;
+    using ElementRin = typename Operator::ElementMaskInput;
+    using LayoutRin = typename Operator::LayoutMaskInput;
+    using ElementRout = typename Operator::ElementMaskOutput;
+    using LayoutRout = typename Operator::LayoutMaskOutput;
+
+    RegionRestrictedConvolutionBackwardFilterOperation(
+            char const* name = "unknown_gemm")
+            : ConvolutionBackwardFilterOperationBase<Operator_>(name) {
+        /// rin in description -> rin in C++ template
+        this->m_description.rin = make_TensorDescription<ElementRin, LayoutRin>(
+                Operator::kAlignmentMaskInput);
+        /// rout in description -> rout in C++ template
+        this->m_description.rout = make_TensorDescription<ElementRout, LayoutRout>(
+                Operator::kAlignmentMaskOutput);
+        this->m_description.without_shared_load = false;
+    }
+
+    virtual Status run(
+            void const* arguments_ptr, void* device_workspace = nullptr,
+            cudaStream_t stream = nullptr) const {
+        cutlass::conv::Operator conv_op = this->m_description.conv_op;
+        ConvolutionArguments const* conv_args =
+                reinterpret_cast<ConvolutionArguments const*>(arguments_ptr);
+        const auto& ps = conv_args->problem_size;
+
+        OperatorArguments args;
+        args.problem_size = ps;
+        /// src in convolution arguments -> ref_src
+        args.ref_src = {
+                static_cast<ElementSrc*>(const_cast<void*>(conv_args->src)),
+                LayoutSrc::packed(implicit_gemm_tensor_b_extent(conv_op, ps))};
+        /// filter in convolution arguments -> ref_diff
+        args.ref_diff = {
+                static_cast<ElementDiff*>(const_cast<void*>(conv_args->filter)),
+                LayoutDiff::packed(implicit_gemm_tensor_a_extent(conv_op, ps))};
+        /// dst in convolution arguments -> ref_grad
+        args.ref_grad = {
+                static_cast<ElementGrad*>(conv_args->dst),
+                LayoutGrad::packed(implicit_gemm_tensor_c_extent(conv_op, ps))};
+        /// rin in convolution arguments -> ref_mask_input
+        args.ref_mask_input = {
+                static_cast<ElementRin*>(const_cast<void*>(conv_args->rin)),
+                LayoutRin::packed(implicit_gemm_tensor_rin_extent(conv_op, ps))};
+        /// rout in convolution arguments -> ref_mask_output
+        args.ref_mask_output = {
+                static_cast<ElementRout*>(const_cast<void*>(conv_args->rout)),
+                LayoutRout::packed(implicit_gemm_tensor_rout_extent(conv_op, ps))};
+
+        args.output_op = init_epilogue_param<typename Operator::EpilogueOutputOp>().get(
+                conv_args);
+
+        Operator op;
+        Status status = op.initialize(args, device_workspace);
+        if (status != Status::kSuccess) {
+            return status;
+        }
+        return op.run(stream);
+    }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
}  // namespace library
}  // namespace cutlass
@@ -50,6 +50,7 @@ void initialize_all_deconv_simt_operations(Manifest& manifest);
void initialize_all_dwconv2d_fprop_simt_operations(Manifest& manifest);
void initialize_all_dwconv2d_dgrad_simt_operations(Manifest& manifest);
void initialize_all_dwconv2d_wgrad_simt_operations(Manifest& manifest);
+void initialize_all_rrconv2d_wgrad_simt_operations(Manifest& manifest);
#if defined(CUTLASS_ARCH_MMA_SM70_SUPPORTED) && CUTLASS_ARCH_MMA_SM70_SUPPORTED
void initialize_all_gemm_tensorop884_operations(Manifest& manifest);
void initialize_all_dwconv2d_fprop_tensorop884_operations(Manifest& manifest);
@@ -70,6 +71,7 @@ void initialize_all(Manifest& manifest) {
    initialize_all_dwconv2d_fprop_simt_operations(manifest);
    initialize_all_dwconv2d_dgrad_simt_operations(manifest);
    initialize_all_dwconv2d_wgrad_simt_operations(manifest);
+   initialize_all_rrconv2d_wgrad_simt_operations(manifest);
#if defined(CUTLASS_ARCH_MMA_SM70_SUPPORTED) && CUTLASS_ARCH_MMA_SM70_SUPPORTED
    initialize_all_gemm_tensorop884_operations(manifest);
    initialize_all_dwconv2d_fprop_tensorop884_operations(manifest);
@@ -471,6 +471,10 @@ struct ConvolutionDescription : public OperationDescription {
    conv::SpecialOptimizeDesc special_optimization;
    conv::ImplicitGemmMode gemm_mode;
    bool without_shared_load;
+    // only used by rrconv
+    TensorDescription rin;
+    TensorDescription rout;
};

/////////////////////////////////////////////////////////////////////////////////////////////////
@@ -499,6 +503,10 @@ struct ConvolutionArguments {
    /// Host pointer to extra param struct
    void const* extra_param;
+
+    // only used by rrconv, default: nullptr
+    void const* rin = nullptr;
+    void const* rout = nullptr;
};

/////////////////////////////////////////////////////////////////////////////////////////////////
@@ -118,6 +118,11 @@ ConvolutionKey get_convolution_key_from_desc(const ConvolutionDescription& desc)
    key.alignment_filter = desc.filter.alignment;
    key.without_shared_load = desc.without_shared_load;
+    key.element_rin = desc.rin.element;
+    key.layout_rin = desc.rin.layout;
+    key.element_rout = desc.rout.element;
+    key.layout_rout = desc.rout.layout;

    return key;
}
@@ -201,6 +201,12 @@ struct ConvolutionKey {
    bool without_shared_load;

+    // only used by rrconv
+    library::NumericTypeID element_rin = library::NumericTypeID::kInvalid;
+    library::LayoutTypeID layout_rin = library::LayoutTypeID::kInvalid;
+    library::NumericTypeID element_rout = library::NumericTypeID::kInvalid;
+    library::LayoutTypeID layout_rout = library::LayoutTypeID::kInvalid;
+
    inline bool operator==(ConvolutionKey const& rhs) const {
        return (conv_op == rhs.conv_op) && (element_src == rhs.element_src) &&
               (layout_src == rhs.layout_src) &&
@@ -223,7 +229,9 @@ struct ConvolutionKey {
               (special_optimization == rhs.special_optimization) &&
               (alignment_src == rhs.alignment_src) &&
               (alignment_filter == rhs.alignment_filter) &&
-               (without_shared_load == rhs.without_shared_load);
+               (without_shared_load == rhs.without_shared_load) &&
+               (element_rin == rhs.element_rin) && (layout_rin == rhs.layout_rin) &&
+               (element_rout == rhs.element_rout) && (layout_rout == rhs.layout_rout);
    }

    inline bool operator!=(ConvolutionKey const& rhs) const { return !(*this == rhs); }
@@ -260,7 +268,11 @@ struct ConvolutionKey {
               "\n special_optimization: " + to_string(special_optimization) +
               "\n alignment_src: " + std::to_string(alignment_src) +
               "\n alignment_filter: " + std::to_string(alignment_filter) +
-               "\n without_shared_load: " + to_string(without_shared_load) + "\n}";
+               "\n without_shared_load: " + to_string(without_shared_load) +
+               "\n element_rin: " + to_string(element_rin) +
+               "\n layout_rin: " + to_string(layout_rin) +
+               "\n element_rout: " + to_string(element_rout) +
+               "\n layout_rout: " + to_string(layout_rout) + "\n}";
    }
};

@@ -293,6 +305,10 @@ struct ConvolutionKeyHasher {
            .update(&key.alignment_src, sizeof(key.alignment_src))
            .update(&key.alignment_filter, sizeof(key.alignment_filter))
            .update(&key.without_shared_load, sizeof(key.without_shared_load))
+            .update(&key.element_rin, sizeof(key.element_rin))
+            .update(&key.layout_rin, sizeof(key.layout_rin))
+            .update(&key.element_rout, sizeof(key.element_rout))
+            .update(&key.layout_rout, sizeof(key.layout_rout))
            .digest();
    }
};
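
Note that rin/rout participate in `operator==`, `to_string`, and the hasher together, keeping equality and hashing consistent; a compact Python analogue of that invariant (illustrative only, not MegDNN code):

```python
from dataclasses import dataclass

@dataclass(frozen=True)
class ConvKey:
    conv_op: str
    element_src: str
    # ...remaining key fields elided...
    element_rin: str = "invalid"   # mask fields default to an invalid marker
    layout_rin: str = "invalid"    # so keys for non-rrconv kernels are unaffected
    element_rout: str = "invalid"
    layout_rout: str = "invalid"

# A frozen dataclass derives __eq__ and __hash__ from the same field tuple,
# the consistency the C++ key maintains by hand across ==, to_string and hash.
a = ConvKey("wgrad", "f32", element_rin="s32", layout_rin="nchw")
b = ConvKey("wgrad", "f32", element_rin="s32", layout_rin="nchw")
assert a == b and hash(a) == hash(b)
```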
@@ -1,4 +1,5 @@
#include "src/cuda/region_restricted_convolution/opr_impl.h"
+#include "src/cuda/cutlass/singleton.h"
#include "src/cuda/region_restricted_convolution/chanwise/depthwise_large_filter.cuh"
#include "src/cuda/region_restricted_convolution/chanwise/kern.cuh"
#include "src/cuda/utils.h"
@@ -6,6 +7,7 @@
using namespace megdnn;
using namespace cuda;
using namespace region_restricted_convolution;
+using namespace cutlass::library;

/* ============== RegionRestrictedConvolutionForwardImpl ============== */
void RegionRestrictedConvolutionForwardImpl::exec(
@@ -113,7 +115,137 @@ size_t RegionRestrictedConvolutionBackwardFilterImpl::get_workspace_in_bytes(
void RegionRestrictedConvolutionBackwardFilterImpl::exec(
        _megdnn_tensor_in src, _megdnn_tensor_in diff, _megdnn_tensor_in rin,
        _megdnn_tensor_in rout, _megdnn_tensor_out grad, _megdnn_workspace workspace) {
-    megdnn_throw("Region Restricted Conv BackwardFilter unimplemented");
+    auto fm = check_exec(
+            src.layout, diff.layout, rin.layout, rout.layout, grad.layout,
+            workspace.size);
+    megdnn_assert(
+            fm.group > 1 && src.layout.dtype.category() == DTypeCategory::FLOAT &&
+            param().compute_mode == Param::ComputeMode::DEFAULT &&
+            fm.spatial_ndim == 2 && fm.icpg == 1 && fm.ocpg == 1 &&
+            fm.dilation[0] == 1 && fm.dilation[1] == 1 && !fm.should_flip &&
+            param().stride_h == 1 && param().stride_w == 1);
+
+    int hi = src.layout.operator[](2), wi = src.layout.operator[](3);
+    int n = diff.layout.operator[](0), ho = diff.layout.operator[](2),
+        wo = diff.layout.operator[](3);
+    int co = fm.group, ci = co, groups = co;
+    int fh = fm.spatial[0], fw = fm.spatial[1];
+    int sh = fm.stride[0], sw = fm.stride[1];
+    int ph = fm.padding[0], pw = fm.padding[1];
+    int dh = 0, dw = 0;
+
+    // check if channelwise convolution
+    megdnn_assert(fm.icpg == 1 && fm.ocpg == 1);
+    auto stream = cuda_stream(handle());
+
+    float alpha = 1.f;
+    float beta = 0.f;
+
+    ConvolutionKey key;
+    int threadblock_shape_n = 128;
+    int warp_shape_m = 32;
+    int warp_shape_n = 64;
+    if (grad.layout.operator[](3) % 8 < 4) {
+        threadblock_shape_n = 64;
+        warp_shape_m = 64;
+        warp_shape_n = 32;
+    }
+    if (rin.layout.dtype == dtype::Int32() && rout.layout.dtype == dtype::Int32()) {
+        key = {
+                cutlass::conv::Operator::kWgrad,
+                NumericTypeID::kF32,
+                LayoutTypeID::kTensorNCHW,
+                NumericTypeID::kF32,
+                LayoutTypeID::kTensorNCHW,
+                NumericTypeID::kF32,
+                LayoutTypeID::kTensorNCHW,
+                NumericTypeID::kF32,
+                LayoutTypeID::kTensorNCHW,
+                NumericTypeID::kF32,
+                cutlass::conv::ConvType::kDepthwiseConvolution,
+                128,
+                threadblock_shape_n,
+                8,
+                warp_shape_m,
+                warp_shape_n,
+                8,
+                1,
+                1,
+                1,
+                cutlass::epilogue::EpilogueType::kLinearCombination,
+                1,
+                cutlass::conv::SpecialOptimizeDesc::NONE,
+                1,
+                1,
+                false,
+                NumericTypeID::kS32,
+                LayoutTypeID::kTensorNCHW,
+                NumericTypeID::kS32,
+                LayoutTypeID::kTensorNCHW,
+        };
+    } else if (
+            rin.layout.dtype == dtype::Uint8() && rout.layout.dtype == dtype::Uint8()) {
+        key = {
+                cutlass::conv::Operator::kWgrad,
+                NumericTypeID::kF32,
+                LayoutTypeID::kTensorNCHW,
+                NumericTypeID::kF32,
+                LayoutTypeID::kTensorNCHW,
+                NumericTypeID::kF32,
+                LayoutTypeID::kTensorNCHW,
+                NumericTypeID::kF32,
+                LayoutTypeID::kTensorNCHW,
+                NumericTypeID::kF32,
+                cutlass::conv::ConvType::kDepthwiseConvolution,
+                128,
+                threadblock_shape_n,
+                8,
+                warp_shape_m,
+                warp_shape_n,
+                8,
+                1,
+                1,
+                1,
+                cutlass::epilogue::EpilogueType::kLinearCombination,
+                1,
+                cutlass::conv::SpecialOptimizeDesc::NONE,
+                1,
+                1,
+                false,
+                NumericTypeID::kS8,
+                LayoutTypeID::kTensorNCHW,
+                NumericTypeID::kS8,
+                LayoutTypeID::kTensorNCHW,
+        };
+    } else {
+        megdnn_throw(ssprintf(
+                             "unsupported region restricted type rin: %s, rout: %s",
+                             rin.layout.dtype.name(), rout.layout.dtype.name())
+                             .c_str());
+    }
+
+    const Operation* op =
+            (const Operation*)Singleton::get().operation_table.find_op(key);
+
+    cutlass::conv::Conv2dProblemSize problem_size{
+            n, hi, wi, ci, co, fh, fw, ho,
+            wo, ph, pw, sh, sw, dh, dw, cutlass::conv::Mode::kCrossCorrelation,
+            1,       // split k slices, always 1
+            groups,  // groups
+    };
+
+    cutlass::library::ConvolutionArguments conv_args{
+            problem_size, src.raw_ptr(), diff.raw_ptr(), nullptr,
+            nullptr,      grad.raw_ptr(), &alpha,        &beta,
+            nullptr,      nullptr,        nullptr,       nullptr,
+            nullptr,      nullptr,        rin.raw_ptr(), rout.raw_ptr()};
+
+    cutlass_check(op->run(&conv_args, nullptr, stream));
+    after_kernel_launch();
}

// vim: syntax=cpp.doxygen
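
The backward-filter entry point selects a kernel by building a `ConvolutionKey` that must match one emitted by the generator; the only runtime degrees of freedom are the mask dtype and a threadblock shape picked from the grad width. A small Python sketch of that shape heuristic (illustrative; the names are not MegDNN API):

```python
def pick_tile(grad_w: int) -> tuple:
    """Mirror of the C++ heuristic above: poorly 8-aligned widths get a
    128x64 threadblock with 64x32 warps, otherwise 128x128 with 32x64."""
    if grad_w % 8 < 4:
        return (128, 64, 8), (64, 32, 8)
    return (128, 128, 8), (32, 64, 8)

assert pick_tile(35) == ((128, 64, 8), (64, 32, 8))   # 35 % 8 == 3 -> narrow path
assert pick_tile(36) == ((128, 128, 8), (32, 64, 8))  # 36 % 8 == 4 -> wide path
```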
@@ -465,6 +465,206 @@ TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER_UINT8) {
    run_bench(64, 384, 32, 32, 31, 31, 1, 1, 10);
}

+TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_FILTER_FP32) {
+    require_compute_capability(7, 5);
+    Benchmarker<ConvolutionBackwardFilter> bencher(handle_cuda());
+    bencher.set_display(false);
+    bencher.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
+            "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_128X128X8_32X64X8_2stage"));
+
+    Benchmarker<RegionRestrictedConvolutionBackwardFilter> rr_bencher(handle_cuda());
+    rr_bencher.set_display(false);
+
+    ConvolutionBackwardFilter::Param param;
+    param.format = ConvolutionBackwardFilter::Param::Format::NCHW;
+    param.sparse = ConvolutionBackwardFilter::Param::Sparse::GROUP;
+
+    RegionRestrictedConvolutionBackwardFilter::Param rr_param;
+    rr_param.format = RegionRestrictedConvolutionBackwardFilter::Param::Format::NCHW;
+    rr_param.sparse = RegionRestrictedConvolutionBackwardFilter::Param::Sparse::GROUP;
+
+    UniformIntRNG r_rng{1, 3};
+
+    auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
+                         size_t fw, size_t sh, size_t sw, size_t nr_times) {
+        param.pad_h = fh / 2;
+        param.pad_w = fw / 2;
+        param.stride_h = sh;
+        param.stride_w = sw;
+        rr_param.pad_h = fh / 2;
+        rr_param.pad_w = fw / 2;
+        rr_param.stride_h = sh;
+        rr_param.stride_w = sw;
+
+        bencher.set_param(param)
+                .set_dtype(0, dtype::Float32())
+                .set_dtype(1, dtype::Float32())
+                .set_dtype(2, dtype::Float32())
+                .set_dtype(4, dtype::Float32());
+        bencher.proxy()->target_execution_policy = {};
+        bencher.set_times(nr_times);
+
+        rr_bencher.set_param(rr_param)
+                .set_dtype(0, dtype::Float32())
+                .set_dtype(1, dtype::Float32())
+                .set_dtype(2, dtype::Int32())
+                .set_dtype(3, dtype::Int32());
+        rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng);
+        rr_bencher.set_times(nr_times);
+
+        size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
+        size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
+        TensorShape src{batch, g, hi, wi}, diff{batch, g, ho, wo}, rin{batch, hi, wi},
+                rout{batch, ho, wo}, grad{g, 1, 1, fh, fw};
+
+        float bandwidth = static_cast<float>(
+                                  src.total_nr_elems() + diff.total_nr_elems() +
+                                  grad.total_nr_elems()) /
+                          (1024 * 1024 * 1024) * 1e3;
+        float rr_bandwidth = static_cast<float>(
+                                     src.total_nr_elems() + diff.total_nr_elems() +
+                                     rin.total_nr_elems() + rout.total_nr_elems() +
+                                     grad.total_nr_elems()) /
+                             (1024 * 1024 * 1024) * 1e3;
+
+        auto time_in_ms = bencher.execs({src, diff, grad}) / nr_times;
+        auto ops = 2.0 * batch * g * hi * wi * fh * fw / (time_in_ms * 1e-3) * 1e-12;
+        auto rr_time_in_ms = rr_bencher.execs({src, diff, rin, rout, grad}) / nr_times;
+        auto rr_ops =
+                2.0 * batch * g * hi * wi * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
+        printf("[WGRAD]RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: "
+               "src=%s, "
+               "diff=%s, grad=%s\n"
+               "time: %.2f ms, time(rr): %.2f ms, perf: %.2f Tops, perf(rr): %.2f Tops\n"
+               "bandwidth: %.2fGB/s, bandwidth(rr): %.2fGB/s, speedup: %.2f.\n",
+               src.to_string().c_str(), diff.to_string().c_str(),
+               grad.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
+               bandwidth * 4 / time_in_ms, rr_bandwidth * 4 / rr_time_in_ms,
+               time_in_ms / rr_time_in_ms);
+    };
+
+    run_bench(64, 384, 32, 32, 3, 3, 1, 1, 1000);
+    run_bench(64, 384, 32, 32, 5, 5, 1, 1, 1000);
+    run_bench(64, 384, 32, 32, 7, 7, 1, 1, 1000);
+    run_bench(64, 384, 32, 32, 9, 9, 1, 1, 1000);
+    run_bench(64, 384, 32, 32, 11, 11, 1, 1, 1000);
+    run_bench(64, 384, 32, 32, 13, 13, 1, 1, 1000);
+    run_bench(64, 384, 32, 32, 15, 15, 1, 1, 1000);
+    run_bench(64, 384, 32, 32, 17, 17, 1, 1, 1000);
+    run_bench(64, 384, 32, 32, 19, 19, 1, 1, 1000);
+    run_bench(64, 384, 32, 32, 21, 21, 1, 1, 1000);
+    run_bench(64, 384, 32, 32, 23, 23, 1, 1, 1000);
+    run_bench(64, 384, 32, 32, 25, 25, 1, 1, 1000);
+    run_bench(64, 384, 32, 32, 27, 27, 1, 1, 1000);
+    run_bench(64, 384, 32, 32, 29, 29, 1, 1, 1000);
+    run_bench(64, 384, 32, 32, 31, 31, 1, 1, 1000);
+}
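
The printed Tops and GB/s follow directly from the shapes; a standalone sanity check of the formulas for the largest configuration above (with a hypothetical 10 ms timing):

```python
batch, g, hi, wi, fh, fw = 64, 384, 32, 32, 31, 31
time_in_ms = 10.0  # hypothetical measurement

# 2 FLOPs (multiply + accumulate) per filter-gradient contribution
tops = 2.0 * batch * g * hi * wi * fh * fw / (time_in_ms * 1e-3) * 1e-12
print(f"perf: {tops:.2f} Tops")  # -> 4.84 Tops at 10 ms for this shape

# traffic: fp32 element counts scaled by 4 bytes, as in the printf above
elems = batch * g * hi * wi * 2 + g * fh * fw  # src + diff + grad (ho == hi here)
print(f"bandwidth: {elems * 4 / 1024**3 / (time_in_ms * 1e-3):.2f} GB/s")
```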
+TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_FILTER_FP32_RINT8) {
+    require_compute_capability(7, 5);
+    Benchmarker<ConvolutionBackwardFilter> bencher(handle_cuda());
+    bencher.set_display(false);
+    bencher.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
+            "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_128X128X8_32X64X8_2stage"));
+
+    Benchmarker<RegionRestrictedConvolutionBackwardFilter> rr_bencher(handle_cuda());
+    rr_bencher.set_display(false);
+
+    ConvolutionBackwardFilter::Param param;
+    param.format = ConvolutionBackwardFilter::Param::Format::NCHW;
+    param.sparse = ConvolutionBackwardFilter::Param::Sparse::GROUP;
+
+    RegionRestrictedConvolutionBackwardFilter::Param rr_param;
+    rr_param.format = RegionRestrictedConvolutionBackwardFilter::Param::Format::NCHW;
+    rr_param.sparse = RegionRestrictedConvolutionBackwardFilter::Param::Sparse::GROUP;
+
+    UniformIntRNG r_rng{1, 3};
+
+    auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
+                         size_t fw, size_t sh, size_t sw, size_t nr_times) {
+        param.pad_h = fh / 2;
+        param.pad_w = fw / 2;
+        param.stride_h = sh;
+        param.stride_w = sw;
+        rr_param.pad_h = fh / 2;
+        rr_param.pad_w = fw / 2;
+        rr_param.stride_h = sh;
+        rr_param.stride_w = sw;
+
+        bencher.set_param(param)
+                .set_dtype(0, dtype::Float32())
+                .set_dtype(1, dtype::Float32())
+                .set_dtype(2, dtype::Float32())
+                .set_dtype(4, dtype::Float32());
+        bencher.proxy()->target_execution_policy = {};
+        bencher.set_times(nr_times);
+
+        rr_bencher.set_param(rr_param)
+                .set_dtype(0, dtype::Float32())
+                .set_dtype(1, dtype::Float32())
+                .set_dtype(2, dtype::Uint8())
+                .set_dtype(3, dtype::Uint8());
+        rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng);
+        rr_bencher.set_times(nr_times);
+
+        size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
+        size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
+        TensorShape src{batch, g, hi, wi}, diff{batch, g, ho, wo}, rin{batch, hi, wi},
+                rout{batch, ho, wo}, grad{g, 1, 1, fh, fw};
+
+        float bandwidth = static_cast<float>(
+                                  src.total_nr_elems() + diff.total_nr_elems() +
+                                  grad.total_nr_elems()) /
+                          (1024 * 1024 * 1024) * 1e3;
+        float rr_bandwidth = static_cast<float>(
+                                     src.total_nr_elems() + diff.total_nr_elems() +
+                                     rin.total_nr_elems() + rout.total_nr_elems() +
+                                     grad.total_nr_elems()) /
+                             (1024 * 1024 * 1024) * 1e3;
+
+        auto time_in_ms = bencher.execs({src, diff, grad}) / nr_times;
+        auto ops = 2.0 * batch * g * hi * wi * fh * fw / (time_in_ms * 1e-3) * 1e-12;
+        auto rr_time_in_ms = rr_bencher.execs({src, diff, rin, rout, grad}) / nr_times;
+        auto rr_ops =
+                2.0 * batch * g * hi * wi * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
+        printf("[WGRAD]RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: "
+               "src=%s, "
+               "diff=%s, grad=%s\n"
+               "time: %.2f ms, time(rr): %.2f ms, perf: %.2f Tops, perf(rr): %.2f Tops\n"
+               "bandwidth: %.2fGB/s, bandwidth(rr): %.2fGB/s, speedup: %.2f.\n",
+               src.to_string().c_str(), diff.to_string().c_str(),
+               grad.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
+               bandwidth * 4 / time_in_ms, rr_bandwidth * 4 / rr_time_in_ms,
+               time_in_ms / rr_time_in_ms);
+    };
+
+    run_bench(64, 384, 32, 32, 3, 3, 1, 1, 1000);
+    run_bench(64, 384, 32, 32, 5, 5, 1, 1, 1000);
+    run_bench(64, 384, 32, 32, 7, 7, 1, 1, 1000);
+    run_bench(64, 384, 32, 32, 9, 9, 1, 1, 1000);
+    run_bench(64, 384, 32, 32, 11, 11, 1, 1, 1000);
+    run_bench(64, 384, 32, 32, 13, 13, 1, 1, 1000);
+    run_bench(64, 384, 32, 32, 15, 15, 1, 1, 1000);
+    run_bench(64, 384, 32, 32, 17, 17, 1, 1, 1000);
+    run_bench(64, 384, 32, 32, 19, 19, 1, 1, 1000);
+    run_bench(64, 384, 32, 32, 21, 21, 1, 1, 1000);
+    run_bench(64, 384, 32, 32, 23, 23, 1, 1, 1000);
+    run_bench(64, 384, 32, 32, 25, 25, 1, 1, 1000);
+    run_bench(64, 384, 32, 32, 27, 27, 1, 1, 1000);
+    run_bench(64, 384, 32, 32, 29, 29, 1, 1, 1000);
+    run_bench(64, 384, 32, 32, 31, 31, 1, 1, 1000);
+}
#endif

TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_DATA_FP32) {
@@ -585,6 +785,125 @@ TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_DATA_FP32_RIN_EQ_ROUT) {
    }
}
+TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_FILTER_FP32) {
+    Checker<RegionRestrictedConvolutionBackwardFilter> checker(handle_cuda());
+    for (auto dt : std::vector<DType>{dtype::Int32(), dtype::Uint8()}) {
+        auto run = [&checker, &dt](
+                           size_t n, size_t g, size_t ih, size_t fh, size_t padding,
+                           size_t stride) {
+            RegionRestrictedConvolutionBackwardFilter::Param cur_param;
+            cur_param.mode = RegionRestrictedConvolutionBackwardFilter::Param::Mode::
+                    CROSS_CORRELATION;
+            cur_param.compute_mode = RegionRestrictedConvolutionBackwardFilter::Param::
+                    ComputeMode::DEFAULT;
+            cur_param.sparse =
+                    RegionRestrictedConvolutionBackwardFilter::Param::Sparse::GROUP;
+            checker.set_dtype(0, dtype::Float32())
+                    .set_dtype(1, dtype::Float32())
+                    .set_dtype(2, dt)
+                    .set_dtype(3, dt);
+
+            float scale = 64.f / sqrt(fh * fh);
+            UniformFloatRNG rng(scale, 2 * scale);
+            UniformIntRNG r_rng{1, 2};
+            checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng(
+                    3, &r_rng);
+            cur_param.pad_h = cur_param.pad_w = padding;
+            cur_param.stride_h = cur_param.stride_w = stride;
+
+            size_t oh = (ih + 2 * padding - fh + 1) / stride;
+            checker.set_param(cur_param).execs({
+                    {n, g * 1, ih, ih},  // src
+                    {n, g * 1, oh, oh},  // diff
+                    {n, ih, ih},         // rin
+                    {n, oh, oh},         // rout
+                    {g, 1, 1, fh, fh}    // grad
+            });
+        };
+        if (dt == dtype::Int32()) {
+            run(4, 8, 32, 5, 5 / 2, 1);
+            run(1, 2, 2, 2, 0, 1);
+            run(1, 2, 3, 3, 0, 1);
+            run(1, 2, 4, 4, 0, 1);
+            run(1, 2, 5, 5, 0, 1);
+            run(1, 2, 6, 6, 0, 1);
+            run(1, 2, 7, 7, 0, 1);
+        }
+        run(4, 8, 32, 7, 7 / 2, 1);
+        run(4, 8, 32, 9, 9 / 2, 1);
+        run(4, 8, 32, 11, 11 / 2, 1);
+        run(4, 8, 32, 13, 13 / 2, 1);
+        run(4, 8, 32, 15, 15 / 2, 1);
+        run(4, 8, 32, 17, 17 / 2, 1);
+        run(4, 8, 32, 19, 19 / 2, 1);
+        run(4, 8, 32, 21, 21 / 2, 1);
+        run(4, 8, 32, 23, 23 / 2, 1);
+        run(4, 8, 32, 25, 25 / 2, 1);
+        run(4, 8, 32, 27, 27 / 2, 1);
+        run(4, 8, 32, 29, 29 / 2, 1);
+        run(4, 8, 32, 31, 31 / 2, 1);
+    }
+}
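
The checker sizes `diff` with the usual valid-convolution output formula, oh = (ih + 2*padding - fh + 1) / stride (integer division); a quick check against the shapes used above:

```python
def out_size(ih: int, fh: int, padding: int, stride: int) -> int:
    # oh = (ih + 2*padding - fh + 1) // stride, as in the tests above
    return (ih + 2 * padding - fh + 1) // stride

assert out_size(32, 5, 5 // 2, 1) == 32  # "same"-style padding keeps 32x32
assert out_size(2, 2, 0, 1) == 1         # valid conv shrinks the feature map
```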
+TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_FILTER_FP32_RIN_EQ_ROUT) {
+    Checker<RegionRestrictedConvolutionBackwardFilter> checker(handle_cuda());
+    for (auto dt : std::vector<DType>{dtype::Int32(), dtype::Uint8()}) {
+        auto run = [&checker, &dt](
+                           size_t n, size_t g, size_t ih, size_t fh, size_t padding,
+                           size_t stride) {
+            RegionRestrictedConvolutionBackwardFilter::Param cur_param;
+            cur_param.mode = RegionRestrictedConvolutionBackwardFilter::Param::Mode::
+                    CROSS_CORRELATION;
+            cur_param.compute_mode = RegionRestrictedConvolutionBackwardFilter::Param::
+                    ComputeMode::DEFAULT;
+            cur_param.sparse =
+                    RegionRestrictedConvolutionBackwardFilter::Param::Sparse::GROUP;
+            checker.set_dtype(0, dtype::Float32())
+                    .set_dtype(1, dtype::Float32())
+                    .set_dtype(2, dt)
+                    .set_dtype(3, dt);
+
+            float scale = 64.f / sqrt(fh * fh);
+            UniformFloatRNG rng(scale, 2 * scale);
+            UniformIntRNG r_rng{1, 1};
+            checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng(
+                    3, &r_rng);
+            cur_param.pad_h = cur_param.pad_w = padding;
+            cur_param.stride_h = cur_param.stride_w = stride;
+
+            size_t oh = (ih + 2 * padding - fh + 1) / stride;
+            checker.set_param(cur_param).execs({
+                    {n, g * 1, ih, ih},  // src
+                    {n, g * 1, oh, oh},  // diff
+                    {n, ih, ih},         // rin
+                    {n, oh, oh},         // rout
+                    {g, 1, 1, fh, fh}    // grad
+            });
+        };
+        if (dt == dtype::Int32()) {
+            run(4, 8, 32, 5, 5 / 2, 1);
+            run(1, 2, 2, 2, 0, 1);
+            run(1, 2, 3, 3, 0, 1);
+            run(1, 2, 4, 4, 0, 1);
+            run(1, 2, 5, 5, 0, 1);
+            run(1, 2, 6, 6, 0, 1);
+            run(1, 2, 7, 7, 0, 1);
+        }
+        run(4, 8, 32, 7, 7 / 2, 1);
+        run(4, 8, 32, 9, 9 / 2, 1);
+        run(4, 8, 32, 11, 11 / 2, 1);
+        run(4, 8, 32, 13, 13 / 2, 1);
+        run(4, 8, 32, 15, 15 / 2, 1);
+        run(4, 8, 32, 17, 17 / 2, 1);
+        run(4, 8, 32, 19, 19 / 2, 1);
+        run(4, 8, 32, 21, 21 / 2, 1);
+        run(4, 8, 32, 23, 23 / 2, 1);
+        run(4, 8, 32, 25, 25 / 2, 1);
+        run(4, 8, 32, 27, 27 / 2, 1);
+        run(4, 8, 32, 29, 29 / 2, 1);
+        run(4, 8, 32, 31, 31 / 2, 1);
+    }
+}
}  // namespace test
}  // namespace megdnn