From 9e020d23b40a21793ad0c760be4bca1699a7fa17 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Mon, 25 Jul 2022 13:58:39 +0800
Subject: [PATCH] feat(imperative,mgb): add RegionRestrictedConvolution
 megbrain and imperative opr

GitOrigin-RevId: c0106ade083560af4db1a6ef27a26e23daab7f68
---
 imperative/python/megengine/functional/nn.py        |  60 ++-
 imperative/python/megengine/module/__init__.py      |   1 +
 imperative/python/megengine/module/conv.py          | 172 ++++
 .../python/test/unit/functional/test_functional.py  | 173 +++++++
 imperative/src/impl/ops/convolution.cpp             | 171 ++++
 imperative/tablegen/generated/hash.txt              |  10 +-
 imperative/tablegen/generated/opdef.cpp.inl         | 304 +++++++++++
 imperative/tablegen/generated/opdef.cpy.inl         | 576 +++++++++++++++++++++
 imperative/tablegen/generated/opdef.h.inl           |  52 ++
 imperative/tablegen/generated/opdef.py.inl          |  46 ++
 src/core/include/megbrain/ir/ops.td                 |   5 +
 src/opr/impl/dnn/convolution.cpp                    | 272 ++++
 src/opr/impl/dnn/dnn.sereg.h                        |   1 +
 src/opr/impl/dnn/dnn.sereg.v2.h                     |  28 +
 src/opr/include/megbrain/opr/dnn/convolution.h      |  80 +++
 src/opr/test/dnn/region_restricted_convolution.cpp  | 196 +++++++
 16 files changed, 2140 insertions(+), 7 deletions(-)
 create mode 100644 src/opr/test/dnn/region_restricted_convolution.cpp

diff --git a/imperative/python/megengine/functional/nn.py b/imperative/python/megengine/functional/nn.py
index 61e5ba2d..97292fb7 100644
--- a/imperative/python/megengine/functional/nn.py
+++ b/imperative/python/megengine/functional/nn.py
@@ -91,6 +91,7 @@ __all__ = [
     "warp_affine",
     "warp_perspective",
     "pixel_shuffle",
+    "region_restricted_conv",
 ]
 
 
@@ -1213,10 +1214,10 @@ def layer_norm(
 ):
     r"""Applies layer normalization to the input. Support tensor of any shape as input.
     Reference: https://arxiv.org/pdf/1803.08494.pdf.
-
+
     Args:
         inp: input tensor.
-        normalized_shape: the shape that you want to be normalizated
+        normalized_shape: the shape that you want to be normalized
         affine: whether to use weight and bias
         weight: must not be None when the affine is true
         bias: must not be None when the affine is true
@@ -1974,6 +1975,61 @@ def pixel_shuffle(inp: Tensor, upscale_factor: int) -> Tensor:
     return pixel_shuffle_cpp(inp, upscale_factor, _layerPixelShuffle_traceable)
 
 
+def region_restricted_conv(
+    inp: Tensor,
+    weight: Tensor,
+    rin: Tensor,
+    rout: Tensor,
+    stride: Union[int, Tuple[int, int]] = 1,
+    padding: Union[int, Tuple[int, int]] = 0,
+    dilation: Union[int, Tuple[int, int]] = 1,
+    groups: int = 1,
+    conv_mode: str = "cross_correlation",
+    compute_mode: str = "default",
+) -> Tensor:
+    r"""Region restricted convolution operation.
+
+    Refer to :class:`~.RegionRestrictedConv` for more information.
+
+    Args:
+        inp: feature map of the convolution operation.
+        weight: convolution kernel.
+        rin: input region map with shape ``(N, IH, IW)``, int32 or uint8.
+        rout: output region map with shape ``(N, OH, OW)``, int32 or uint8.
+        stride: stride of the 2D region restricted convolution operation. Default: 1
+        padding: size of the paddings added to the input on both sides of its
+            spatial dimensions. Only zero-padding is supported. Default: 0
+        dilation: dilation of the 2D convolution operation. Default: 1
+        groups: number of groups into which the input and output channels are divided,
+            so as to perform a ``grouped convolution``. When ``groups`` is not 1,
+            ``in_channels`` and ``out_channels`` must be divisible by ``groups``,
+            and the shape of weight should be ``(groups, out_channels // groups,
+            in_channels // groups, height, width)``. Default: 1
+        conv_mode: supports "cross_correlation". Default: "cross_correlation"
+        compute_mode: when set to "default", no special requirements will be
+            placed on the precision of intermediate results. When set to "float32",
+            "float32" would be used for accumulator and intermediate result, but
+            only effective when input and output are of float16 dtype.
+
+    Returns:
+        output tensor.
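+
+    Examples:
+        A minimal grouped sketch (shapes mirror the module example in
+        :class:`~.RegionRestrictedConv`; the all-one region maps mark every
+        input and output position as active):
+
+        >>> import numpy as np
+        >>> import megengine as mge
+        >>> import megengine.functional as F
+        >>> inp = mge.tensor(np.random.randn(1, 2, 2, 2).astype(np.float32))
+        >>> weight = mge.tensor(np.random.randn(2, 1, 1, 2, 2).astype(np.float32))
+        >>> rin = mge.tensor(np.ones((1, 2, 2)).astype(np.int32))
+        >>> rout = mge.tensor(np.ones((1, 1, 1)).astype(np.int32))
+        >>> out = F.region_restricted_conv(inp, weight, rin, rout, groups=2)
+        >>> out.numpy().shape
+        (1, 2, 1, 1)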
+ """ + assert conv_mode.lower() == "cross_correlation" + + pad_h, pad_w = _expand_hw(padding) + stride_h, stride_w = _expand_hw(stride) + dilate_h, dilate_w = _expand_hw(dilation) + + sparse_type = "dense" if groups == 1 else "group" + op = builtin.RegionRestrictedConvolution( + stride_h=stride_h, + stride_w=stride_w, + pad_h=pad_h, + pad_w=pad_w, + dilate_h=dilate_h, + dilate_w=dilate_w, + mode=conv_mode, + compute_mode=compute_mode, + sparse=sparse_type, + ) + (output,) = apply(op, inp, weight, rin, rout) + return output + + from .quantized import conv_bias_activation # isort:skip from .loss import * # isort:skip from .vision import * # isort:skip diff --git a/imperative/python/megengine/module/__init__.py b/imperative/python/megengine/module/__init__.py index 2ffc2cc6..73de9058 100644 --- a/imperative/python/megengine/module/__init__.py +++ b/imperative/python/megengine/module/__init__.py @@ -14,6 +14,7 @@ from .conv import ( ConvTranspose3d, DeformableConv2d, LocalConv2d, + RegionRestrictedConv, ) from .conv_bn import ConvBn2d, ConvBnRelu2d from .deformable_psroi_pooling import DeformablePSROIPooling diff --git a/imperative/python/megengine/module/conv.py b/imperative/python/megengine/module/conv.py index 8188b652..26d47ec4 100644 --- a/imperative/python/megengine/module/conv.py +++ b/imperative/python/megengine/module/conv.py @@ -12,6 +12,7 @@ from ..functional import ( deformable_conv2d, local_conv2d, pad, + region_restricted_conv, relu, ) from ..tensor import Parameter @@ -982,3 +983,174 @@ class ConvTranspose3d(_ConvNd): self.output_padding, self.dilation, ) + + +class RegionRestrictedConv(_ConvNd): + + r"""Applies a 2D RegionRestricted Convolution over an input tensor. + + For instance, given an input of the size :math:`(N, C_{\text{in}}, H, W)`, + this layer generates an output of the size + :math:`(N, C_{\text{out}}, H_{\text{out}}, W_{\text{out}})` through the + process described as below: + + .. math:: + \text{out}(N_i, C_{\text{out}_j}) = + \sum_{k = 0}^{C_{\text{in}} - 1} \text{weight}(C_{\text{out}_j}, k) \star \text{input}(N_i, k) + + where :math:`\star` is the valid 2D cross-correlation operator, + :math:`N` is batch size, :math:`C` denotes number of channels, + :math:`H` is height of input planes in pixels, and :math:`W` is + width in pixels. + + In general, output feature maps' shapes can be inferred as follows: + + input: :math:`(N, C_{\text{in}}, H_{\text{in}}, W_{\text{in}})` + + output: :math:`(N, C_{\text{out}}, H_{\text{out}}, W_{\text{out}})` where + + .. math:: + \text{H}_{out} = \lfloor \frac{\text{H}_{in} + 2 * \text{padding[0]} - + \text{dilation[0]} * (\text{kernel_size[0]} - 1) - 1}{\text{stride[0]}} + 1 \rfloor + + .. math:: + \text{W}_{out} = \lfloor \frac{\text{W}_{in} + 2 * \text{padding[1]} - + \text{dilation[1]} * (\text{kernel_size[1]} - 1) - 1}{\text{stride[1]}} + 1 \rfloor + + When `groups == in_channels` and `out_channels == K * in_channels`, + where K is a positive integer, this operation is also known as depthwise + convolution. + + In other words, for an input of size :math:`(N, C_{in}, H_{in}, W_{in})`, + a depthwise convolution with a depthwise multiplier `K`, can be constructed + by arguments :math:`(in\_channels=C_{in}, out\_channels=C_{in} \times K, ..., groups=C_{in})`. + + Args: + in_channels: number of input channels. + out_channels: number of output channels. + kernel_size: size of weight on spatial dimensions. If kernel_size is + an :class:`int`, the actual kernel size would be + ``(kernel_size, kernel_size)``. 
+ stride: stride of the 2D convolution operation. Default: 1 + padding: size of the paddings added to the input on both sides of its + spatial dimensions. Default: 0 + dilation: dilation of the 2D convolution operation. Default: 1 + groups: number of groups into which the input and output channels are divided, + so as to perform a ``grouped convolution``. When ``groups`` is not 1, + ``in_channels`` and ``out_channels`` must be divisible by ``groups``, + and the shape of weight should be ``(groups, out_channel // groups, + in_channels // groups, height, width)``. Default: 1 + conv_mode: Supports `cross_correlation`. Default: `cross_correlation` + compute_mode: When set to "default", no special requirements will be + placed on the precision of intermediate results. When set to "float32", + "float32" would be used for accumulator and intermediate result, but only + effective when input and output are of float16 dtype. + padding_mode: "zeros", "reflect" or "replicate". Default: "zeros". + Refer to :class:`~.module.padding.Pad` for more information. + + Note: + * ``weight`` usually has shape ``(out_channels, in_channels, height, width)`` , + if groups is not 1, shape will be ``(groups, out_channels // groups, in_channels // groups, height, width)`` + + Examples: + >>> import numpy as np + >>> import megengine as mge + >>> import megengine.module as M + >>> rrconv = M.RegionRestrictedConv(in_channels=2, out_channels=2, kernel_size=2, groups=2) + >>> inp = mge.tensor(np.random.randn(1, 2, 2, 2).astype(np.float32)) + >>> rin = mge.tensor(np.random.randn(1, 2, 2).astype(np.int32)) + >>> rout = mge.tensor(np.random.randn(1, 1, 1).astype(np.int32)) + >>> oup = rrconv(inp, rin, rout) + >>> oup.numpy().shape + (1, 2, 1, 1) + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int]], + groups: int, + stride: Union[int, Tuple[int, int]] = 1, + padding: Union[int, Tuple[int, int]] = 0, + dilation: Union[int, Tuple[int, int]] = 1, + conv_mode: str = "cross_correlation", + compute_mode: str = "default", + padding_mode: str = "zeros", + **kwargs + ): + kernel_size = _pair_nonzero(kernel_size) + stride = _pair_nonzero(stride) + padding = _pair(padding) + dilation = _pair_nonzero(dilation) + self.conv_mode = conv_mode + self.compute_mode = compute_mode + self.padding_mode = padding_mode + super().__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + 0, + dilation, + groups, + False, + **kwargs, + ) + + def _get_fanin(self): + kh, kw = self.kernel_size + ic = self.in_channels + return kh * kw * ic + + def _infer_weight_shape(self): + group = self.groups + ichl = self.in_channels + ochl = self.out_channels + kh, kw = self.kernel_size + if group == 1: + # Assume format is NCHW + return (ochl, ichl, kh, kw) + + assert ( + ichl % group == 0 and ochl % group == 0 + ), "invalid config: in_channels={} out_channels={} group={}".format( + ichl, ochl, group + ) + # Assume format is NCHW + return (group, ochl // group, ichl // group, kh, kw) + + def _infer_bias_shape(self): + # Assume format is NCHW + return (1, self.out_channels, 1, 1) + + def get_pad_width(self): + return ( + (0, 0), + (0, 0), + (self.padding[0], self.padding[0]), + (self.padding[1], self.padding[1]), + ) + + def calc_conv(self, inp, weight, rin, rout): + assert self.padding_mode in [ + "zeros", + "reflect", + "replicate", + ] + return region_restricted_conv( + inp, + weight, + rin, + rout, + self.stride, + self.padding, + self.dilation, + self.groups, + self.conv_mode, + 
self.compute_mode, + ) + + def forward(self, inp, rin, rout): + return self.calc_conv(inp, self.weight, rin, rout) diff --git a/imperative/python/test/unit/functional/test_functional.py b/imperative/python/test/unit/functional/test_functional.py index 03428aaf..f9a02ee0 100644 --- a/imperative/python/test/unit/functional/test_functional.py +++ b/imperative/python/test/unit/functional/test_functional.py @@ -930,6 +930,179 @@ def test_batch_conv_bias(): run(1, 4, 4, 5, 5, 3, 3, 0, 0, 1, 1, True) +def test_region_restricted_conv_forward_backward_naive(): + import megengine as mge + import megengine.module as M + from megengine.autodiff import GradManager + + handle = "cpu0" + src_1 = np.arange(8).reshape(1, 2, 2, 2).astype(np.float32) + filter_1 = np.arange(8).reshape(2, 1, 1, 2, 2).astype(np.float32) + rin_1 = np.array([1, 1, 1, 1]).reshape(1, 2, 2).astype(np.int32) + rout_1 = np.array([1]).reshape(1, 1, 1).astype(np.int32) + cpu_src = tensor(src_1, device=handle) + cpu_filter = tensor(filter_1, device=handle) + gm = GradManager().attach([cpu_src, cpu_filter]) + with gm: + cpu_out = F.region_restricted_conv( + cpu_src, + cpu_filter, + tensor(rin_1, device=handle), + tensor(rout_1, device=handle), + groups=2, + ) + gm.backward(cpu_out, tensor(np.ones((1, 2, 1, 1)), device=handle)) + np.testing.assert_allclose( + cpu_src.grad, np.array([0, 1, 2, 3, 4, 5, 6, 7]).reshape(1, 2, 2, 2) + ) + np.testing.assert_allclose( + cpu_filter.grad, np.array([0, 1, 2, 3, 4, 5, 6, 7]).reshape(2, 1, 1, 2, 2) + ) + + +@pytest.mark.skipif( + not is_cuda_available(), reason="rrconv cuda kernel requires cuda available" +) +def test_region_restricted_conv_forward_backward_cuda(): + import megengine as mge + import megengine.module as M + from megengine.autodiff import GradManager + import megengine.distributed as dist + + # params + handle = "gpu0" + N = 1 + GROUP = 3 + FH = FW = 2 + IH = IW = 2 + OH = OW = 1 + ICPG = OCPG = 1 + grad_shape = (N, GROUP * ICPG, IH, IW) + src_shape = grad_shape + filter_shape = (GROUP, OCPG, ICPG, FH, FW) + diff_shape = (N, GROUP * OCPG, OH, OW) + rin_shape = (N, IH, IW) + rout_shape = (N, OH, OW) + + def reduce(shape): + mul = 1 + for x in shape: + mul *= x + return mul + + def get_groundtruth(): + src = tensor( + np.arange(reduce(src_shape)).reshape(src_shape).astype(np.float32), + device="cpu0", + ) + filter = tensor(np.ones(filter_shape).astype(np.float32), device="cpu0") + rin = tensor(np.ones(rin_shape).astype(np.int32), device="cpu0") + rout = tensor(np.ones(rout_shape).astype(np.int32), device="cpu0") + gm = GradManager().attach([src, filter]) + with gm: + expected_out = F.region_restricted_conv( + src, filter, rin, rout, groups=GROUP + ) + gm.backward( + expected_out, + tensor(np.ones(diff_shape, dtype=np.float32), device="cpu0"), + ) + return src, filter + + expected_src, expected_filter = get_groundtruth() + + src = tensor( + np.arange(reduce(src_shape)).reshape(src_shape).astype(np.float32), + device=handle, + ) + filter = tensor(np.ones(filter_shape).astype(np.float32), device=handle) + rin = tensor(np.ones(rin_shape).astype(np.int32), device=handle) + rout = tensor(np.ones(rout_shape).astype(np.int32), device=handle) + gm = GradManager().attach([src, filter]) + with gm: + gpu_out = F.region_restricted_conv(src, filter, rin, rout, groups=GROUP) + gm.backward(gpu_out, tensor(np.ones(diff_shape), device=handle)) + np.testing.assert_allclose(src.grad, expected_src.grad) + np.testing.assert_allclose(filter.grad, expected_filter.grad) + + +@pytest.mark.skipif( + not 
is_cuda_available(), reason="rrconv cuda kernel requires cuda available"
+)
+def test_region_restricted_conv_forward_backward_uint8():
+    import megengine as mge
+    import megengine.module as M
+    from megengine.autodiff import GradManager
+
+    # params
+    handle = "gpu0"
+    N = 1
+    GROUP = 2
+    FH = FW = 1
+    IH = IW = 4
+    OH = OW = 4
+    ICPG = OCPG = 1
+    grad_shape = (N, GROUP * ICPG, IH, IW)
+    src_shape = grad_shape
+    filter_shape = (GROUP, OCPG, ICPG, FH, FW)
+    diff_shape = (N, GROUP * OCPG, OH, OW)
+    rin_shape = (N, IH, IW)
+    rout_shape = (N, OH, OW)
+
+    def reduce(shape):
+        mul = 1
+        for x in shape:
+            mul *= x
+        return mul
+
+    def get_groundtruth():
+        src = tensor(
+            np.arange(reduce(src_shape)).reshape(src_shape).astype(np.float32),
+            device="cpu0",
+        )
+        filter = tensor(np.ones(filter_shape).astype(np.float32), device="cpu0")
+        rin = tensor(np.ones(rin_shape).astype(np.int32), device="cpu0")
+        rout = tensor(np.ones(rout_shape).astype(np.int32), device="cpu0")
+        gm = GradManager().attach([src, filter])
+        with gm:
+            expected_out = F.region_restricted_conv(
+                src, filter, rin, rout, groups=GROUP
+            )
+            gm.backward(
+                expected_out,
+                tensor(np.ones(diff_shape, dtype=np.float32), device="cpu0"),
+            )
+        return src, filter
+
+    expected_src, expected_filter = get_groundtruth()
+
+    # forward and dgrad/wgrad
+    src = tensor(
+        np.arange(reduce(src_shape)).reshape(src_shape).astype(np.float32),
+        device=handle,
+    )
+    filter = tensor(np.ones(filter_shape).astype(np.float32), device=handle)
+    rin = tensor(np.ones(rin_shape).astype(np.uint8), device=handle)
+    rout = tensor(np.ones(rout_shape).astype(np.uint8), device=handle)
+
+    gm = GradManager().attach([src, filter])
+    with gm:
+        gpu_out = F.region_restricted_conv(src, filter, rin, rout, groups=GROUP)
+        gm.backward(
+            gpu_out, tensor(np.ones(diff_shape, dtype=np.float32), device=handle)
+        )
+    # assert uint8 gpu result close to cpu result
+    np.testing.assert_allclose(src.grad, expected_src.grad)
+    np.testing.assert_allclose(filter.grad, expected_filter.grad)
+
+
+def test_region_restricted_conv():
+    test_region_restricted_conv_forward_backward_naive()
+    if is_cuda_available():
+        test_region_restricted_conv_forward_backward_cuda()
+        test_region_restricted_conv_forward_backward_uint8()
+
+
 def test_conv2d_autocast():
     """check amp's result is equal to manually converted result"""
     amp.enabled = True
diff --git a/imperative/src/impl/ops/convolution.cpp b/imperative/src/impl/ops/convolution.cpp
index dc622509..03feae08 100644
--- a/imperative/src/impl/ops/convolution.cpp
+++ b/imperative/src/impl/ops/convolution.cpp
@@ -3,9 +3,11 @@
 #include "../blob_manager_impl.h"
 #include "../dnn_op_helper.h"
 #include "../op_trait.h"
+#include "megbrain/common.h"
 #include "megbrain/imperative/ops/autogen.h"
 #include "megbrain/opr/internal/megdnn_opr_wrapper.h"
 #include "megbrain/opr/tensor_gen.h"
+#include "megdnn/oprs/nn.h"
 
 namespace mgb {
 namespace imperative {
@@ -356,5 +358,174 @@ OP_TRAIT_REG(Convolution3DBackwardData, Convolution3DBackwardData)
 }  // namespace convolution3d_backward_data
 }  // namespace
 
+namespace {
+namespace region_restricted_conv {
+std::shared_ptr<OpDef> make_from_op_node(cg::OperatorNodeBase* node_) {
+    auto* node = &node_->cast_final_safe<opr::RegionRestrictedConvolution>();
+    return RegionRestrictedConvolution::make(node->param());
+}
+
+auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
+    auto&& conv = static_cast<const RegionRestrictedConvolution&>(def);
+    OperatorNodeConfig config{conv.make_name()};
+    return opr::RegionRestrictedConvolution::make(
+            inputs[0], inputs[1], inputs[2], inputs[3], conv.param(), config);
+}
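+
+// Note: these trait hooks follow the same imperative op-trait contract as the
+// Convolution ops earlier in this file: make_from_op_node recovers an OpDef
+// from a graph node, apply_on_var_node builds the graph operator,
+// infer_output_attrs_fallible deduces the output layout without executing,
+// and apply_on_physical_tensor runs the megdnn kernel on concrete tensors.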
+
+std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
+        const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) {
+    auto&& region_restricted_conv =
+            def.cast_final_safe<RegionRestrictedConvolution>();
+    DnnOprHelper<megdnn::RegionRestrictedConvolutionForward> dnn_opr(
+            region_restricted_conv.param());
+
+    auto&& src = inputs[0].layout;
+    auto&& filter = inputs[1].layout;
+    auto&& rin = inputs[2].layout;
+    auto&& rout = inputs[3].layout;
+    TensorLayout output_layout{src.dtype};
+    if (src.ndim && filter.ndim) {
+        dnn_opr.opr().deduce_layout(src, filter, rin, rout, output_layout);
+    }
+
+    return {{{output_layout, inputs[0].comp_node}}, output_layout.ndim != 0};
+}
+
+SmallVector<TensorPtr> apply_on_physical_tensor(
+        const OpDef& def, const SmallVector<TensorPtr>& inputs,
+        SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
+    // create megdnn opr
+    auto&& region_restricted_conv = def.cast_final_safe<RegionRestrictedConvolution>();
+    CompNode cn = inputs[0]->comp_node();
+
+    auto&& param = region_restricted_conv.param();
+    DnnOprCaller<megdnn::RegionRestrictedConvolutionForward> dnn_opr(cn, param);
+
+    auto srclo = inputs[0]->layout();
+    auto filterlo = inputs[1]->layout();
+    auto rinlo = inputs[2]->layout();
+    auto routlo = inputs[3]->layout();
+
+    auto out_layout = [&] {
+        if (validated) {
+            return output_descs[0].layout;
+        } else {
+            TensorLayout out_layout{inputs[0]->dtype()};
+            dnn_opr.op()->deduce_layout(srclo, filterlo, rinlo, routlo, out_layout);
+            return out_layout;
+        }
+    }();
+
+    auto out = Tensor::make(out_layout, cn);
+    dnn_opr.exec_with_ws(inputs[0], inputs[1], inputs[2], inputs[3], out);
+    return {out};
+}
+
+OP_TRAIT_REG(
+        RegionRestrictedConvolution, RegionRestrictedConvolution,
+        opr::RegionRestrictedConvolution)
+        .make_from_op_node(make_from_op_node)
+        .apply_on_var_node(apply_on_var_node)
+        .infer_output_attrs_fallible(infer_output_attrs_fallible)
+        .apply_on_physical_tensor(apply_on_physical_tensor)
+        .fallback();
+}  // namespace region_restricted_conv
+}  // namespace
+
+namespace {
+namespace region_restricted_conv_backward_data {
+
+std::shared_ptr<OpDef> make_from_op_node(cg::OperatorNodeBase* node_) {
+    auto* node =
+            &node_->cast_final_safe<opr::RegionRestrictedConvolutionBackwardData>();
+    return RegionRestrictedConvolutionBackwardData::make(node->param());
+}
+
+auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
+    auto&& conv = static_cast<const RegionRestrictedConvolutionBackwardData&>(def);
+    OperatorNodeConfig config{conv.make_name()};
+    // output_dtype may be inferred from the inputs within rrconv bwd data
+    // (deduce_dtype api)
+    CompNode cn = inputs[0]->comp_node();
+    DType output_dtype;
+    DnnOprCaller<megdnn::RegionRestrictedConvolutionBackwardData> dnn_opr(cn);
+    dnn_opr.op()->deduce_dtype(
+            inputs[0]->dtype(), inputs[1]->dtype(), inputs[2]->dtype(),
+            inputs[3]->dtype(), output_dtype);
+    if (output_dtype.valid())
+        config.output_dtype(output_dtype);
+    if (inputs.size() == 4) {
+        return opr::RegionRestrictedConvolutionBackwardData::make(
+                inputs[0], inputs[1], inputs[2], inputs[3], conv.param(), config);
+    } else if (inputs.size() == 5) {
+        return opr::RegionRestrictedConvolutionBackwardData::make(
+                inputs[0], inputs[1], inputs[2], inputs[3], inputs[4], conv.param(),
+                config);
+    }
+    mgb_assert(0);
+}
+
+std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
+        const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) {
+    auto&& convbwd = def.cast_final_safe<
+            mgb::imperative::RegionRestrictedConvolutionBackwardData>();
+    DnnOprHelper<megdnn::RegionRestrictedConvolutionBackwardData> dnn_opr(
+            convbwd.param());
+
+    TensorLayout filter = inputs[0].layout;
+    TensorLayout diff = inputs[1].layout;
+    TensorLayout rin = inputs[2].layout;
+    TensorLayout rout = inputs[3].layout;
+
+    DType output_dtype;
+    dnn_opr.opr().deduce_dtype(
+            inputs[0].layout.dtype, inputs[1].layout.dtype, inputs[2].layout.dtype,
+            inputs[3].layout.dtype, output_dtype);
+    TensorLayout output_layout{output_dtype};
+    if (diff.ndim && filter.ndim) {
+        dnn_opr.opr().deduce_layout(filter, diff, rin, rout, output_layout);
+    }
+    return {{{output_layout, inputs[0].comp_node}}, output_layout.ndim != 0};
+}
+
+SmallVector<TensorPtr> apply_on_physical_tensor(
+        const OpDef& def, const SmallVector<TensorPtr>& inputs,
+        SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
+    auto&& convbwd = def.cast_final_safe<RegionRestrictedConvolutionBackwardData>();
+    CompNode cn = inputs[0]->comp_node();
+    DnnOprCaller<megdnn::RegionRestrictedConvolutionBackwardData> dnn_opr(
+            cn, convbwd.param());
+
+    auto filterlo = inputs[0]->layout();
+    auto difflo = inputs[1]->layout();
+    auto rinlo = inputs[2]->layout();
+    auto routlo = inputs[3]->layout();
+
+    auto out_layout = [&] {
+        if (validated) {
+            return output_descs[0].layout;
+        } else {
+            TensorLayout out_layout{inputs[0]->dtype()};
+            dnn_opr.op()->deduce_layout(filterlo, difflo, rinlo, routlo, out_layout);
+            return out_layout;
+        }
+    }();
+
+    auto out = Tensor::make(out_layout, cn);
+    dnn_opr.exec_with_ws(inputs[0], inputs[1], inputs[2], inputs[3], out);
+    return {out};
+}
+
+OP_TRAIT_REG(
+        RegionRestrictedConvolutionBackwardData,
+        RegionRestrictedConvolutionBackwardData,
+        opr::RegionRestrictedConvolutionBackwardData)
+        .make_from_op_node(make_from_op_node)
+        .apply_on_var_node(apply_on_var_node)
+        .infer_output_attrs_fallible(infer_output_attrs_fallible)
+        .apply_on_physical_tensor(apply_on_physical_tensor)
+        .fallback();
+}  // namespace region_restricted_conv_backward_data
+}  // namespace
+
 }  // namespace imperative
 }  // namespace mgb
diff --git a/imperative/tablegen/generated/hash.txt b/imperative/tablegen/generated/hash.txt
index 35080e89..a5c170f9 100644
--- a/imperative/tablegen/generated/hash.txt
+++ b/imperative/tablegen/generated/hash.txt
@@ -1,7 +1,7 @@
 905bdf78e5413b06873be64b4ba55db9 ../../dnn/scripts/opr_param_defs.py
-40708c56b1f05fdb7d06cc097a300330 ../../src/core/include/megbrain/ir/ops.td
-9f3af118c7fe8d0c9db433825d5ad77b generated/opdef.h.inl
-4041e44a8ba3cca3b3affa1ed9ed44a2 generated/opdef.cpp.inl
-319e1d170c989fe793a4e9c45decefc4 generated/opdef.py.inl
-26a18a7593566128ecce76e8f74dcc5d generated/opdef.cpy.inl
+da03ffe2a15411f902cd88920d3d47ec ../../src/core/include/megbrain/ir/ops.td
+5756619f37e4dc130e1b049d7706d4eb generated/opdef.h.inl
+98d1291eed73970ee087f898b6241358 generated/opdef.cpp.inl
+b1a9c7569392942294c2168d40939eb5 generated/opdef.py.inl
+3d88d5358d15a39219957f5257e32f5b generated/opdef.cpy.inl
 71e1462bf4d882e2615c3c632cb671cc generated/enum_macro.h
diff --git a/imperative/tablegen/generated/opdef.cpp.inl b/imperative/tablegen/generated/opdef.cpp.inl
index 08449b9d..5fdd308a 100644
--- a/imperative/tablegen/generated/opdef.cpp.inl
+++ b/imperative/tablegen/generated/opdef.cpp.inl
@@ -5694,6 +5694,310 @@ OP_TRAIT_REG(Reduce, Reduce)
     .props(Reduce_props_impl)
     .make_name(Reduce_make_name_impl);
 
+MGB_DYN_TYPE_OBJ_FINAL_IMPL(RegionRestrictedConvolution);
+
+namespace {
+size_t RegionRestrictedConvolution_hash_impl(const OpDef& def_) {
+    auto&& op_ = def_.cast_final_safe<RegionRestrictedConvolution>();
+    static_cast<void>(op_);
+    size_t val = mgb::hash(op_.dyn_typeinfo());
+    val = mgb::hash_pair_combine(val, mgb::enumhash()(op_.mode));
+    val = mgb::hash_pair_combine(val, mgb::hash(op_.pad_h));
+    val = mgb::hash_pair_combine(val, mgb::hash(op_.pad_w));
+    val = mgb::hash_pair_combine(val, mgb::hash(op_.stride_h));
+    val = mgb::hash_pair_combine(val, mgb::hash(op_.stride_w));
+    val = mgb::hash_pair_combine(val, mgb::hash(op_.dilate_h));
+    val = mgb::hash_pair_combine(val, mgb::hash(op_.dilate_w));
+    val =
mgb::hash_pair_combine(val, mgb::enumhash()(op_.sparse)); + val = mgb::hash_pair_combine(val, mgb::enumhash()(op_.format)); + val = mgb::hash_pair_combine(val, mgb::enumhash()(op_.compute_mode)); + return val; +} +bool RegionRestrictedConvolution_is_same_st_impl(const OpDef& lhs_, const OpDef& rhs_) { + auto &&a_ = lhs_.cast_final_safe(), + &&b_ = rhs_.cast_final_safe(); + static_cast(a_); + static_cast(b_); + if (a_.mode != b_.mode) return false; + if (a_.pad_h != b_.pad_h) return false; + if (a_.pad_w != b_.pad_w) return false; + if (a_.stride_h != b_.stride_h) return false; + if (a_.stride_w != b_.stride_w) return false; + if (a_.dilate_h != b_.dilate_h) return false; + if (a_.dilate_w != b_.dilate_w) return false; + if (a_.sparse != b_.sparse) return false; + if (a_.format != b_.format) return false; + if (a_.compute_mode != b_.compute_mode) return false; + return true; +} +std::vector> RegionRestrictedConvolution_props_impl(const OpDef& def_) { + auto&& op_ = def_.cast_final_safe(); + static_cast(op_); + std::vector> props_; + switch (op_.mode){ + case RegionRestrictedConvolution::Mode::CROSS_CORRELATION: + props_.emplace_back("mode", "CROSS_CORRELATION"); + break; + case RegionRestrictedConvolution::Mode::CONVOLUTION: + props_.emplace_back("mode", "CONVOLUTION"); + break; + default: + props_.emplace_back("mode", "INVALID"); + break; + } + props_.emplace_back("pad_h", std::to_string(op_.pad_h)); + props_.emplace_back("pad_w", std::to_string(op_.pad_w)); + props_.emplace_back("stride_h", std::to_string(op_.stride_h)); + props_.emplace_back("stride_w", std::to_string(op_.stride_w)); + props_.emplace_back("dilate_h", std::to_string(op_.dilate_h)); + props_.emplace_back("dilate_w", std::to_string(op_.dilate_w)); + switch (op_.sparse){ + case RegionRestrictedConvolution::Sparse::DENSE: + props_.emplace_back("sparse", "DENSE"); + break; + case RegionRestrictedConvolution::Sparse::GROUP: + props_.emplace_back("sparse", "GROUP"); + break; + default: + props_.emplace_back("sparse", "INVALID"); + break; + } + switch (op_.format){ + case RegionRestrictedConvolution::Format::NCHW: + props_.emplace_back("format", "NCHW"); + break; + case RegionRestrictedConvolution::Format::NHWC: + props_.emplace_back("format", "NHWC"); + break; + case RegionRestrictedConvolution::Format::NHWCD4: + props_.emplace_back("format", "NHWCD4"); + break; + case RegionRestrictedConvolution::Format::NCHW4: + props_.emplace_back("format", "NCHW4"); + break; + case RegionRestrictedConvolution::Format::NCHW8: + props_.emplace_back("format", "NCHW8"); + break; + case RegionRestrictedConvolution::Format::NCHW32: + props_.emplace_back("format", "NCHW32"); + break; + case RegionRestrictedConvolution::Format::NCHW88: + props_.emplace_back("format", "NCHW88"); + break; + case RegionRestrictedConvolution::Format::NCHW44: + props_.emplace_back("format", "NCHW44"); + break; + case RegionRestrictedConvolution::Format::NCHW44_DOT: + props_.emplace_back("format", "NCHW44_DOT"); + break; + case RegionRestrictedConvolution::Format::NCHW4_NCHW32: + props_.emplace_back("format", "NCHW4_NCHW32"); + break; + case RegionRestrictedConvolution::Format::NCHW32_NCHW4: + props_.emplace_back("format", "NCHW32_NCHW4"); + break; + case RegionRestrictedConvolution::Format::NCHW4_NCHW: + props_.emplace_back("format", "NCHW4_NCHW"); + break; + case RegionRestrictedConvolution::Format::NHWC_NCHW: + props_.emplace_back("format", "NHWC_NCHW"); + break; + case RegionRestrictedConvolution::Format::NHWC_NCHW4_IC_SMALL: + props_.emplace_back("format", 
"NHWC_NCHW4_IC_SMALL"); + break; + case RegionRestrictedConvolution::Format::NCHW_NCHW4_IC_SMALL: + props_.emplace_back("format", "NCHW_NCHW4_IC_SMALL"); + break; + case RegionRestrictedConvolution::Format::CHWN4: + props_.emplace_back("format", "CHWN4"); + break; + case RegionRestrictedConvolution::Format::NCHW64: + props_.emplace_back("format", "NCHW64"); + break; + case RegionRestrictedConvolution::Format::NCHW4_NHWC: + props_.emplace_back("format", "NCHW4_NHWC"); + break; + default: + props_.emplace_back("format", "INVALID"); + break; + } + switch (op_.compute_mode){ + case RegionRestrictedConvolution::ComputeMode::DEFAULT: + props_.emplace_back("compute_mode", "DEFAULT"); + break; + case RegionRestrictedConvolution::ComputeMode::FLOAT32: + props_.emplace_back("compute_mode", "FLOAT32"); + break; + default: + props_.emplace_back("compute_mode", "INVALID"); + break; + } + return props_; +} +std::string RegionRestrictedConvolution_make_name_impl(const OpDef& def_) { + auto&& op_ = def_.cast_final_safe(); + static_cast(op_); + return "RegionRestrictedConvolution"; +} +} // anonymous namespace +OP_TRAIT_REG(RegionRestrictedConvolution, RegionRestrictedConvolution) + .hash(RegionRestrictedConvolution_hash_impl) + .is_same_st(RegionRestrictedConvolution_is_same_st_impl) + .props(RegionRestrictedConvolution_props_impl) + .make_name(RegionRestrictedConvolution_make_name_impl); + +MGB_DYN_TYPE_OBJ_FINAL_IMPL(RegionRestrictedConvolutionBackwardData); + +namespace { +size_t RegionRestrictedConvolutionBackwardData_hash_impl(const OpDef& def_) { + auto&& op_ = def_.cast_final_safe(); + static_cast(op_); + size_t val = mgb::hash(op_.dyn_typeinfo()); + val = mgb::hash_pair_combine(val, mgb::enumhash()(op_.mode)); + val = mgb::hash_pair_combine(val, mgb::hash(op_.pad_h)); + val = mgb::hash_pair_combine(val, mgb::hash(op_.pad_w)); + val = mgb::hash_pair_combine(val, mgb::hash(op_.stride_h)); + val = mgb::hash_pair_combine(val, mgb::hash(op_.stride_w)); + val = mgb::hash_pair_combine(val, mgb::hash(op_.dilate_h)); + val = mgb::hash_pair_combine(val, mgb::hash(op_.dilate_w)); + val = mgb::hash_pair_combine(val, mgb::enumhash()(op_.sparse)); + val = mgb::hash_pair_combine(val, mgb::enumhash()(op_.format)); + val = mgb::hash_pair_combine(val, mgb::enumhash()(op_.compute_mode)); + return val; +} +bool RegionRestrictedConvolutionBackwardData_is_same_st_impl(const OpDef& lhs_, const OpDef& rhs_) { + auto &&a_ = lhs_.cast_final_safe(), + &&b_ = rhs_.cast_final_safe(); + static_cast(a_); + static_cast(b_); + if (a_.mode != b_.mode) return false; + if (a_.pad_h != b_.pad_h) return false; + if (a_.pad_w != b_.pad_w) return false; + if (a_.stride_h != b_.stride_h) return false; + if (a_.stride_w != b_.stride_w) return false; + if (a_.dilate_h != b_.dilate_h) return false; + if (a_.dilate_w != b_.dilate_w) return false; + if (a_.sparse != b_.sparse) return false; + if (a_.format != b_.format) return false; + if (a_.compute_mode != b_.compute_mode) return false; + return true; +} +std::vector> RegionRestrictedConvolutionBackwardData_props_impl(const OpDef& def_) { + auto&& op_ = def_.cast_final_safe(); + static_cast(op_); + std::vector> props_; + switch (op_.mode){ + case RegionRestrictedConvolutionBackwardData::Mode::CROSS_CORRELATION: + props_.emplace_back("mode", "CROSS_CORRELATION"); + break; + case RegionRestrictedConvolutionBackwardData::Mode::CONVOLUTION: + props_.emplace_back("mode", "CONVOLUTION"); + break; + default: + props_.emplace_back("mode", "INVALID"); + break; + } + props_.emplace_back("pad_h", 
std::to_string(op_.pad_h)); + props_.emplace_back("pad_w", std::to_string(op_.pad_w)); + props_.emplace_back("stride_h", std::to_string(op_.stride_h)); + props_.emplace_back("stride_w", std::to_string(op_.stride_w)); + props_.emplace_back("dilate_h", std::to_string(op_.dilate_h)); + props_.emplace_back("dilate_w", std::to_string(op_.dilate_w)); + switch (op_.sparse){ + case RegionRestrictedConvolutionBackwardData::Sparse::DENSE: + props_.emplace_back("sparse", "DENSE"); + break; + case RegionRestrictedConvolutionBackwardData::Sparse::GROUP: + props_.emplace_back("sparse", "GROUP"); + break; + default: + props_.emplace_back("sparse", "INVALID"); + break; + } + switch (op_.format){ + case RegionRestrictedConvolutionBackwardData::Format::NCHW: + props_.emplace_back("format", "NCHW"); + break; + case RegionRestrictedConvolutionBackwardData::Format::NHWC: + props_.emplace_back("format", "NHWC"); + break; + case RegionRestrictedConvolutionBackwardData::Format::NHWCD4: + props_.emplace_back("format", "NHWCD4"); + break; + case RegionRestrictedConvolutionBackwardData::Format::NCHW4: + props_.emplace_back("format", "NCHW4"); + break; + case RegionRestrictedConvolutionBackwardData::Format::NCHW8: + props_.emplace_back("format", "NCHW8"); + break; + case RegionRestrictedConvolutionBackwardData::Format::NCHW32: + props_.emplace_back("format", "NCHW32"); + break; + case RegionRestrictedConvolutionBackwardData::Format::NCHW88: + props_.emplace_back("format", "NCHW88"); + break; + case RegionRestrictedConvolutionBackwardData::Format::NCHW44: + props_.emplace_back("format", "NCHW44"); + break; + case RegionRestrictedConvolutionBackwardData::Format::NCHW44_DOT: + props_.emplace_back("format", "NCHW44_DOT"); + break; + case RegionRestrictedConvolutionBackwardData::Format::NCHW4_NCHW32: + props_.emplace_back("format", "NCHW4_NCHW32"); + break; + case RegionRestrictedConvolutionBackwardData::Format::NCHW32_NCHW4: + props_.emplace_back("format", "NCHW32_NCHW4"); + break; + case RegionRestrictedConvolutionBackwardData::Format::NCHW4_NCHW: + props_.emplace_back("format", "NCHW4_NCHW"); + break; + case RegionRestrictedConvolutionBackwardData::Format::NHWC_NCHW: + props_.emplace_back("format", "NHWC_NCHW"); + break; + case RegionRestrictedConvolutionBackwardData::Format::NHWC_NCHW4_IC_SMALL: + props_.emplace_back("format", "NHWC_NCHW4_IC_SMALL"); + break; + case RegionRestrictedConvolutionBackwardData::Format::NCHW_NCHW4_IC_SMALL: + props_.emplace_back("format", "NCHW_NCHW4_IC_SMALL"); + break; + case RegionRestrictedConvolutionBackwardData::Format::CHWN4: + props_.emplace_back("format", "CHWN4"); + break; + case RegionRestrictedConvolutionBackwardData::Format::NCHW64: + props_.emplace_back("format", "NCHW64"); + break; + case RegionRestrictedConvolutionBackwardData::Format::NCHW4_NHWC: + props_.emplace_back("format", "NCHW4_NHWC"); + break; + default: + props_.emplace_back("format", "INVALID"); + break; + } + switch (op_.compute_mode){ + case RegionRestrictedConvolutionBackwardData::ComputeMode::DEFAULT: + props_.emplace_back("compute_mode", "DEFAULT"); + break; + case RegionRestrictedConvolutionBackwardData::ComputeMode::FLOAT32: + props_.emplace_back("compute_mode", "FLOAT32"); + break; + default: + props_.emplace_back("compute_mode", "INVALID"); + break; + } + return props_; +} +std::string RegionRestrictedConvolutionBackwardData_make_name_impl(const OpDef& def_) { + auto&& op_ = def_.cast_final_safe(); + static_cast(op_); + return "RegionRestrictedConvolutionBackwardData"; +} +} // anonymous namespace 
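+// As with the forward op above, the OP_TRAIT_REG below publishes the
+// hash/equality/props/name callbacks for this op to the op-trait registry.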
+OP_TRAIT_REG(RegionRestrictedConvolutionBackwardData, RegionRestrictedConvolutionBackwardData) + .hash(RegionRestrictedConvolutionBackwardData_hash_impl) + .is_same_st(RegionRestrictedConvolutionBackwardData_is_same_st_impl) + .props(RegionRestrictedConvolutionBackwardData_props_impl) + .make_name(RegionRestrictedConvolutionBackwardData_make_name_impl); + MGB_DYN_TYPE_OBJ_FINAL_IMPL(Remap); namespace { diff --git a/imperative/tablegen/generated/opdef.cpy.inl b/imperative/tablegen/generated/opdef.cpy.inl index 070dc1f2..35f24bad 100644 --- a/imperative/tablegen/generated/opdef.cpy.inl +++ b/imperative/tablegen/generated/opdef.cpy.inl @@ -15368,6 +15368,580 @@ void _init_py_Reduce(py::module m) { mgb_assert(PyOp(OpDef)::ctype2pytype.emplace(Reduce::typeinfo(), &py_type).second); } +void _init_py_RegionRestrictedConvolution_Mode(PyTypeObject& py_type) { + auto& e_type = EnumWrapper::type; + + Py_INCREF(e_type); + mgb_assert(PyDict_SetItemString( + py_type.tp_dict, "Mode", reinterpret_cast(e_type)) >= 0); +} + +void _init_py_RegionRestrictedConvolution_Sparse(PyTypeObject& py_type) { + auto& e_type = EnumWrapper::type; + + Py_INCREF(e_type); + mgb_assert(PyDict_SetItemString( + py_type.tp_dict, "Sparse", reinterpret_cast(e_type)) >= 0); +} + +void _init_py_RegionRestrictedConvolution_Format(PyTypeObject& py_type) { + auto& e_type = EnumWrapper::type; + + Py_INCREF(e_type); + mgb_assert(PyDict_SetItemString( + py_type.tp_dict, "Format", reinterpret_cast(e_type)) >= 0); +} + +void _init_py_RegionRestrictedConvolution_ComputeMode(PyTypeObject& py_type) { + auto& e_type = EnumWrapper::type; + + Py_INCREF(e_type); + mgb_assert(PyDict_SetItemString( + py_type.tp_dict, "ComputeMode", reinterpret_cast(e_type)) >= 0); +} + +PyOpDefBegin(RegionRestrictedConvolution) // { + static PyGetSetDef py_getsetters[]; + static PyMethodDef tp_methods[]; + + static PyObject* getstate(PyObject* self, PyObject*) { + auto& opdef = reinterpret_cast(self)->inst(); + static_cast(opdef); + std::unordered_map state { + + {"mode", serialization::dump(opdef.mode)}, + {"pad_h", serialization::dump(opdef.pad_h)}, + {"pad_w", serialization::dump(opdef.pad_w)}, + {"stride_h", serialization::dump(opdef.stride_h)}, + {"stride_w", serialization::dump(opdef.stride_w)}, + {"dilate_h", serialization::dump(opdef.dilate_h)}, + {"dilate_w", serialization::dump(opdef.dilate_w)}, + {"sparse", serialization::dump(opdef.sparse)}, + {"format", serialization::dump(opdef.format)}, + {"compute_mode", serialization::dump(opdef.compute_mode)} + }; + return py::cast(state).release().ptr(); + } + static PyObject* setstate(PyObject* self, PyObject* args) { + PyObject* dict = PyTuple_GetItem(args, 0); + if (!dict) return NULL; + auto state = py::cast>(dict); + auto& opdef = reinterpret_cast(self)->inst(); + static_cast(opdef); + + { + auto&& iter = state.find("mode"); + if (iter != state.end()) { + opdef.mode = serialization::load(iter->second); + } + } + + { + auto&& iter = state.find("pad_h"); + if (iter != state.end()) { + opdef.pad_h = serialization::load(iter->second); + } + } + + { + auto&& iter = state.find("pad_w"); + if (iter != state.end()) { + opdef.pad_w = serialization::load(iter->second); + } + } + + { + auto&& iter = state.find("stride_h"); + if (iter != state.end()) { + opdef.stride_h = serialization::load(iter->second); + } + } + + { + auto&& iter = state.find("stride_w"); + if (iter != state.end()) { + opdef.stride_w = serialization::load(iter->second); + } + } + + { + auto&& iter = state.find("dilate_h"); + if (iter != state.end()) 
{ + opdef.dilate_h = serialization::load(iter->second); + } + } + + { + auto&& iter = state.find("dilate_w"); + if (iter != state.end()) { + opdef.dilate_w = serialization::load(iter->second); + } + } + + { + auto&& iter = state.find("sparse"); + if (iter != state.end()) { + opdef.sparse = serialization::load(iter->second); + } + } + + { + auto&& iter = state.find("format"); + if (iter != state.end()) { + opdef.format = serialization::load(iter->second); + } + } + + { + auto&& iter = state.find("compute_mode"); + if (iter != state.end()) { + opdef.compute_mode = serialization::load(iter->second); + } + } + Py_RETURN_NONE; + } + static int py_init(PyObject *self, PyObject *args, PyObject *kwds); +// }; +PyOpDefEnd(RegionRestrictedConvolution) + +int PyOp(RegionRestrictedConvolution)::py_init(PyObject *self, PyObject *args, PyObject *kwds) { + static const char* kwlist[] = {"mode", "pad_h", "pad_w", "stride_h", "stride_w", "dilate_h", "dilate_w", "sparse", "format", "compute_mode", "scope", NULL}; + PyObject *mode = NULL, *pad_h = NULL, *pad_w = NULL, *stride_h = NULL, *stride_w = NULL, *dilate_h = NULL, *dilate_w = NULL, *sparse = NULL, *format = NULL, *compute_mode = NULL, *scope = NULL; + if (!PyArg_ParseTupleAndKeywords(args, kwds, "|OOOOOOOOOOO", const_cast(kwlist), &mode, &pad_h, &pad_w, &stride_h, &stride_w, &dilate_h, &dilate_w, &sparse, &format, &compute_mode, &scope)) + return -1; + + if (mode) { + try { + // TODO: remove this guard which is used for pybind11 implicit conversion + py::detail::loader_life_support guard{}; + reinterpret_cast(self)->inst().mode = + py::cast(py::handle(mode)); + } CATCH_ALL(-1) + } + + if (pad_h) { + try { + // TODO: remove this guard which is used for pybind11 implicit conversion + py::detail::loader_life_support guard{}; + reinterpret_cast(self)->inst().pad_h = + py::cast(py::handle(pad_h)); + } CATCH_ALL(-1) + } + + if (pad_w) { + try { + // TODO: remove this guard which is used for pybind11 implicit conversion + py::detail::loader_life_support guard{}; + reinterpret_cast(self)->inst().pad_w = + py::cast(py::handle(pad_w)); + } CATCH_ALL(-1) + } + + if (stride_h) { + try { + // TODO: remove this guard which is used for pybind11 implicit conversion + py::detail::loader_life_support guard{}; + reinterpret_cast(self)->inst().stride_h = + py::cast(py::handle(stride_h)); + } CATCH_ALL(-1) + } + + if (stride_w) { + try { + // TODO: remove this guard which is used for pybind11 implicit conversion + py::detail::loader_life_support guard{}; + reinterpret_cast(self)->inst().stride_w = + py::cast(py::handle(stride_w)); + } CATCH_ALL(-1) + } + + if (dilate_h) { + try { + // TODO: remove this guard which is used for pybind11 implicit conversion + py::detail::loader_life_support guard{}; + reinterpret_cast(self)->inst().dilate_h = + py::cast(py::handle(dilate_h)); + } CATCH_ALL(-1) + } + + if (dilate_w) { + try { + // TODO: remove this guard which is used for pybind11 implicit conversion + py::detail::loader_life_support guard{}; + reinterpret_cast(self)->inst().dilate_w = + py::cast(py::handle(dilate_w)); + } CATCH_ALL(-1) + } + + if (sparse) { + try { + // TODO: remove this guard which is used for pybind11 implicit conversion + py::detail::loader_life_support guard{}; + reinterpret_cast(self)->inst().sparse = + py::cast(py::handle(sparse)); + } CATCH_ALL(-1) + } + + if (format) { + try { + // TODO: remove this guard which is used for pybind11 implicit conversion + py::detail::loader_life_support guard{}; + reinterpret_cast(self)->inst().format = + 
py::cast(py::handle(format)); + } CATCH_ALL(-1) + } + + if (compute_mode) { + try { + // TODO: remove this guard which is used for pybind11 implicit conversion + py::detail::loader_life_support guard{}; + reinterpret_cast(self)->inst().compute_mode = + py::cast(py::handle(compute_mode)); + } CATCH_ALL(-1) + } + + if (scope) { + try { + reinterpret_cast(self)->op + ->set_scope(py::cast(py::handle(scope))); + } CATCH_ALL(-1) + } + + return 0; +} + +PyGetSetDef PyOp(RegionRestrictedConvolution)::py_getsetters[] = { + {const_cast("mode"), py_get_generic(RegionRestrictedConvolution, mode), py_set_generic(RegionRestrictedConvolution, mode), const_cast("mode"), NULL}, + {const_cast("pad_h"), py_get_generic(RegionRestrictedConvolution, pad_h), py_set_generic(RegionRestrictedConvolution, pad_h), const_cast("pad_h"), NULL}, + {const_cast("pad_w"), py_get_generic(RegionRestrictedConvolution, pad_w), py_set_generic(RegionRestrictedConvolution, pad_w), const_cast("pad_w"), NULL}, + {const_cast("stride_h"), py_get_generic(RegionRestrictedConvolution, stride_h), py_set_generic(RegionRestrictedConvolution, stride_h), const_cast("stride_h"), NULL}, + {const_cast("stride_w"), py_get_generic(RegionRestrictedConvolution, stride_w), py_set_generic(RegionRestrictedConvolution, stride_w), const_cast("stride_w"), NULL}, + {const_cast("dilate_h"), py_get_generic(RegionRestrictedConvolution, dilate_h), py_set_generic(RegionRestrictedConvolution, dilate_h), const_cast("dilate_h"), NULL}, + {const_cast("dilate_w"), py_get_generic(RegionRestrictedConvolution, dilate_w), py_set_generic(RegionRestrictedConvolution, dilate_w), const_cast("dilate_w"), NULL}, + {const_cast("sparse"), py_get_generic(RegionRestrictedConvolution, sparse), py_set_generic(RegionRestrictedConvolution, sparse), const_cast("sparse"), NULL}, + {const_cast("format"), py_get_generic(RegionRestrictedConvolution, format), py_set_generic(RegionRestrictedConvolution, format), const_cast("format"), NULL}, + {const_cast("compute_mode"), py_get_generic(RegionRestrictedConvolution, compute_mode), py_set_generic(RegionRestrictedConvolution, compute_mode), const_cast("compute_mode"), NULL}, + {NULL} /* Sentinel */ +}; + + PyMethodDef PyOp(RegionRestrictedConvolution)::tp_methods[] = { + {const_cast("__getstate__"), PyOp(RegionRestrictedConvolution)::getstate, METH_NOARGS, "RegionRestrictedConvolution getstate"}, + {const_cast("__setstate__"), PyOp(RegionRestrictedConvolution)::setstate, METH_VARARGS, "RegionRestrictedConvolution setstate"}, + {NULL} /* Sentinel */ + }; + +void _init_py_RegionRestrictedConvolution(py::module m) { + using py_op = PyOp(RegionRestrictedConvolution); + auto& py_type = PyOpType(RegionRestrictedConvolution); + py_type = {PyVarObject_HEAD_INIT(NULL, 0)}; + py_type.tp_name = "megengine.core._imperative_rt.ops.RegionRestrictedConvolution"; + py_type.tp_basicsize = sizeof(PyOp(RegionRestrictedConvolution)); + py_type.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE; + py_type.tp_doc = "RegionRestrictedConvolution"; + py_type.tp_base = &PyOpType(OpDef); + py_type.tp_dealloc = py_dealloc_generic; + py_type.tp_new = py_new_generic; + py_type.tp_init = py_op::py_init; + py_type.tp_methods = py_op::tp_methods; + py_type.tp_getset = py_op::py_getsetters; + mgb_assert(PyType_Ready(&py_type) >= 0); + _init_py_RegionRestrictedConvolution_Mode(py_type); + _init_py_RegionRestrictedConvolution_Sparse(py_type); + _init_py_RegionRestrictedConvolution_Format(py_type); + _init_py_RegionRestrictedConvolution_ComputeMode(py_type); + + 
PyType_Modified(&py_type); + m.add_object("RegionRestrictedConvolution", reinterpret_cast(&py_type)); + mgb_assert(PyOp(OpDef)::ctype2pytype.emplace(RegionRestrictedConvolution::typeinfo(), &py_type).second); +} + +void _init_py_RegionRestrictedConvolutionBackwardData_Mode(PyTypeObject& py_type) { + auto& e_type = EnumWrapper::type; + + Py_INCREF(e_type); + mgb_assert(PyDict_SetItemString( + py_type.tp_dict, "Mode", reinterpret_cast(e_type)) >= 0); +} + +void _init_py_RegionRestrictedConvolutionBackwardData_Sparse(PyTypeObject& py_type) { + auto& e_type = EnumWrapper::type; + + Py_INCREF(e_type); + mgb_assert(PyDict_SetItemString( + py_type.tp_dict, "Sparse", reinterpret_cast(e_type)) >= 0); +} + +void _init_py_RegionRestrictedConvolutionBackwardData_Format(PyTypeObject& py_type) { + auto& e_type = EnumWrapper::type; + + Py_INCREF(e_type); + mgb_assert(PyDict_SetItemString( + py_type.tp_dict, "Format", reinterpret_cast(e_type)) >= 0); +} + +void _init_py_RegionRestrictedConvolutionBackwardData_ComputeMode(PyTypeObject& py_type) { + auto& e_type = EnumWrapper::type; + + Py_INCREF(e_type); + mgb_assert(PyDict_SetItemString( + py_type.tp_dict, "ComputeMode", reinterpret_cast(e_type)) >= 0); +} + +PyOpDefBegin(RegionRestrictedConvolutionBackwardData) // { + static PyGetSetDef py_getsetters[]; + static PyMethodDef tp_methods[]; + + static PyObject* getstate(PyObject* self, PyObject*) { + auto& opdef = reinterpret_cast(self)->inst(); + static_cast(opdef); + std::unordered_map state { + + {"mode", serialization::dump(opdef.mode)}, + {"pad_h", serialization::dump(opdef.pad_h)}, + {"pad_w", serialization::dump(opdef.pad_w)}, + {"stride_h", serialization::dump(opdef.stride_h)}, + {"stride_w", serialization::dump(opdef.stride_w)}, + {"dilate_h", serialization::dump(opdef.dilate_h)}, + {"dilate_w", serialization::dump(opdef.dilate_w)}, + {"sparse", serialization::dump(opdef.sparse)}, + {"format", serialization::dump(opdef.format)}, + {"compute_mode", serialization::dump(opdef.compute_mode)} + }; + return py::cast(state).release().ptr(); + } + static PyObject* setstate(PyObject* self, PyObject* args) { + PyObject* dict = PyTuple_GetItem(args, 0); + if (!dict) return NULL; + auto state = py::cast>(dict); + auto& opdef = reinterpret_cast(self)->inst(); + static_cast(opdef); + + { + auto&& iter = state.find("mode"); + if (iter != state.end()) { + opdef.mode = serialization::load(iter->second); + } + } + + { + auto&& iter = state.find("pad_h"); + if (iter != state.end()) { + opdef.pad_h = serialization::load(iter->second); + } + } + + { + auto&& iter = state.find("pad_w"); + if (iter != state.end()) { + opdef.pad_w = serialization::load(iter->second); + } + } + + { + auto&& iter = state.find("stride_h"); + if (iter != state.end()) { + opdef.stride_h = serialization::load(iter->second); + } + } + + { + auto&& iter = state.find("stride_w"); + if (iter != state.end()) { + opdef.stride_w = serialization::load(iter->second); + } + } + + { + auto&& iter = state.find("dilate_h"); + if (iter != state.end()) { + opdef.dilate_h = serialization::load(iter->second); + } + } + + { + auto&& iter = state.find("dilate_w"); + if (iter != state.end()) { + opdef.dilate_w = serialization::load(iter->second); + } + } + + { + auto&& iter = state.find("sparse"); + if (iter != state.end()) { + opdef.sparse = serialization::load(iter->second); + } + } + + { + auto&& iter = state.find("format"); + if (iter != state.end()) { + opdef.format = serialization::load(iter->second); + } + } + + { + auto&& iter = state.find("compute_mode"); 
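+            // fields absent from the state dict keep their current values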
+ if (iter != state.end()) { + opdef.compute_mode = serialization::load(iter->second); + } + } + Py_RETURN_NONE; + } + static int py_init(PyObject *self, PyObject *args, PyObject *kwds); +// }; +PyOpDefEnd(RegionRestrictedConvolutionBackwardData) + +int PyOp(RegionRestrictedConvolutionBackwardData)::py_init(PyObject *self, PyObject *args, PyObject *kwds) { + static const char* kwlist[] = {"mode", "pad_h", "pad_w", "stride_h", "stride_w", "dilate_h", "dilate_w", "sparse", "format", "compute_mode", "scope", NULL}; + PyObject *mode = NULL, *pad_h = NULL, *pad_w = NULL, *stride_h = NULL, *stride_w = NULL, *dilate_h = NULL, *dilate_w = NULL, *sparse = NULL, *format = NULL, *compute_mode = NULL, *scope = NULL; + if (!PyArg_ParseTupleAndKeywords(args, kwds, "|OOOOOOOOOOO", const_cast(kwlist), &mode, &pad_h, &pad_w, &stride_h, &stride_w, &dilate_h, &dilate_w, &sparse, &format, &compute_mode, &scope)) + return -1; + + if (mode) { + try { + // TODO: remove this guard which is used for pybind11 implicit conversion + py::detail::loader_life_support guard{}; + reinterpret_cast(self)->inst().mode = + py::cast(py::handle(mode)); + } CATCH_ALL(-1) + } + + if (pad_h) { + try { + // TODO: remove this guard which is used for pybind11 implicit conversion + py::detail::loader_life_support guard{}; + reinterpret_cast(self)->inst().pad_h = + py::cast(py::handle(pad_h)); + } CATCH_ALL(-1) + } + + if (pad_w) { + try { + // TODO: remove this guard which is used for pybind11 implicit conversion + py::detail::loader_life_support guard{}; + reinterpret_cast(self)->inst().pad_w = + py::cast(py::handle(pad_w)); + } CATCH_ALL(-1) + } + + if (stride_h) { + try { + // TODO: remove this guard which is used for pybind11 implicit conversion + py::detail::loader_life_support guard{}; + reinterpret_cast(self)->inst().stride_h = + py::cast(py::handle(stride_h)); + } CATCH_ALL(-1) + } + + if (stride_w) { + try { + // TODO: remove this guard which is used for pybind11 implicit conversion + py::detail::loader_life_support guard{}; + reinterpret_cast(self)->inst().stride_w = + py::cast(py::handle(stride_w)); + } CATCH_ALL(-1) + } + + if (dilate_h) { + try { + // TODO: remove this guard which is used for pybind11 implicit conversion + py::detail::loader_life_support guard{}; + reinterpret_cast(self)->inst().dilate_h = + py::cast(py::handle(dilate_h)); + } CATCH_ALL(-1) + } + + if (dilate_w) { + try { + // TODO: remove this guard which is used for pybind11 implicit conversion + py::detail::loader_life_support guard{}; + reinterpret_cast(self)->inst().dilate_w = + py::cast(py::handle(dilate_w)); + } CATCH_ALL(-1) + } + + if (sparse) { + try { + // TODO: remove this guard which is used for pybind11 implicit conversion + py::detail::loader_life_support guard{}; + reinterpret_cast(self)->inst().sparse = + py::cast(py::handle(sparse)); + } CATCH_ALL(-1) + } + + if (format) { + try { + // TODO: remove this guard which is used for pybind11 implicit conversion + py::detail::loader_life_support guard{}; + reinterpret_cast(self)->inst().format = + py::cast(py::handle(format)); + } CATCH_ALL(-1) + } + + if (compute_mode) { + try { + // TODO: remove this guard which is used for pybind11 implicit conversion + py::detail::loader_life_support guard{}; + reinterpret_cast(self)->inst().compute_mode = + py::cast(py::handle(compute_mode)); + } CATCH_ALL(-1) + } + + if (scope) { + try { + reinterpret_cast(self)->op + ->set_scope(py::cast(py::handle(scope))); + } CATCH_ALL(-1) + } + + return 0; +} + +PyGetSetDef 
PyOp(RegionRestrictedConvolutionBackwardData)::py_getsetters[] = { + {const_cast("mode"), py_get_generic(RegionRestrictedConvolutionBackwardData, mode), py_set_generic(RegionRestrictedConvolutionBackwardData, mode), const_cast("mode"), NULL}, + {const_cast("pad_h"), py_get_generic(RegionRestrictedConvolutionBackwardData, pad_h), py_set_generic(RegionRestrictedConvolutionBackwardData, pad_h), const_cast("pad_h"), NULL}, + {const_cast("pad_w"), py_get_generic(RegionRestrictedConvolutionBackwardData, pad_w), py_set_generic(RegionRestrictedConvolutionBackwardData, pad_w), const_cast("pad_w"), NULL}, + {const_cast("stride_h"), py_get_generic(RegionRestrictedConvolutionBackwardData, stride_h), py_set_generic(RegionRestrictedConvolutionBackwardData, stride_h), const_cast("stride_h"), NULL}, + {const_cast("stride_w"), py_get_generic(RegionRestrictedConvolutionBackwardData, stride_w), py_set_generic(RegionRestrictedConvolutionBackwardData, stride_w), const_cast("stride_w"), NULL}, + {const_cast("dilate_h"), py_get_generic(RegionRestrictedConvolutionBackwardData, dilate_h), py_set_generic(RegionRestrictedConvolutionBackwardData, dilate_h), const_cast("dilate_h"), NULL}, + {const_cast("dilate_w"), py_get_generic(RegionRestrictedConvolutionBackwardData, dilate_w), py_set_generic(RegionRestrictedConvolutionBackwardData, dilate_w), const_cast("dilate_w"), NULL}, + {const_cast("sparse"), py_get_generic(RegionRestrictedConvolutionBackwardData, sparse), py_set_generic(RegionRestrictedConvolutionBackwardData, sparse), const_cast("sparse"), NULL}, + {const_cast("format"), py_get_generic(RegionRestrictedConvolutionBackwardData, format), py_set_generic(RegionRestrictedConvolutionBackwardData, format), const_cast("format"), NULL}, + {const_cast("compute_mode"), py_get_generic(RegionRestrictedConvolutionBackwardData, compute_mode), py_set_generic(RegionRestrictedConvolutionBackwardData, compute_mode), const_cast("compute_mode"), NULL}, + {NULL} /* Sentinel */ +}; + + PyMethodDef PyOp(RegionRestrictedConvolutionBackwardData)::tp_methods[] = { + {const_cast("__getstate__"), PyOp(RegionRestrictedConvolutionBackwardData)::getstate, METH_NOARGS, "RegionRestrictedConvolutionBackwardData getstate"}, + {const_cast("__setstate__"), PyOp(RegionRestrictedConvolutionBackwardData)::setstate, METH_VARARGS, "RegionRestrictedConvolutionBackwardData setstate"}, + {NULL} /* Sentinel */ + }; + +void _init_py_RegionRestrictedConvolutionBackwardData(py::module m) { + using py_op = PyOp(RegionRestrictedConvolutionBackwardData); + auto& py_type = PyOpType(RegionRestrictedConvolutionBackwardData); + py_type = {PyVarObject_HEAD_INIT(NULL, 0)}; + py_type.tp_name = "megengine.core._imperative_rt.ops.RegionRestrictedConvolutionBackwardData"; + py_type.tp_basicsize = sizeof(PyOp(RegionRestrictedConvolutionBackwardData)); + py_type.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE; + py_type.tp_doc = "RegionRestrictedConvolutionBackwardData"; + py_type.tp_base = &PyOpType(OpDef); + py_type.tp_dealloc = py_dealloc_generic; + py_type.tp_new = py_new_generic; + py_type.tp_init = py_op::py_init; + py_type.tp_methods = py_op::tp_methods; + py_type.tp_getset = py_op::py_getsetters; + mgb_assert(PyType_Ready(&py_type) >= 0); + _init_py_RegionRestrictedConvolutionBackwardData_Mode(py_type); + _init_py_RegionRestrictedConvolutionBackwardData_Sparse(py_type); + _init_py_RegionRestrictedConvolutionBackwardData_Format(py_type); + _init_py_RegionRestrictedConvolutionBackwardData_ComputeMode(py_type); + + PyType_Modified(&py_type); + 
m.add_object("RegionRestrictedConvolutionBackwardData", reinterpret_cast(&py_type)); + mgb_assert(PyOp(OpDef)::ctype2pytype.emplace(RegionRestrictedConvolutionBackwardData::typeinfo(), &py_type).second); +} + template<> struct EnumTrait { static constexpr const char *name = "Remap.InterpolationMode"; static constexpr std::underlying_type_t max = 5 - 1; @@ -18700,6 +19274,8 @@ void _init_py_WarpPerspectiveBackwardMat(py::module m) { _init_py_ROIAlign(m); \ _init_py_ROIPooling(m); \ _init_py_Reduce(m); \ + _init_py_RegionRestrictedConvolution(m); \ + _init_py_RegionRestrictedConvolutionBackwardData(m); \ _init_py_Remap(m); \ _init_py_RemoteRecv(m); \ _init_py_RemoteSend(m); \ diff --git a/imperative/tablegen/generated/opdef.h.inl b/imperative/tablegen/generated/opdef.h.inl index 294a6b29..32493046 100644 --- a/imperative/tablegen/generated/opdef.h.inl +++ b/imperative/tablegen/generated/opdef.h.inl @@ -1517,6 +1517,58 @@ public: } }; +class RegionRestrictedConvolution : public OpDefImplBase { + MGB_DYN_TYPE_OBJ_FINAL_DECL; + +public: + using Mode = ::megdnn::param::Convolution::Mode; + using Sparse = ::megdnn::param::Convolution::Sparse; + using Format = ::megdnn::param::Convolution::Format; + using ComputeMode = ::megdnn::param::Convolution::ComputeMode; + Mode mode = ::megdnn::param::Convolution::Mode::CROSS_CORRELATION; + uint32_t pad_h = 0; + uint32_t pad_w = 0; + uint32_t stride_h = 1; + uint32_t stride_w = 1; + uint32_t dilate_h = 1; + uint32_t dilate_w = 1; + Sparse sparse = ::megdnn::param::Convolution::Sparse::DENSE; + Format format = ::megdnn::param::Convolution::Format::NCHW; + ComputeMode compute_mode = ::megdnn::param::Convolution::ComputeMode::DEFAULT; + RegionRestrictedConvolution() = default; + RegionRestrictedConvolution(Mode mode_, uint32_t pad_h_, uint32_t pad_w_, uint32_t stride_h_, uint32_t stride_w_, uint32_t dilate_h_, uint32_t dilate_w_, Sparse sparse_, Format format_, ComputeMode compute_mode_, std::string scope_ = {}): mode(mode_), pad_h(pad_h_), pad_w(pad_w_), stride_h(stride_h_), stride_w(stride_w_), dilate_h(dilate_h_), dilate_w(dilate_w_), sparse(sparse_), format(format_), compute_mode(compute_mode_) { set_scope(scope_); } + RegionRestrictedConvolution(::megdnn::param::Convolution packed_param_0): mode(packed_param_0.mode), pad_h(packed_param_0.pad_h), pad_w(packed_param_0.pad_w), stride_h(packed_param_0.stride_h), stride_w(packed_param_0.stride_w), dilate_h(packed_param_0.dilate_h), dilate_w(packed_param_0.dilate_w), sparse(packed_param_0.sparse), format(packed_param_0.format), compute_mode(packed_param_0.compute_mode) {} + ::megdnn::param::Convolution param() const { + return {mode, pad_h, pad_w, stride_h, stride_w, dilate_h, dilate_w, sparse, format, compute_mode}; + } +}; + +class RegionRestrictedConvolutionBackwardData : public OpDefImplBase { + MGB_DYN_TYPE_OBJ_FINAL_DECL; + +public: + using Mode = ::megdnn::param::Convolution::Mode; + using Sparse = ::megdnn::param::Convolution::Sparse; + using Format = ::megdnn::param::Convolution::Format; + using ComputeMode = ::megdnn::param::Convolution::ComputeMode; + Mode mode = ::megdnn::param::Convolution::Mode::CROSS_CORRELATION; + uint32_t pad_h = 0; + uint32_t pad_w = 0; + uint32_t stride_h = 1; + uint32_t stride_w = 1; + uint32_t dilate_h = 1; + uint32_t dilate_w = 1; + Sparse sparse = ::megdnn::param::Convolution::Sparse::DENSE; + Format format = ::megdnn::param::Convolution::Format::NCHW; + ComputeMode compute_mode = ::megdnn::param::Convolution::ComputeMode::DEFAULT; + 
+
+class RegionRestrictedConvolutionBackwardData : public OpDefImplBase<RegionRestrictedConvolutionBackwardData> {
+    MGB_DYN_TYPE_OBJ_FINAL_DECL;
+
+public:
+    using Mode = ::megdnn::param::Convolution::Mode;
+    using Sparse = ::megdnn::param::Convolution::Sparse;
+    using Format = ::megdnn::param::Convolution::Format;
+    using ComputeMode = ::megdnn::param::Convolution::ComputeMode;
+    Mode mode = ::megdnn::param::Convolution::Mode::CROSS_CORRELATION;
+    uint32_t pad_h = 0;
+    uint32_t pad_w = 0;
+    uint32_t stride_h = 1;
+    uint32_t stride_w = 1;
+    uint32_t dilate_h = 1;
+    uint32_t dilate_w = 1;
+    Sparse sparse = ::megdnn::param::Convolution::Sparse::DENSE;
+    Format format = ::megdnn::param::Convolution::Format::NCHW;
+    ComputeMode compute_mode = ::megdnn::param::Convolution::ComputeMode::DEFAULT;
+    RegionRestrictedConvolutionBackwardData() = default;
+    RegionRestrictedConvolutionBackwardData(Mode mode_, uint32_t pad_h_, uint32_t pad_w_, uint32_t stride_h_, uint32_t stride_w_, uint32_t dilate_h_, uint32_t dilate_w_, Sparse sparse_, Format format_, ComputeMode compute_mode_, std::string scope_ = {}): mode(mode_), pad_h(pad_h_), pad_w(pad_w_), stride_h(stride_h_), stride_w(stride_w_), dilate_h(dilate_h_), dilate_w(dilate_w_), sparse(sparse_), format(format_), compute_mode(compute_mode_) { set_scope(scope_); }
+    RegionRestrictedConvolutionBackwardData(::megdnn::param::Convolution packed_param_0): mode(packed_param_0.mode), pad_h(packed_param_0.pad_h), pad_w(packed_param_0.pad_w), stride_h(packed_param_0.stride_h), stride_w(packed_param_0.stride_w), dilate_h(packed_param_0.dilate_h), dilate_w(packed_param_0.dilate_w), sparse(packed_param_0.sparse), format(packed_param_0.format), compute_mode(packed_param_0.compute_mode) {}
+    ::megdnn::param::Convolution param() const {
+        return {mode, pad_h, pad_w, stride_h, stride_w, dilate_h, dilate_w, sparse, format, compute_mode};
+    }
+};
+
 class Remap : public OpDefImplBase<Remap> {
     MGB_DYN_TYPE_OBJ_FINAL_DECL;
 
diff --git a/imperative/tablegen/generated/opdef.py.inl b/imperative/tablegen/generated/opdef.py.inl
index 78115700..f93be1bd 100644
--- a/imperative/tablegen/generated/opdef.py.inl
+++ b/imperative/tablegen/generated/opdef.py.inl
@@ -1620,6 +1620,52 @@ ReduceInst
     .def_readwrite("data_type", &Reduce::data_type)
     .def_readwrite("keepdim", &Reduce::keepdim);
 
+py::class_<RegionRestrictedConvolution, std::shared_ptr<RegionRestrictedConvolution>, OpDef> RegionRestrictedConvolutionInst(m, "RegionRestrictedConvolution");
+
+RegionRestrictedConvolutionInst.attr("Mode") = BatchConvBiasInst.attr("Mode");
+
+RegionRestrictedConvolutionInst.attr("Sparse") = BatchConvBiasInst.attr("Sparse");
+
+RegionRestrictedConvolutionInst.attr("Format") = AdaptivePoolingInst.attr("Format");
+
+RegionRestrictedConvolutionInst.attr("ComputeMode") = BatchConvBiasInst.attr("ComputeMode");
+
+RegionRestrictedConvolutionInst
+    .def(py::init<::megdnn::param::Convolution::Mode, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, ::megdnn::param::Convolution::Sparse, ::megdnn::param::Convolution::Format, ::megdnn::param::Convolution::ComputeMode, std::string>(), py::arg("mode") = ::megdnn::param::Convolution::Mode::CROSS_CORRELATION, py::arg("pad_h") = 0, py::arg("pad_w") = 0, py::arg("stride_h") = 1, py::arg("stride_w") = 1, py::arg("dilate_h") = 1, py::arg("dilate_w") = 1, py::arg("sparse") = ::megdnn::param::Convolution::Sparse::DENSE, py::arg("format") = ::megdnn::param::Convolution::Format::NCHW, py::arg("compute_mode") = ::megdnn::param::Convolution::ComputeMode::DEFAULT, py::arg("scope") = {})
+    .def_readwrite("mode", &RegionRestrictedConvolution::mode)
+    .def_readwrite("pad_h", &RegionRestrictedConvolution::pad_h)
+    .def_readwrite("pad_w", &RegionRestrictedConvolution::pad_w)
+    .def_readwrite("stride_h", &RegionRestrictedConvolution::stride_h)
+    .def_readwrite("stride_w", &RegionRestrictedConvolution::stride_w)
+    .def_readwrite("dilate_h", &RegionRestrictedConvolution::dilate_h)
+    .def_readwrite("dilate_w", &RegionRestrictedConvolution::dilate_w)
+    .def_readwrite("sparse", &RegionRestrictedConvolution::sparse)
+    .def_readwrite("format", &RegionRestrictedConvolution::format)
+    .def_readwrite("compute_mode", &RegionRestrictedConvolution::compute_mode);
+
+py::class_<RegionRestrictedConvolutionBackwardData, std::shared_ptr<RegionRestrictedConvolutionBackwardData>, OpDef> RegionRestrictedConvolutionBackwardDataInst(m, "RegionRestrictedConvolutionBackwardData");
+
+RegionRestrictedConvolutionBackwardDataInst.attr("Mode") = BatchConvBiasInst.attr("Mode");
+
+RegionRestrictedConvolutionBackwardDataInst.attr("Sparse") = BatchConvBiasInst.attr("Sparse");
+
+RegionRestrictedConvolutionBackwardDataInst.attr("Format") = AdaptivePoolingInst.attr("Format");
+
+RegionRestrictedConvolutionBackwardDataInst.attr("ComputeMode") = BatchConvBiasInst.attr("ComputeMode");
+
+RegionRestrictedConvolutionBackwardDataInst
+    .def(py::init<::megdnn::param::Convolution::Mode, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, ::megdnn::param::Convolution::Sparse, ::megdnn::param::Convolution::Format, ::megdnn::param::Convolution::ComputeMode, std::string>(), py::arg("mode") = ::megdnn::param::Convolution::Mode::CROSS_CORRELATION, py::arg("pad_h") = 0, py::arg("pad_w") = 0, py::arg("stride_h") = 1, py::arg("stride_w") = 1, py::arg("dilate_h") = 1, py::arg("dilate_w") = 1, py::arg("sparse") = ::megdnn::param::Convolution::Sparse::DENSE, py::arg("format") = ::megdnn::param::Convolution::Format::NCHW, py::arg("compute_mode") = ::megdnn::param::Convolution::ComputeMode::DEFAULT, py::arg("scope") = {})
+    .def_readwrite("mode", &RegionRestrictedConvolutionBackwardData::mode)
+    .def_readwrite("pad_h", &RegionRestrictedConvolutionBackwardData::pad_h)
+    .def_readwrite("pad_w", &RegionRestrictedConvolutionBackwardData::pad_w)
+    .def_readwrite("stride_h", &RegionRestrictedConvolutionBackwardData::stride_h)
+    .def_readwrite("stride_w", &RegionRestrictedConvolutionBackwardData::stride_w)
+    .def_readwrite("dilate_h", &RegionRestrictedConvolutionBackwardData::dilate_h)
+    .def_readwrite("dilate_w", &RegionRestrictedConvolutionBackwardData::dilate_w)
+    .def_readwrite("sparse", &RegionRestrictedConvolutionBackwardData::sparse)
+    .def_readwrite("format", &RegionRestrictedConvolutionBackwardData::format)
+    .def_readwrite("compute_mode", &RegionRestrictedConvolutionBackwardData::compute_mode);
+
 py::class_<Remap, std::shared_ptr<Remap>, OpDef> RemapInst(m, "Remap");
 
 py::enum_<Remap::InterpolationMode>(RemapInst, "InterpolationMode")
diff --git a/src/core/include/megbrain/ir/ops.td b/src/core/include/megbrain/ir/ops.td
index ac696807..ecf403e0 100644
--- a/src/core/include/megbrain/ir/ops.td
+++ b/src/core/include/megbrain/ir/ops.td
@@ -520,4 +520,9 @@ def MeshGrid: MgbHashableOp<"MeshGrid"> {
     MgbStringAttr:$indexing
   );
 }
+
+def RegionRestrictedConvolution: MgbHashableOp<"RegionRestrictedConvolution", [ConvolutionParam]>;
+
+def RegionRestrictedConvolutionBackwardData: MgbHashableOp<"RegionRestrictedConvolutionBackwardData", [ConvolutionParam]>;
+
 #endif // MGB_OPS
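Both TableGen defs attach the existing ConvolutionParam, which is what expands into the ten attributes and packed-param constructors in the generated code above. The megbrain-side shape contract implemented in the next hunk defers entirely to megdnn's deduce_layout over the four inputs; a rough sketch of that deduction done directly against the naive handle (layout variables are placeholders; pattern as in the tests at the end of this patch):

    auto dnn_opr = megdnn_naive_handle()
            ->create_operator<megdnn::RegionRestrictedConvolutionBackwardData>();
    dnn_opr->param() = param;  // a megdnn::param::Convolution, assumed configured
    TensorLayout grad_layout{dtype::Float32()};
    dnn_opr->deduce_layout(
            filter_layout, diff_layout, rin_layout, rout_layout, grad_layout);
    // grad_layout now carries the shape of the original src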
diff --git a/src/opr/impl/dnn/convolution.cpp b/src/opr/impl/dnn/convolution.cpp
index e2137d26..13a858ba 100644
--- a/src/opr/impl/dnn/convolution.cpp
+++ b/src/opr/impl/dnn/convolution.cpp
@@ -25,6 +25,58 @@ using namespace cg::static_infer;
 using intl::WorkspaceLimitGetter;
 
 /* ==================== misc impl ==================== */
+template <class MgbOpr, class MegDNNOpr>
+void mixin::RegionConvBackwardDataMixin::init_output_static_infer_desc_for_bwd_data(
+        cg::OperatorNodeBase* self) {
+    using namespace cg::static_infer;
+    auto&& mgr = self->owner_graph()->static_infer_manager();
+
+    DepVal inp_deps;
+    inp_deps.reserve(6);
+    for (int i = 0; i < 4; i++) {
+        inp_deps.push_back({self->input(i), DepType::SHAPE});
+    }
+
+    auto infer_shp = [self](TensorShape& dest, const InpVal& inp) {
+        TensorLayout ol{self->output(0)->dtype()};
+        mgb_assert(
+                self->input(0)->dtype().category() == DTypeCategory::FLOAT &&
+                        self->input(1)->dtype().category() == DTypeCategory::FLOAT &&
+                        self->input(2)->dtype().category() == DTypeCategory::INT &&
+                        self->input(3)->dtype().category() == DTypeCategory::INT,
+                "region conv dtype assert error!");
+        static_cast<MgbOpr*>(self)->megdnn_opr()->deduce_layout(
+                {inp.val.at(0).shape(), self->input(0)->dtype()},  // filter
+                {inp.val.at(1).shape(), self->input(1)->dtype()},  // diff
+                {inp.val.at(2).shape(), self->input(2)->dtype()},  // rin
+                {inp.val.at(3).shape(), self->input(3)->dtype()},  // rout
+                ol                                                 // grad
+        );
+        dest = ol;
+        return true;
+    };
+    mgr.register_shape_infer(self->output(0), {SourceType::DEP, inp_deps, infer_shp});
+
+    // workspace size
+    auto infer_wk = [self](TensorShape& dest, const InpVal& inp) {
+        TensorLayout ol{self->output(0)->dtype()};
+        dest.ndim = 1;
+        dest.shape[0] =
+                static_cast<MgbOpr*>(self)->megdnn_opr()->get_workspace_in_bytes(
+                        {self->input(0)->shape(), self->input(0)->dtype()},  // filter
+                        {self->input(1)->shape(), self->input(1)->dtype()},  // diff
+                        {self->input(2)->shape(), self->input(2)->dtype()},  // rin
+                        {self->input(3)->shape(), self->input(3)->dtype()},  // rout
+                        ol);
+        return true;
+    };
+    inp_deps.push_back({self->output(0), DepType::SHAPE});
+    auto workspace_dep_var =
+            intl::WorkspaceLimitGetter::register_to_graph(self->owner_graph());
+    if (workspace_dep_var)
+        inp_deps.push_back({workspace_dep_var, DepType::VALUE});
+    mgr.register_shape_infer(self->output(1), {SourceType::DEP, inp_deps, infer_wk});
+}
 
 template <class MgbOpr, class MegDNNOpr>
 void mixin::ConvolutionBackwardDataMixin::init_output_static_infer_desc_for_bwd_data(
@@ -1535,6 +1587,226 @@ void BatchConvBiasForward::init_output_format() {
     output(0)->format(input(0)->format());
 }
 
+/* ========================== RegionRestrictedConvolutionForward
+ * ========================== */
+
+IMPL_CONV(RegionRestrictedConvolutionForward);
+
+RegionRestrictedConvolutionForward::RegionRestrictedConvolutionForward(
+        VarNode* src, VarNode* filter, VarNode* region_in, VarNode* region_out,
+        const Param& param, const OperatorNodeConfig& config)
+        : Super(src->owner_graph(), config, "region_restricted_conv_fwd",
+                {src, filter, region_in, region_out}) {
+    init_megdnn_opr(*this, param);
+    add_input({src, filter, region_in, region_out});
+}
+
+SymbolVar RegionRestrictedConvolutionForward::make(
+        SymbolVar src, SymbolVar filter, SymbolVar region_in, SymbolVar region_out,
+        const Param& param, const OperatorNodeConfig& config) {
+    return src.insert_single_output_opr<RegionRestrictedConvolutionForward>(
+            src.node(), filter.node(), region_in.node(), region_out.node(), param,
+            config);
+}
+
+void RegionRestrictedConvolutionForward::init_output_dtype() {
+    mgb_assert(
+            input(0)->dtype().category() == DTypeCategory::FLOAT,
+            "input dtype only support FLOAT, but got input dtype: %s",
+            input(0)->dtype().name());
+    output(0)->dtype(input(0)->dtype());
+    return;
+}
+
+size_t RegionRestrictedConvolutionForward::get_workspace_size_bytes(
+        const TensorShapeArray& input_shapes,
+        const TensorShapeArray& output_shapes) const {
+    return megdnn_opr()->get_workspace_in_bytes(
+            {input_shapes[0], input(0)->dtype(), input(0)->format()},
+            {input_shapes[1], input(1)->dtype(), input(1)->format()},
+            {input_shapes[2], input(2)->dtype(), input(2)->format()},
+            {input_shapes[3], input(3)->dtype(), input(3)->format()},
+            {output_shapes[0], output(0)->dtype(), output(0)->format()});
+}
+
+#if MGB_ENABLE_GRAD
+MGB_IMPL_OPR_GRAD(RegionRestrictedConvolutionForward) {
+    mgb_assert(
+            opr.input(0)->dtype().category() == DTypeCategory::FLOAT &&
+                    opr.input(1)->dtype().category() == DTypeCategory::FLOAT &&
+                    opr.input(2)->dtype().category() == DTypeCategory::INT &&
+                    opr.input(3)->dtype().category() == DTypeCategory::INT,
+            "only float data type supported for grad");
supported for grad"); + if (wrt_idx == 0) { // src + SymbolVar grad = RegionRestrictedConvolutionBackwardData::make( + opr.input(1), // filter + out_grad[0], // diff + opr.input(2), // rin + opr.input(3), // rout + opr.input(0), // src + opr.param()); + return grad.node(); + } + // TODO: CUDA WGRAD UNIMPLEMENTED! + if (wrt_idx == 1) { // filter + SymbolVar grad = RegionRestrictedConvolutionBackwardFilter::make( + opr.input(0), // src + out_grad[0], // diff + opr.input(2), // rin + opr.input(3), // rout + opr.input(1), // filter + opr.param()); + return grad.node(); + } + return nullptr; +} +#endif + +/* ========================== RegionRestrictedConvolutionBackwardData + * ========================== */ +IMPL_CONV(RegionRestrictedConvolutionBackwardData); + +RegionRestrictedConvolutionBackwardData::RegionRestrictedConvolutionBackwardData( + VarNode* filter, VarNode* diff, VarNode* region_in, VarNode* region_out, + VarNode* src, const Param& param, const OperatorNodeConfig& config) + : Super{filter->owner_graph(), + config, + "region_restricted_conv_bwd_data", + {filter, diff, region_in, region_out}} { + init_megdnn_opr(*this, param); + add_input({filter, diff, region_in, region_out}); + if (src) + add_input({src}); +} + +SymbolVar RegionRestrictedConvolutionBackwardData::make( + SymbolVar filter, SymbolVar diff, SymbolVar region_in, SymbolVar region_out, + SymbolVar src, const Param& param, const OperatorNodeConfig& config) { + return filter.insert_single_output_opr( + filter.node(), diff.node(), region_in.node(), region_out.node(), src.node(), + param, config); +} + +SymbolVar RegionRestrictedConvolutionBackwardData::make( + SymbolVar filter, SymbolVar diff, SymbolVar region_in, SymbolVar region_out, + const Param& param, const OperatorNodeConfig& config) { + return make(filter, diff, region_in, region_out, {}, param, config); +} + +void RegionRestrictedConvolutionBackwardData::init_output_static_infer_desc() { + init_output_static_infer_desc_for_bwd_data< + RegionRestrictedConvolutionBackwardData, + megdnn::RegionRestrictedConvolutionBackwardData>(this); +} + +void RegionRestrictedConvolutionBackwardData::init_output_dtype() { + output(0)->dtype(input(0)->dtype()); +} + +void RegionRestrictedConvolutionBackwardData::scn_do_execute() { + megdnn_opr()->exec( + input(0)->dev_tensor().as_megdnn(), // filter + input(1)->dev_tensor().as_megdnn(), // diff + input(2)->dev_tensor().as_megdnn(), // rin + input(3)->dev_tensor().as_megdnn(), // rout + output(0)->dev_tensor().as_megdnn(), + intl::get_megdnn_workspace_from_var(output().back())); +} + +cg::OperatorNodeBase::NodeProp* RegionRestrictedConvolutionBackwardData:: + do_make_node_prop() const { + auto prop = Super::Super::do_make_node_prop(); + if (input().size() == 5) { + using D = NodeProp::DepType; + prop->reset_dep_type( + input(), + {D::DEV_VALUE, D::DEV_VALUE, D::DEV_VALUE, D::DEV_VALUE, D::SHAPE}); + } + return prop; +} + +#if MGB_ENABLE_GRAD +MGB_IMPL_OPR_GRAD(RegionRestrictedConvolutionBackwardData) { + if (wrt_idx == 0) { // filter + return RegionRestrictedConvolutionBackwardFilter::make( + out_grad[0], opr.input(1), opr.input(2), opr.input(3), + opr.input(0), opr.param()) + .node(); + } + if (wrt_idx == 1) { // diff + return RegionRestrictedConvolution::make( + out_grad[0], opr.input(0), opr.input(2), opr.input(3), + opr.param()) + .node(); + } + return nullptr; +} +#endif + +/* ========================== RegionRestrictedConvolutionBackwardFilter + * ========================== */ 
+
+/* ========================== RegionRestrictedConvolutionBackwardFilter
+ * ========================== */
+IMPL_CONV(RegionRestrictedConvolutionBackwardFilter);
+
+RegionRestrictedConvolutionBackwardFilter::RegionRestrictedConvolutionBackwardFilter(
+        VarNode* src, VarNode* diff, VarNode* region_in, VarNode* region_out,
+        VarNode* filter, const Param& param, const OperatorNodeConfig& config)
+        : Super({src->owner_graph(),
+                 config,
+                 "region_restricted_conv_bwd_filter",
+                 {src, diff, region_in, region_out, filter}},
+                4, false) {
+    init_megdnn_opr(*this, param);
+    add_input({src, diff, region_in, region_out, filter});
+}
+
+SymbolVar RegionRestrictedConvolutionBackwardFilter::make(
+        SymbolVar src, SymbolVar diff, SymbolVar region_in, SymbolVar region_out,
+        SymbolVar filter, const Param& param, const OperatorNodeConfig& config) {
+    return src.insert_single_output_opr<RegionRestrictedConvolutionBackwardFilter>(
+            src.node(), diff.node(), region_in.node(), region_out.node(), filter.node(),
+            param, config);
+}
+
+size_t RegionRestrictedConvolutionBackwardFilter::get_workspace_size_bytes(
+        const TensorShapeArray& input_shapes,
+        const TensorShapeArray& output_shapes) const {
+    return megdnn_opr()->get_workspace_in_bytes(
+            {input_shapes[0], input(0)->dtype(), input(0)->format()},
+            {input_shapes[1], input(1)->dtype(), input(1)->format()},
+            {input_shapes[2], input(2)->dtype(), input(2)->format()},
+            {input_shapes[3], input(3)->dtype(), input(3)->format()},
+            {output_shapes[0], output(0)->dtype(), output(0)->format()});
+}
+
+void RegionRestrictedConvolutionBackwardFilter::scn_do_execute() {
+    megdnn_opr()->exec(
+            input(0)->dev_tensor().as_megdnn(),  // src
+            input(1)->dev_tensor().as_megdnn(),  // diff
+            input(2)->dev_tensor().as_megdnn(),  // rin
+            input(3)->dev_tensor().as_megdnn(),  // rout
+            output(0)->dev_tensor().as_megdnn(),
+            intl::get_megdnn_workspace_from_var(output().back()));
+}
+
+#if MGB_ENABLE_GRAD
+MGB_IMPL_OPR_GRAD(RegionRestrictedConvolutionBackwardFilter) {
+    if (wrt_idx == 0) {
+        return RegionRestrictedConvolutionBackwardData::make(
+                       out_grad[0] /*filter*/, opr.input(1) /*diff*/,
+                       opr.input(2) /*rin*/, opr.input(3) /*rout*/,
+                       opr.input(0) /*src*/, opr.param())
+                .node();
+    }
+    if (wrt_idx == 1) {
+        return RegionRestrictedConvolution::make(
+                       opr.input(0) /*src*/, out_grad[0] /*filter*/,
+                       opr.input(2) /*rin*/, opr.input(3) /*rout*/, opr.param())
+                .node();
+    }
+    return nullptr;
+}
+#endif
+
 #undef IMPL_CONV
 
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/src/opr/impl/dnn/dnn.sereg.h b/src/opr/impl/dnn/dnn.sereg.h
index cef8825b..54a05d9e 100644
--- a/src/opr/impl/dnn/dnn.sereg.h
+++ b/src/opr/impl/dnn/dnn.sereg.h
@@ -431,6 +431,7 @@ struct OprLoadDumpImpl
           MakeConvCallerEmpty<megdnn::param::Convolution3D>,
           MakeConvCallerEmpty<megdnn::param::Convolution3D>,
           megdnn::param::Convolution3D> {};
+
 template <>
 struct OprLoadDumpImpl
         : public ConvLoadDumpImpl<
diff --git a/src/opr/impl/dnn/dnn.sereg.v2.h b/src/opr/impl/dnn/dnn.sereg.v2.h
index bd8b467d..e57168b5 100644
--- a/src/opr/impl/dnn/dnn.sereg.v2.h
+++ b/src/opr/impl/dnn/dnn.sereg.v2.h
@@ -194,6 +194,30 @@ struct OprLoadDumpImplV2
           MakeConvCaller5<megdnn::Convolution>,
           megdnn::Convolution> {};
 
+template <>
+struct OprMaker<opr::RegionRestrictedConvolutionBackwardData, 0> {
+    using Opr = opr::RegionRestrictedConvolutionBackwardData;
+    using Param = Opr::Param;
+    static cg::OperatorNodeBase* make(
+            const Param& param, const cg::VarNodeArray& inputs, ComputingGraph& graph,
+            const OperatorNodeConfig& config) {
+        MGB_MARK_USED_VAR(graph);
+        if (inputs.size() == 4) {  // deconv mode
+            return Opr::make(inputs[0], inputs[1], inputs[2], inputs[3], param, config)
+                    .node()
+                    ->owner_opr();
+        } else if (inputs.size() == 5) {  // dgrad mode
+            return Opr::make(
+                           inputs[0], inputs[1], inputs[2], inputs[3], inputs[4], param,
+                           config)
+                    .node()
+                    ->owner_opr();
+        } else {
+            return nullptr;
+        }
+    }
+};
+
 }  // namespace serialization
 
 namespace opr {
@@ -220,6 +244,10 @@ SERGE_OPR_V2_NO_CONVERTER(Convolution3D, 0);
 SERGE_OPR_V2_NO_CONVERTER(Convolution3DBackwardData, 0);
 SERGE_OPR_V2_NO_CONVERTER(Convolution3DBackwardFilter, 0);
 
+MGB_SEREG_OPR(RegionRestrictedConvolutionBackwardData, 0);
+MGB_SEREG_OPR(RegionRestrictedConvolution, 4);
+MGB_SEREG_OPR(RegionRestrictedConvolutionBackwardFilter, 5);
+
 SERGE_OPR_V2_NO_CONVERTER(LocalShareForward, 0);
 SERGE_OPR_V2_NO_CONVERTER(LocalShareBackwardData, 0);
 SERGE_OPR_V2_NO_CONVERTER(LocalShareBackwardFilter, 0);
diff --git a/src/opr/include/megbrain/opr/dnn/convolution.h b/src/opr/include/megbrain/opr/dnn/convolution.h
index c0ca9f0f..337b4ef9 100644
--- a/src/opr/include/megbrain/opr/dnn/convolution.h
+++ b/src/opr/include/megbrain/opr/dnn/convolution.h
@@ -18,6 +18,12 @@ protected:
     static void init_output_static_infer_desc_for_bwd_data(cg::OperatorNodeBase* self);
 };
 
+class RegionConvBackwardDataMixin : public cg::OperatorNodeMixinBase {
+protected:
+    template <class MgbOpr, class MegDNNOpr>
+    static void init_output_static_infer_desc_for_bwd_data(cg::OperatorNodeBase* self);
+};
+
 class WeightPreprocessExecutor : public cg::OperatorNodeMixinBase {
     class PreprocessedFilterExecDep;
 
@@ -83,6 +89,80 @@ class ConvolutionTestingPeer;
 }  // namespace testing
 
+/* ==================== RegionRestrictedConvolutionForward ==================== */
+MGB_DEFINE_OPR_CLASS_WITH_EXPORT(
+        RegionRestrictedConvolutionForward,
+        intl::MegDNNOprWrapperFwd<megdnn::RegionRestrictedConvolutionForward>) // {
+    size_t get_workspace_size_bytes(
+            const TensorShapeArray& input_shapes,
+            const TensorShapeArray& output_shapes) const override;
+    void init_output_dtype() override;
+
+public:
+    MGE_WIN_DECLSPEC_FUC RegionRestrictedConvolutionForward(
+            VarNode* src, VarNode* filter, VarNode* region_in, VarNode* region_out,
+            const Param& param, const OperatorNodeConfig& config);
+
+    MGE_WIN_DECLSPEC_FUC static SymbolVar make(
+            SymbolVar src, SymbolVar filter, SymbolVar region_in, SymbolVar region_out,
+            const Param& param, const OperatorNodeConfig& config = {});
+};
+using RegionRestrictedConvolution = RegionRestrictedConvolutionForward;
+
+/* ==================== RegionRestrictedConvolutionBackwardData ==================== */
+MGB_DEFINE_OPR_CLASS_WITH_EXPORT(
+        RegionRestrictedConvolutionBackwardData,
+        cg::SingleCNOperatorNodeBaseT<mixin::MegDNNOprHolderImpl<
+                megdnn::RegionRestrictedConvolutionBackwardData>>,
+        public mixin::RegionConvBackwardDataMixin) // {
+    void scn_do_execute() override;
+    void init_output_static_infer_desc() override;
+    NodeProp* do_make_node_prop() const override;
+    void init_output_dtype() override;
+
+public:
+    MGE_WIN_DECLSPEC_FUC RegionRestrictedConvolutionBackwardData(
+            VarNode* filter, VarNode* diff, VarNode* region_in, VarNode* region_out,
+            VarNode* src, const Param& param, const OperatorNodeConfig& config);
+
+    // grad mode
+    MGE_WIN_DECLSPEC_FUC static SymbolVar make(
+            SymbolVar filter, SymbolVar diff, SymbolVar region_in, SymbolVar region_out,
+            SymbolVar src, const Param& param, const OperatorNodeConfig& config = {});
+
+    // sereg for deconv mode
+    MGE_WIN_DECLSPEC_FUC static SymbolVar make(
+            SymbolVar filter, SymbolVar diff, SymbolVar region_in, SymbolVar region_out,
+            const Param& param, const OperatorNodeConfig& config = {});
+
+    // user interface for deconv
+    MGE_WIN_DECLSPEC_FUC static SymbolVar make_deconv(
+            SymbolVar data, SymbolVar filter, SymbolVar region_in, SymbolVar region_out,
+            const Param& param = {}, const OperatorNodeConfig& config = {}) {
+        return make(filter, data, region_in, region_out, param, config);
+    }
+};
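make_deconv above simply swaps data/filter into the 4-input overload, so a transposed ("deconv") region-restricted convolution needs no separate opr; the grad-shaped output is inferred from {filter, diff, rin, rout} by the mixin registered earlier. A usage sketch (SymbolVars assumed prepared elsewhere):

    auto up = opr::RegionRestrictedConvolutionBackwardData::make_deconv(
            data, filter, rin, rout, param);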
+
+/* ==================== RegionRestrictedConvolutionBackwardFilter ==================== */
+MGB_DEFINE_OPR_CLASS_WITH_EXPORT(
+        RegionRestrictedConvolutionBackwardFilter,
+        intl::MegDNNOprWrapperBwd<megdnn::RegionRestrictedConvolutionBackwardFilter>) // {
+    size_t get_workspace_size_bytes(
+            const TensorShapeArray& input_shapes,
+            const TensorShapeArray& output_shapes) const override;
+    void scn_do_execute() override;
+
+public:
+    MGE_WIN_DECLSPEC_FUC RegionRestrictedConvolutionBackwardFilter(
+            VarNode* src, VarNode* diff, VarNode* region_in, VarNode* region_out,
+            VarNode* filter, const Param& param, const OperatorNodeConfig& config);
+
+    MGE_WIN_DECLSPEC_FUC static SymbolVar make(
+            SymbolVar src, SymbolVar diff, SymbolVar region_in, SymbolVar region_out,
+            SymbolVar filter, const Param& param,
+            const OperatorNodeConfig& config = {});
+};
+
 MGB_DEFINE_OPR_CLASS_WITH_EXPORT(
         ConvolutionForward, intl::ConvolutionForwardBase,
         public mixin::AlgoChooserHelper) // {
diff --git a/src/opr/test/dnn/region_restricted_convolution.cpp b/src/opr/test/dnn/region_restricted_convolution.cpp
new file mode 100644
index 00000000..e7addc12
--- /dev/null
+++ b/src/opr/test/dnn/region_restricted_convolution.cpp
@@ -0,0 +1,196 @@
+#include "./legacy_checker.h"
+#include "megbrain/comp_node_env.h"
+
+#include "megbrain/gopt/inference.h"
+#include "megbrain/opr/basic_arith.h"
+#include "megbrain/opr/dnn/convolution.h"
+#include "megbrain/opr/tensor_manip.h"
+#include "megbrain/serialization/serializer.h"
+#include "megbrain/test/autocheck.h"
+#include "megbrain/test/helper.h"
+#include "megbrain/test/megdnn_helper.h"
+#include "megdnn/algorithm_cache.h"
+#include "megdnn/dtype.h"
+#include "megdnn/oprs/base.h"
+
+#include <gmock/gmock.h>
+
+#include <cmath>
+#include <memory>
+#include <random>
+
+using namespace mgb;
+
+TEST(TestOprDNN, REGIONCONV_FWD_CPU_WRAPPER) {
+    using Checker = AutoOprChecker<4, 1>;
+    megdnn::RegionRestrictedConvolution::Param param;
+    param.sparse = opr::RegionRestrictedConvolution::Param::Sparse::DENSE;
+
+    auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
+        return {opr::RegionRestrictedConvolutionForward::make(
+                inputs[0], inputs[1], inputs[2], inputs[3], param)};
+    };
+
+    Checker::RunOptions option;
+    option.numdiff_eps = 0.1;
+    option.numdiff_max_err = 1e-2;
+
+    auto mask_gen = [&](HostTensorND& src) {
+        HostTensorGenerator<dtype::Int32, RandomDistribution::CONSTANT> gen(1);
+        src = *gen(src.shape(), src.comp_node());
+    };
+    auto float_gen = [&](HostTensorND& src) {
+        HostTensorGenerator<dtype::Float32> gen;
+        src = *gen(src.shape(), src.comp_node());
+    };
+
+    auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
+        auto opr = megdnn_naive_handle()
+                           ->create_operator<megdnn::RegionRestrictedConvolution>();
+        opr->param() = param;
+        TensorLayout dest_layout;
+        opr->deduce_layout(
+                inp[0]->layout(), inp[1]->layout(), inp[2]->layout(), inp[3]->layout(),
+                dest_layout);
+        std::vector<dt_byte> workspace(opr->get_workspace_in_bytes(
+                inp[0]->layout(), inp[1]->layout(), inp[2]->layout(), inp[3]->layout(),
+                dest_layout));
+        dest[0].dtype(inp[0]->dtype())
+                .comp_node(inp[0]->comp_node())
+                .resize(dest_layout);
+        opr->exec(
+                inp[0]->as_megdnn(), inp[1]->as_megdnn(), inp[2]->as_megdnn(),
+                inp[3]->as_megdnn(), dest[0].as_megdnn(),
+                {workspace.data(), workspace.size()});
+    };
+
+    Checker(make_graph, fwd, CompNode::load("cpu0"))
+            .set_input_dtype(0, dtype::Float32())
+            .set_input_dtype(1, dtype::Float32())
+            .set_input_dtype(2, dtype::Int32())
+            .set_input_dtype(3, dtype::Int32())
+            .set_input_generator(0, float_gen)
+            .set_input_generator(1, float_gen)
+            .set_input_generator(2, mask_gen)
+            .set_input_generator(3, mask_gen)
+            .set_input_allow_grad(2, false)
+            .set_input_allow_grad(3, false)
+            // {n,ic,ih,iw}, {oc,ic,fh,fw}, {n,ih,iw}, {n,oh,ow}
+            .run({TensorShape{1, 2, 2, 2}, TensorShape{1, 2, 2, 2},
+                  TensorShape{1, 2, 2}, TensorShape{1, 1, 1}},
+                 option)
+            .run({TensorShape{1, 2, 3, 3}, TensorShape{1, 2, 3, 3},
+                  TensorShape{1, 3, 3}, TensorShape{1, 1, 1}},
+                 option)
+            .run({TensorShape{1, 1, 4, 4}, TensorShape{1, 1, 2, 2},
+                  TensorShape{1, 4, 4}, TensorShape{1, 3, 3}},
+                 option)
+            .run({TensorShape{2, 2, 8, 8}, TensorShape{4, 2, 2, 2},
+                  TensorShape{2, 8, 8}, TensorShape{2, 7, 7}},
+                 option)
+            .run({TensorShape{4, 4, 8, 8}, TensorShape{4, 4, 2, 2},
+                  TensorShape{4, 8, 8}, TensorShape{4, 7, 7}},
+                 option);
+}
+
+#if MGB_CUDA
+TEST(TestOprDNN, REGIONCONV_FWD_GPU_WRAPPER) {
+    using Checker = AutoOprChecker<4, 1>;
+    megdnn::RegionRestrictedConvolution::Param param;
+    param.sparse = opr::RegionRestrictedConvolution::Param::Sparse::GROUP;
+
+    auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
+        return {opr::RegionRestrictedConvolutionForward::make(
+                inputs[0], inputs[1], inputs[2], inputs[3], param)};
+    };
+
+    Checker::RunOptions option;
+    option.numdiff_eps = 0.1;
+    option.numdiff_max_err = 1e-2;
+
+    auto mask_gen = [&](HostTensorND& src) {
+        HostTensorGenerator<dtype::Int32, RandomDistribution::CONSTANT> gen(1);
+        src = *gen(src.shape(), src.comp_node());
+    };
+    auto uint8_mask_gen = [&](HostTensorND& src) {
+        HostTensorGenerator<dtype::Uint8, RandomDistribution::CONSTANT> gen(1);
+        src = *gen(src.shape(), src.comp_node());
+    };
+    auto float_gen = [&](HostTensorND& src) {
+        HostTensorGenerator<dtype::Float32> gen;
+        src = *gen(src.shape(), src.comp_node());
+    };
+
+    auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
+        auto opr = megdnn_naive_handle()
+                           ->create_operator<megdnn::RegionRestrictedConvolution>();
+        opr->param() = param;
+        TensorLayout dest_layout;
+        opr->deduce_layout(
+                inp[0]->layout(), inp[1]->layout(), inp[2]->layout(), inp[3]->layout(),
+                dest_layout);
+        std::vector<dt_byte> workspace(opr->get_workspace_in_bytes(
+                inp[0]->layout(), inp[1]->layout(), inp[2]->layout(), inp[3]->layout(),
+                dest_layout));
+        dest[0].dtype(inp[0]->dtype())
+                .comp_node(inp[0]->comp_node())
+                .resize(dest_layout);
+        opr->exec(
+                inp[0]->as_megdnn(), inp[1]->as_megdnn(), inp[2]->as_megdnn(),
+                inp[3]->as_megdnn(), dest[0].as_megdnn(),
+                {workspace.data(), workspace.size()});
+    };
+
+    Checker(make_graph, fwd, CompNode::load("gpu0"))
+            .set_input_dtype(0, dtype::Float32())
+            .set_input_dtype(1, dtype::Float32())
+            .set_input_dtype(2, dtype::Int32())
+            .set_input_dtype(3, dtype::Int32())
+            .set_input_generator(0, float_gen)
+            .set_input_generator(1, float_gen)
+            .set_input_generator(2, mask_gen)
+            .set_input_generator(3, mask_gen)
+            .set_input_allow_grad(2, false)
+            .set_input_allow_grad(3, false)
+            // {n,ic,ih,iw}, {g,ocpg,icpg,fh,fw}, {n,ih,iw}, {n,oh,ow}
+            .run({TensorShape{1, 2, 2, 2}, TensorShape{2, 1, 1, 2, 2},
+                  TensorShape{1, 2, 2}, TensorShape{1, 1, 1}},
+                 option)
+            .run({TensorShape{1, 2, 3, 3}, TensorShape{2, 1, 1, 3, 3},
+                  TensorShape{1, 3, 3}, TensorShape{1, 1, 1}},
+                 option)
+            .run({TensorShape{1, 4, 4, 4}, TensorShape{4, 1, 1, 2, 2},
+                  TensorShape{1, 4, 4}, TensorShape{1, 3, 3}},
+                 option)
+            .run({TensorShape{2, 4, 8, 8}, TensorShape{4, 1, 1, 2, 2},
+                  TensorShape{2, 8, 8}, TensorShape{2, 7, 7}},
+                 option);
+
+    Checker(make_graph, fwd, CompNode::load("gpu0"))
+            .set_input_dtype(0, dtype::Float32())
+            .set_input_dtype(1, dtype::Float32())
+            .set_input_dtype(2, dtype::Uint8())
+            .set_input_dtype(3, dtype::Uint8())
+            .set_input_generator(0, float_gen)
+            .set_input_generator(1, float_gen)
+            .set_input_generator(2, uint8_mask_gen)
+            .set_input_generator(3, uint8_mask_gen)
+            .set_input_allow_grad(2, false)
+            .set_input_allow_grad(3, false)
+            // {n,ic,ih,iw}, {g,ocpg,icpg,fh,fw}, {n,ih,iw}, {n,oh,ow}
+            .run({TensorShape{1, 2, 4, 4}, TensorShape{2, 1, 1, 1, 1},
+                  TensorShape{1, 4, 4}, TensorShape{1, 4, 4}},
+                 option)
+            .run({TensorShape{1, 2, 8, 8}, TensorShape{2, 1, 1, 1, 1},
+                  TensorShape{1, 8, 8}, TensorShape{1, 8, 8}},
+                 option)
+            .run({TensorShape{1, 4, 8, 8}, TensorShape{4, 1, 1, 5, 5},
+                  TensorShape{1, 8, 8}, TensorShape{1, 4, 4}},
+                 option)
+            .run({TensorShape{2, 4, 8, 8}, TensorShape{4, 1, 1, 1, 1},
+                  TensorShape{2, 8, 8}, TensorShape{2, 8, 8}},
+                 option);
+}
+#endif
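For reference, a condensed end-to-end sketch of the graph-level API these tests cover; shapes and generator choices follow the DENSE CPU case above, and all names are local to the sketch:

    auto graph = ComputingGraph::make();
    HostTensorGenerator<dtype::Float32> gen;
    HostTensorGenerator<dtype::Int32, RandomDistribution::CONSTANT> mask_gen(1);
    auto host_x = gen({2, 2, 8, 8}), host_w = gen({4, 2, 2, 2});
    auto host_rin = mask_gen({2, 8, 8}), host_rout = mask_gen({2, 7, 7});
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         w = opr::Host2DeviceCopy::make(*graph, host_w),
         rin = opr::Host2DeviceCopy::make(*graph, host_rin),
         rout = opr::Host2DeviceCopy::make(*graph, host_rout);
    megdnn::RegionRestrictedConvolution::Param param;  // DENSE, stride 1, pad 0
    auto y = opr::RegionRestrictedConvolution::make(x, w, rin, rout, param);
    HostTensorND host_y;
    auto func = graph->compile({make_callback_copy(y, host_y)});
    func->execute();  // host_y shape: {2, 4, 7, 7}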