perf(imperative): specialize adaptive pooling

GitOrigin-RevId: 01e1418458
3 years ago · c2435d1561
--- a/dnn/src/common/basic_types.cpp
+++ b/dnn/src/common/basic_types.cpp
@@ -191,7 +191,7 @@ bool TensorShape::is_empty() const {
            return true;
        }
    }
    return false;
    return ndim == 0;
 }

 /* ===================== TensorLayout =====================  */
--- a/imperative/python/megengine/functional/nn.py
+++ b/imperative/python/megengine/functional/nn.py
@@ -11,7 +11,12 @@ from functools import lru_cache
 from typing import NamedTuple, Optional, Sequence, Tuple, Union

 from ..core import _config
 from ..core._imperative_rt.core2 import Const, apply, dtype_promotion
 from ..core._imperative_rt.core2 import (
    Const,
    adaptive_pool2d_cpp,
    apply,
    dtype_promotion,
 )
 from ..core._imperative_rt.ops import SubgraphBuilder as _SubgraphBuilder
 from ..core._imperative_rt.ops import get_global_rng_seed as _get_global_rng_seed
 from ..core.ops import builtin
@@ -691,19 +696,12 @@ def adaptive_max_pool2d(

    Args:
        inp: input tensor.
        oshp: OH, OW)` size of the output shape.
        oshp: `(OH, OW)` size of the output shape.

    Returns:
        output tensor.
    """
    if isinstance(oshp, int):
        oshp = (oshp, oshp)
    conv_format = _config._get_actual_op_param("NCHW", _config.__conv_format)

    op = builtin.AdaptivePooling(mode="max", format=conv_format,)
    oshp = astensor1d(oshp, inp, dtype="int32", device=inp.device)
    (output,) = apply(op, inp, oshp)
    return output
    return adaptive_pool2d_cpp(inp, oshp, "MAX")


 def adaptive_avg_pool2d(
@@ -715,18 +713,12 @@ def adaptive_avg_pool2d(

    Args:
        inp: input tensor.
        oshp: OH, OW)` size of the output shape.
        oshp: `(OH, OW)` size of the output shape.

    Returns:
        output tensor.
    """
    if isinstance(oshp, int):
        oshp = (oshp, oshp)

    op = builtin.AdaptivePooling(mode="average", format="NCHW",)
    oshp = astensor1d(oshp, inp, dtype="int32", device=inp.device)
    (output,) = apply(op, inp, oshp)
    return output
    return adaptive_pool2d_cpp(inp, oshp, "AVERAGE")


 def deformable_psroi_pooling(
--- a/imperative/python/src/tensor.cpp
+++ b/imperative/python/src/tensor.cpp
@@ -430,6 +430,7 @@ WRAP_FUNC_PY35(squeeze_cpp);
 WRAP_FUNC_PY35(transpose_cpp);
 WRAP_FUNC_PY35(broadcast_cpp);
 WRAP_FUNC_PY35(reshape_cpp);
 WRAP_FUNC_PY35(adaptive_pool2d_cpp);
 WRAP_FUNC_PY35(Const);
 WRAP_FUNC_PY35(astype_cpp);
 WRAP_FUNC_PY35(convert_single_value_cpp);
@@ -584,6 +585,7 @@ void init_tensor(py::module m) {
            MGE_PY_INTERFACE(transpose_cpp, transpose_cpp),
            MGE_PY_INTERFACE(broadcast_cpp, broadcast_cpp),
            MGE_PY_INTERFACE(reshape_cpp, reshape_cpp),
            MGE_PY_INTERFACE(adaptive_pool2d_cpp, adaptive_pool2d_cpp),
            MGE_PY_INTERFACE(Const, Const),
            MGE_PY_INTERFACE(astype_cpp, astype_cpp),
            MGE_PY_INTERFACE(convert_single_value_cpp, convert_single_value_cpp),
@@ -991,8 +993,10 @@ void init_tensor(py::module m) {

    m.def("is_tracing_module", [=] { return get_module_trace()->enabled(); });

    m.def("set_module_trace_hook",
          [](py::function function) { module_trace_hook = function; });
    m.def("set_module_trace_hook", [](py::function function) {
        module_trace_hook = function;
        module_trace_hook.inc_ref();
    });

    m.def("begin_record_values", [] { Value::begin_record_values(); });

--- a/imperative/python/src/tensor_utils.cpp
+++ b/imperative/python/src/tensor_utils.cpp
@@ -948,6 +948,7 @@ std::tuple<std::vector<int32_t>, bool> tuple2vector(py::object shape) {
    py::tuple tup = py::reinterpret_borrow<py::tuple>(shape);
    for (size_t i = 0; i < tup.size(); ++i) {
        if (!PyLong_Check(tup[i].ptr())) {
            shp.clear();
            return {shp, false};
        } else {
            shp.push_back(tup[i].cast<int32_t>());
@@ -1108,6 +1109,52 @@ py::object _reshape_cpp(py::handle inp_hdl, py::handle args) {
    return ret[0];
 }

 // Builds an AdaptivePooling op for `inp_hdl` and dispatches it through py_apply.
 //
 // inp_hdl:        the input tensor object.
 // shape_val_hdl:  requested output size. A tuple is forwarded unchanged; any
 //                 other object is converted with PyLong_AsLong and expanded to
 //                 the pair (v, v).
 // pool_mode_hdl:  Python str. "AVERAGE" selects average pooling; every other
 //                 value keeps the default, max pooling.
 //
 // When tuple2vector reports that the shape is made of plain ints and
 // enable_fastpath(inp_hdl) holds, the shape travels inside the op itself and
 // only (op, inp) are passed to py_apply. Otherwise the shape is materialised
 // as an int32 1-d tensor on the input's device and passed as a third argument.
 //
 // Returns the first element of the py_apply result.
 // Throws py::error_already_set if the scalar shape cannot be converted to an
 // integer.
 py::object _adaptive_pool2d_cpp(
        py::handle inp_hdl, py::handle shape_val_hdl, py::handle pool_mode_hdl) {
    py::object shape_hdl = py::reinterpret_borrow<py::object>(shape_val_hdl);
    if (!PyTuple_Check(shape_val_hdl.ptr())) {
        const long extent = PyLong_AsLong(shape_val_hdl.ptr());
        // PyLong_AsLong signals failure with -1 plus a pending Python error;
        // surface it instead of silently pooling to a (-1, -1) shape with the
        // error indicator still set.
        if (extent == -1 && PyErr_Occurred()) {
            throw py::error_already_set();
        }
        py::list shps(0);
        shps.append(extent);
        shps.append(extent);
        shape_hdl = py::reinterpret_borrow<py::object>(shps);
    }

    py::object shape_tuple;
    try {
        shape_tuple = _make_shape_tuple(shape_hdl);
    } catch (const py::error_already_set&) {
        // Fall back to the raw shape object when it cannot be normalised.
        shape_tuple = py::reinterpret_borrow<py::object>(shape_hdl);
    }

    using PoolMode = ::megdnn::param::AdaptivePooling::Mode;
    const auto mode_string = pool_mode_hdl.cast<std::string>();
    const PoolMode pool_mode =
            mode_string == "AVERAGE" ? PoolMode::AVERAGE : PoolMode::MAX;

    auto [shape, fastpath] = tuple2vector(shape_tuple);
    fastpath &= enable_fastpath(inp_hdl);

    std::shared_ptr<OpDef> op = AdaptivePooling::make(
            pool_mode, ::megdnn::param::AdaptivePooling::Format::NCHW, shape);

    std::vector<PyObject*> p;
    py::object shape_tensor;  // must outlive py_apply: p[2] borrows its pointer
    if (fastpath) {
        p.resize(2);
    } else {
        p.resize(3);
        shape_tensor = _astensor1d_cpp(
                shape_hdl, py::cast((mgb::DType)dtype::Int32()),
                getattr(inp_hdl, "device"), inp_hdl);
        p[2] = shape_tensor.ptr();
    }
    py::object Op = py::cast(op);
    p[0] = Op.ptr();
    p[1] = inp_hdl.ptr();
    py::tuple ret =
            py::reinterpret_steal<py::object>(py_apply(nullptr, p.data(), p.size()));
    return ret[0];
 }

 py::object _getitem_cpp(py::handle inp_hdl, py::handle idx_hdl) {
    py::tuple try_res = _try_cond_take(inp_hdl, idx_hdl);
    if (try_res.size() == 2) {
@@ -1506,6 +1553,13 @@ PyObject* reshape_cpp(PyObject* self, PyObject* const* args, size_t nargs) {
    PYEXT17_TRANSLATE_EXC_RET(nullptr)
 }

 // Python-facing entry point: adaptive_pool2d_cpp(inp, oshp, mode).
 //
 // Forwards the first three positional arguments to _adaptive_pool2d_cpp and
 // returns a new reference to its result. C++ exceptions are translated by
 // PYEXT17_TRANSLATE_EXC_RET, which yields nullptr on failure.
 PyObject* adaptive_pool2d_cpp(PyObject* self, PyObject* const* args, size_t nargs) {
    // args[0..2] are read below; reject short calls instead of reading past
    // the end of the argument array.
    if (nargs < 3) {
        PyErr_SetString(
                PyExc_TypeError,
                "adaptive_pool2d_cpp expects 3 arguments: (inp, oshp, mode)");
        return nullptr;
    }
    try {
        return _adaptive_pool2d_cpp(args[0], args[1], args[2]).release().ptr();
    }
    PYEXT17_TRANSLATE_EXC_RET(nullptr)
 }

 PyObject* Const(PyObject* self, PyObject* const* args, size_t nargs) {
    try {
        return _Const(args[0], args[1], args[2], args[3]).release().ptr();
--- a/imperative/python/src/tensor_utils.h
+++ b/imperative/python/src/tensor_utils.h
@@ -24,6 +24,8 @@ PyObject* broadcast_cpp(PyObject* self, PyObject* const* args, size_t nargs);

 PyObject* reshape_cpp(PyObject* self, PyObject* const* args, size_t nargs);

 PyObject* adaptive_pool2d_cpp(PyObject* self, PyObject* const* args, size_t nargs);

 PyObject* Const(PyObject* self, PyObject* const* args, size_t nargs);

 PyObject* astype_cpp(PyObject* self, PyObject* const* args, size_t nargs);
--- a/imperative/src/impl/ops/adaptive_pooling.cpp
+++ b/imperative/src/impl/ops/adaptive_pooling.cpp
@@ -0,0 +1,129 @@
 #include "megbrain/opr/dnn/adaptive_pooling.h"
 #include "../algo_chooser.h"
 #include "../blob_manager_impl.h"
 #include "../dnn_op_helper.h"
 #include "../op_trait.h"
 #include "megbrain/imperative/ops/autogen.h"
 #include "megbrain/opr/io.h"

 namespace mgb::imperative {

 namespace {
 namespace adaptive_pooling {
 auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
    auto&& pool = static_cast<const AdaptivePooling&>(def);
    OperatorNodeConfig config{pool.make_name()};
    size_t nr_inp = inputs.size();
    if (nr_inp > 1) {
        return opr::AdaptivePooling::make(inputs[0], inputs[1], pool.param(), config);
    }

    HostTensorND hv = HostTensorND(inputs[0]->comp_node(), {2}, dtype::Int32());
    auto* ptr = hv.ptr<dt_int32>();
    ptr[0] = pool.shape[0];
    ptr[1] = pool.shape[1];
    auto graph = inputs[0]->owner_graph();
    auto target_shape = opr::ImmutableTensor::make(*graph, hv, config);
    return opr::AdaptivePooling::make(inputs[0], target_shape, pool.param(), config);
 }

 // Infers the output descriptor of AdaptivePooling without running the kernel.
 //
 // Returns {descs, true} with a contiguous 4-d layout when the output shape can
 // be determined, and {dtype-only placeholder, false} when the source layout is
 // empty or the value of the shape tensor is not available yet.
 //
 // The output keeps dims 0 and 1 of the source and takes dims 2 and 3 from the
 // target shape (pool.shape for one input, the int32 values of inputs[1] for
 // two).
 // NOTE(review): pool.format is not consulted here, so this presumably only
 // holds for NCHW-like layouts; confirm before enabling other formats.
 std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
        const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) {
    auto&& pool = static_cast<const AdaptivePooling&>(def);
    auto&& src = inputs[0];
    if (src.layout.is_empty()) {
        return {{{TensorLayout(src.layout.dtype), src.comp_node}}, false};
    }

    TensorLayout dst_layout(src.layout.dtype);
    dst_layout.ndim = 4u;
    dst_layout[0] = src.layout[0];
    dst_layout[1] = src.layout[1];
    if (inputs.size() == 1) {
        mgb_assert(
                pool.shape.size() >= 2,
                "AdaptivePooling needs a target shape with 2 elements when no "
                "shape tensor is given; got %zu",
                pool.shape.size());
        dst_layout[2] = pool.shape[0];
        dst_layout[3] = pool.shape[1];
    } else {
        auto&& tshp = inputs[1];
        if (tshp.value.empty()) {
            return {{{TensorLayout(src.layout.dtype), src.comp_node}}, false};
        }
        // ndim is a size_t, so it must be printed with %zu (not %lu).
        mgb_assert(
                tshp.layout.ndim == 1,
                "target shape of AdaptivePooling expects ndim=1; got ndim=%zu actually",
                tshp.layout.ndim);
        // ptr[0] and ptr[1] are read below.
        mgb_assert(
                tshp.layout[0] >= 2,
                "target shape of AdaptivePooling expects 2 elements; got %zu actually",
                tshp.layout[0]);
        auto* ptr = tshp.value.ptr<dt_int32>();
        dst_layout[2] = ptr[0];
        dst_layout[3] = ptr[1];
    }
    dst_layout.init_contiguous_stride();
    return {{{dst_layout, src.comp_node}}, true};
 }

 // Executes adaptive pooling eagerly by configuring a plain megdnn::Pooling
 // kernel whose stride and window are derived from the input and output
 // spatial extents.
 //
 // def:          the AdaptivePooling op (mode, format and, for one input, shape).
 // inputs:       inputs[0] is the source tensor; the optional inputs[1] holds
 //               the target shape.
 // output_descs: descriptors produced by shape inference; output_descs[0].layout
 //               is used as the destination layout when `validated` is true.
 // validated:    whether output_descs can be trusted. When false the
 //               destination layout is recomputed from the actual inputs.
 //
 // Returns a single tensor holding the pooled result.
 SmallVector<TensorPtr> apply_on_physical_tensor(
        const OpDef& def, const SmallVector<TensorPtr>& inputs,
        SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
    auto&& pool = static_cast<const AdaptivePooling&>(def);
    auto&& cn = inputs[0]->comp_node();

    auto&& src_layout = inputs[0]->layout();
    TensorLayout dst_layout = output_descs[0].layout;
    if (!validated) {
        // infer_output_attrs_fallible hands back a dtype-only layout when it
        // cannot infer the shape, so the rank has to be set here as well;
        // otherwise init_contiguous_stride() would run on a rank-0 layout.
        dst_layout.ndim = 4u;
        dst_layout[0] = src_layout[0];
        dst_layout[1] = src_layout[1];
        if (inputs.size() == 2) {
            TensorShape tshp;
            cg::copy_tensor_value_to_shape(
                    tshp, inputs[1]->get_value().proxy_to_default_cpu());
            mgb_assert(
                    tshp.ndim >= 2,
                    "target shape of AdaptivePooling expects 2 elements; got %zu "
                    "actually",
                    tshp.ndim);
            dst_layout[2] = tshp[0];
            dst_layout[3] = tshp[1];
        } else {
            mgb_assert(
                    pool.shape.size() >= 2,
                    "AdaptivePooling needs a target shape with 2 elements when no "
                    "shape tensor is given; got %zu",
                    pool.shape.size());
            dst_layout[2] = pool.shape[0];
            dst_layout[3] = pool.shape[1];
        }
        dst_layout.init_contiguous_stride();
    }

    const size_t IH = src_layout[2], IW = src_layout[3];
    const size_t OH = dst_layout[2], OW = dst_layout[3];
    // OH and OW are divisors below; fail with a message instead of trapping.
    mgb_assert(
            OH > 0 && OW > 0,
            "AdaptivePooling expects a positive output size; got (%zu, %zu)", OH, OW);

    DnnOprCaller<megdnn::Pooling> dnn_opr(cn);
    auto&& param = dnn_opr.op->param();
    param.mode = pool.mode;
    param.format = pool.format;
    param.pad_h = param.pad_w = 0;
    // Integer division already rounds down; the window then covers whatever
    // the (OH - 1) / (OW - 1) strides leave over.
    param.stride_h = IH / OH;
    param.stride_w = IW / OW;
    param.window_h = IH - (OH - 1) * param.stride_h;
    param.window_w = IW - (OW - 1) * param.stride_w;

    megdnn::TensorND src = inputs[0]->dnn_tensor();
    DeviceTensorND dst =
            BlobManager::inst()->alloc_workspace_with_defrag(cn, dst_layout);

    size_t sz = setup_algo<megdnn::Pooling>(
            {src_layout, dst_layout}, dnn_opr.op.get(), 0, false, false, cn,
            ::megdnn::param::ExecutionPolicy{}, false);

    // Allocate a kernel workspace only when the chosen algorithm asks for one.
    megdnn::Workspace dnn_wk;
    if (sz) {
        TensorLayout w_layout({sz}, dtype::Byte());
        dnn_wk = dnn_opr.create_workspace(w_layout);
    }
    dnn_opr.op->exec(src, dst.as_megdnn(), dnn_wk);

    return {Tensor::make(dst)};
 }

 // Registers the AdaptivePooling op trait with the three handlers defined in
 // this namespace (graph construction, fallible shape inference and eager
 // execution); .fallback() closes the registration chain.
 OP_TRAIT_REG(AdaptivePooling, AdaptivePooling)
        .apply_on_var_node(apply_on_var_node)
        .infer_output_attrs_fallible(infer_output_attrs_fallible)
        .apply_on_physical_tensor(apply_on_physical_tensor)
        .fallback();
 }  // namespace adaptive_pooling
 }  // namespace

 }  // namespace mgb::imperative
--- a/imperative/src/impl/ops/specializations.cpp
+++ b/imperative/src/impl/ops/specializations.cpp
@@ -294,20 +294,6 @@ OP_TRAIT_REG(TopK, TopK).apply_on_var_node(apply_on_var_node).fallback();
 }  // namespace

 namespace {
 namespace adaptive_pooling {
 auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
    auto&& pool = static_cast<const AdaptivePooling&>(def);
    OperatorNodeConfig config{pool.make_name()};
    return opr::AdaptivePooling::make(inputs[0], inputs[1], pool.param(), config);
 }

 OP_TRAIT_REG(AdaptivePooling, AdaptivePooling)
        .apply_on_var_node(apply_on_var_node)
        .fallback();
 }  // namespace adaptive_pooling
 }  // namespace

 namespace {
 namespace batch_conv_bias {
 auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
    auto&& conv = static_cast<const BatchConvBias&>(def);
--- a/src/core/include/megbrain/ir/ops.td
+++ b/src/core/include/megbrain/ir/ops.td
@@ -69,7 +69,11 @@ def GroupLocal: MgbHashableOp<"GroupLocal", [ConvolutionParam]>;

 def Pooling: MgbHashableOp<"Pooling", [PoolingParam, ExecutionPolicyParamBase<"policy">]>;

 def AdaptivePooling : MgbHashableOp<"AdaptivePooling", [AdaptivePoolingParam]>;
 def AdaptivePooling : MgbHashableOp<"AdaptivePooling", [AdaptivePoolingParam]> {
  let extraArguments = (ins
    MgbArrayAttr<MgbI32Attr>:$shape
  );
 }

 def ROIPooling: MgbHashableOp<"ROIPooling", [ROIPoolingParam]>;