
perf(imperative): enable memory forwarding for imperative

GitOrigin-RevId: 7c1993979c
tags/v1.9.0
Megvii Engine Team 3 years ago
commit e400b7ffe5
35 changed files with 1136 additions and 681 deletions
  1. dnn/include/megdnn/basic_types.h (+2, -1)
  2. dnn/src/common/basic_types.cpp (+1, -1)
  3. dnn/src/common/opr_delegate.cpp (+2, -1)
  4. imperative/python/src/tensor.cpp (+1, -575)
  5. imperative/python/src/tensor.h (+2, -0)
  6. imperative/python/src/tensor_utils.cpp (+630, -0)
  7. imperative/python/src/tensor_utils.h (+11, -0)
  8. imperative/src/impl/interpreter/interpreter_impl.cpp (+13, -3)
  9. imperative/src/impl/mgb_cg_impl.h (+1, -0)
  10. imperative/src/impl/op_def.cpp (+5, -0)
  11. imperative/src/impl/op_trait.cpp (+8, -0)
  12. imperative/src/impl/op_trait.h (+9, -0)
  13. imperative/src/impl/opr_utility.cpp (+2, -2)
  14. imperative/src/impl/ops/broadcast.cpp (+39, -27)
  15. imperative/src/impl/ops/elemwise.cpp (+15, -5)
  16. imperative/src/impl/ops/reduce.cpp (+12, -1)
  17. imperative/src/impl/ops/rng.cpp (+8, -0)
  18. imperative/src/impl/ops/specializations.cpp (+95, -2)
  19. imperative/src/impl/ops/utility.cpp (+1, -1)
  20. imperative/src/impl/physical_tensor.cpp (+78, -15)
  21. imperative/src/impl/proxy_graph.cpp (+2, -5)
  22. imperative/src/impl/proxy_graph/mini_graph.h (+15, -16)
  23. imperative/src/impl/proxy_graph/proxy_graph.cpp (+81, -0)
  24. imperative/src/impl/subgraph_detail.cpp (+15, -0)
  25. imperative/src/include/megbrain/imperative/op_def.h (+3, -0)
  26. imperative/src/include/megbrain/imperative/physical_tensor.h (+21, -8)
  27. imperative/src/include/megbrain/imperative/proxy_graph_detail.h (+3, -0)
  28. imperative/src/include/megbrain/imperative/subgraph_detail.h (+3, -0)
  29. src/core/impl/graph/cg_impl.cpp (+1, -1)
  30. src/core/impl/graph/cg_impl.h (+4, -3)
  31. src/core/impl/graph/var_node.cpp (+30, -1)
  32. src/core/impl/tensor.cpp (+1, -1)
  33. src/core/include/megbrain/graph/cg.h (+2, -1)
  34. src/core/include/megbrain/graph/var_node.h (+7, -0)
  35. src/opr/include/megbrain/opr/io.h (+13, -11)
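
In short, this commit makes view-producing ops (Broadcast, Reshape, Dimshuffle, AddAxis, and the memory-forward path of Reduce) return the input's storage re-interpreted through a new layout instead of copying, and adds a get_input_layout_constraint hook so the interpreter only materializes a contiguous copy when a kernel cannot consume the forwarded layout. A minimal sketch of the forwarding idea, using only calls that appear in the diffs below (the wrapper function itself is hypothetical):

// Zero-copy "memory forwarding": the output tensor aliases the input's blob
// at the same byte offset, but is viewed through a different TensorLayout.
TensorPtr forward_as_view(const TensorPtr& src, const TensorLayout& tlayout) {
    return Tensor::make(src->blob(), src->offset(), tlayout);
}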

dnn/include/megdnn/basic_types.h (+2, -1)

@@ -285,7 +285,8 @@ struct TensorLayout : public TensorShape {
     * stride
     */
    void add_axis_cont_inplace(size_t axis) {
        add_axis_inplace(axis, 1, stride[axis] * shape[axis]);
        ptrdiff_t stride_ = axis < ndim ? stride[axis] * shape[axis] : 1;
        add_axis_inplace(axis, 1, stride_);
    }

    /*!
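
The new guard matters when axis == ndim (appending a trailing axis): the old code read stride[axis] and shape[axis] past the initialized part of the layout and produced a garbage stride, while the new code falls back to stride 1. A hypothetical illustration, not part of the diff:

// 1-D float layout; appending axis 1 means axis == ndim
megdnn::TensorLayout layout({4}, megdnn::dtype::Float32());
layout.add_axis_cont_inplace(1);  // now well-defined: the appended axis gets stride 1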


dnn/src/common/basic_types.cpp (+1, -1)

@@ -382,7 +382,7 @@ bool TensorLayout::eq_layout(const TensorLayout& rhs) const {
    MEGDNN_STATIC_ASSERT(MAX_NDIM == 7, "please update the code");

    auto ax = [](size_t shape0, size_t shape1, ptrdiff_t stride0, ptrdiff_t stride1) {
        return (shape0 == shape1) & ((shape0 == 1) | (stride0 == stride1));
        return (shape0 == shape1) & ((shape0 <= 1) | (stride0 == stride1));
    };
    if (ndim == rhs.ndim) {
        size_t eq = 0;


dnn/src/common/opr_delegate.cpp (+2, -1)

@@ -13,7 +13,8 @@


using namespace megdnn;

const std::shared_ptr<Handle>& megdnn::inplace_cpu_handle(int debug_level) {
MGE_WIN_DECLSPEC_FUC const std::shared_ptr<Handle>& megdnn::inplace_cpu_handle(
        int debug_level) {
    auto make = [](int deb_level) {
        megcoreDeviceHandle_t dev_handle;
        megcoreCreateDeviceHandle(&dev_handle, megcorePlatformCPU);


imperative/python/src/tensor.cpp (+1, -575)

@@ -32,6 +32,7 @@
#include "./module_trace.h" #include "./module_trace.h"
#include "./numpy_dtypes.h" #include "./numpy_dtypes.h"
#include "./tensor.h" #include "./tensor.h"
#include "./tensor_utils.h"
#include "./transformation.h" #include "./transformation.h"


#include <object.h> #include <object.h>
@@ -549,557 +550,6 @@ CompNode _get_device(PyObject* const* args, size_t nargs) {
return cn; return cn;
} }


bool is_scalar(PyObject* tensor) {
if (py::isinstance<PySymbolVar>(py::handle(tensor))) {
auto var = py::handle(tensor).cast<PySymbolVar*>();
return var->is_scalar;
}
auto* tw = TensorWrapper::try_cast(tensor);
if (tw) {
return tw->m_tensor->is_scalar();
}
return PyArray_CheckAnyScalar(tensor);
}

bool is_bool_list(PyObject* arg) {
if (!PyList_Check(arg)) {
return false;
}
size_t sz = PyList_Size(arg);
if (!sz) {
return false;
}
for (size_t i = 0; i < sz; ++i) {
PyObject* handle = PyList_GetItem(arg, i);
if (!PyBool_Check(handle)) {
return false;
}
}
return true;
}

bool is_bool_dtype(PyObject* args) {
if (!PyObject_HasAttrString(args, "dtype"))
return false;
PyObject* dobj = PyObject_GetAttrString(args, "dtype");
PyArray_Descr* dtype;
PyArray_DescrConverter(dobj, &dtype);
bool ret = (dtype->kind == 'b');
Py_XDECREF(dtype);
Py_XDECREF(dobj);
return ret;
}

py::object _Const(
py::handle value, py::handle dtype, py::handle device, py::handle ref) {
py::object val = py::reinterpret_borrow<py::object>(value);
if (PyArray_Check(value.ptr())) {
py::tuple strides =
py::reinterpret_borrow<py::tuple>(getattr(value, "strides"));
bool need_squeeze = false;
for (size_t i = 0; i < strides.size(); ++i) {
if (strides[i].cast<ptrdiff_t>() == 0) {
need_squeeze = true;
}
}
if (need_squeeze) {
val = py::reinterpret_borrow<py::array>(value);
val = val.attr("squeeze")();
val = val.attr("reshape")(val.attr("shape"));
}
}
if (py::isinstance<PySymbolVar>(ref)) {
auto ref_var = ref.cast<PySymbolVar*>();
auto* graph = ref_var->m_node->owner_graph();
auto cn = device.cast<CompNode>();
OperatorNodeConfig config(cn);
auto hv = npy::np2tensor(
val.ptr(), npy::Meth::borrow(cn), dtype.cast<mgb::DType>());
auto typeobj = ref.get_type();
return typeobj(opr::ImmutableTensor::make(*graph, hv, config).node());
}
py::tuple tup = py::make_tuple(val, dtype, device, true, false, py::none());
return TensorWrapper::make(py_tensor_type, tup.ptr(), nullptr);
}

py::tuple _make_shape_tuple(py::handle shape) {
py::list orig;
py::list ret(0);
auto solve_one = [&](py::handle val) {
if (TensorWrapper::try_cast(val.ptr()) || py::isinstance<PySymbolVar>(val)) {
py::object np = getattr(val, "numpy")();
PyArrayObject* arr = (PyArrayObject*)np.ptr();
PyObject* maybe_list = PyArray_ToList(arr);
if (PyList_Check(maybe_list)) {
py::list may = py::reinterpret_steal<py::list>(maybe_list);
for (size_t i = 0; i < may.size(); ++i) {
ret.append(may[i]);
}
} else {
mgb_assert(PyLong_Check(maybe_list));
ret.append(PyLong_AsLong(maybe_list));
Py_XDECREF(maybe_list);
}
} else if (PyArray_Check(val.ptr())) {
ret.append(PyArray_PyIntAsInt(val.ptr()));
} else {
ret.append(PyLong_AsLong(val.ptr()));
}
};
if (PyArray_Check(shape.ptr()) && !PyArray_CheckAnyScalar(shape.ptr())) {
orig = py::reinterpret_steal<py::list>(
PyArray_ToList((PyArrayObject*)shape.ptr()));
for (size_t i = 0; i < orig.size(); ++i) {
solve_one(orig[i]);
}
} else if (PyList_Check(shape.ptr())) {
orig = py::reinterpret_borrow<py::list>(shape);
for (size_t i = 0; i < orig.size(); ++i) {
solve_one(orig[i]);
}
} else if (PyTuple_Check(shape.ptr())) {
py::tuple tup = py::reinterpret_borrow<py::tuple>(shape);
for (size_t i = 0; i < tup.size(); ++i) {
solve_one(tup[i]);
}
} else {
solve_one(shape);
}
return py::reinterpret_steal<py::tuple>(PyList_AsTuple(ret.ptr()));
}

py::object _get_index(py::object tensor, py::object src) {
if (!TensorWrapper::try_cast(tensor.ptr()) &&
!py::isinstance<PySymbolVar>(tensor)) {
auto get_const = [&](mgb::DType dtype) -> py::object {
return _Const(tensor, py::cast(dtype), src.attr("device"), src);
};
if (is_bool_list(tensor.ptr()) || is_bool_dtype(tensor.ptr())) {
tensor = get_const(dtype::Bool());
} else {
tensor = get_const(dtype::Int32());
}
if (!is_bool_dtype(tensor.ptr())) {
return tensor;
}
} else {
if (!is_bool_dtype(tensor.ptr())) {
return tensor;
}
}
static std::shared_ptr<OpDef> op = CondTake::make();
std::vector<PyObject*> p;
p.resize(3);
py::object Op = py::cast(op);
p[0] = Op.ptr();
p[1] = tensor.ptr();
p[2] = tensor.ptr();
py::tuple ret =
py::reinterpret_steal<py::object>(py_apply(NULL, p.data(), p.size()));
return ret[1];
}

py::tuple _try_cond_take(py::handle tensor, py::handle index) {
if (!hasattr(index, "dtype") || !hasattr(index, "shape")) {
return py::tuple();
}
if (!is_bool_dtype(index.ptr()) ||
_make_shape_tuple(getattr(index, "shape"))
.not_equal(_make_shape_tuple(getattr(tensor, "shape")))) {
return py::tuple();
}
py::object iobj;
if (PyArray_Check(index.ptr())) {
iobj =
_Const(index, py::cast((mgb::DType)dtype::Bool()),
getattr(tensor, "device"), tensor);
} else {
iobj = py::reinterpret_borrow<py::object>(index);
}
static std::shared_ptr<OpDef> op = CondTake::make();
std::vector<PyObject*> p;
p.resize(3);
py::object Op = py::cast(op);
p[0] = Op.ptr();
p[1] = tensor.ptr();
p[2] = iobj.ptr();
py::tuple ret =
py::reinterpret_steal<py::object>(py_apply(NULL, p.data(), p.size()));
return ret;
}

py::tuple _remove_ellipsis(py::object tensor, py::tuple tuple_val) {
size_t tuple_size = tuple_val.size();
size_t ndim_sum = 0, cur_sum = 0;
int pos = -1;
bool has_unknown_ndim_bool_index = false;
for (size_t i = 0; i < tuple_size; ++i) {
py::object handle = tuple_val[i];
if (handle.ptr() == Py_Ellipsis) {
pos = static_cast<int>(i);
for (size_t j = 0; j < i; ++j) {
py::object t = tuple_val[j];
if (t.ptr() == Py_Ellipsis) {
throw py::index_error("only one ellipsis is allowed.");
}
}
} else {
size_t ndim_incr = 1;
if (hasattr(handle, "dtype") && is_bool_dtype(handle.ptr()) &&
hasattr(handle, "ndim")) {
py::object ndim = getattr(handle, "ndim");
if (PyLong_Check(ndim.ptr())) {
ndim_incr = PyLong_AsLong(ndim.ptr());
} else {
has_unknown_ndim_bool_index = true;
}
}
cur_sum += ndim_incr;
}
}
if (pos == -1) {
return tuple_val;
} else {
if (has_unknown_ndim_bool_index) {
throw py::index_error(
"does not support bool index with unknown shape when using "
"Ellipsis.");
}
try {
ndim_sum = getattr(tensor, "ndim").cast<size_t>();
} catch (py::error_already_set& err) {
throw py::index_error(
"does not support Ellipsis when tensor's ndim is unknown.");
}
py::tuple ret(ndim_sum - cur_sum + tuple_size - 1);
size_t idx = 0;
for (size_t i = 0; i < tuple_size; ++i) {
if (i == pos) {
for (size_t j = cur_sum; j < ndim_sum; ++j) {
ret[idx++] = PySlice_New(NULL, NULL, NULL);
}
} else {
ret[idx++] = tuple_val[i];
}
}
return ret;
}
}

py::tuple _expand_bool_dim(py::object tensor, py::tuple tuple_val) {
py::tuple cur_shape = _make_shape_tuple(py::handle(getattr(tensor, "shape")));
py::list new_tuple_val(0);

size_t offset = 0;
size_t tdim = 0;
for (size_t i = 0; i < tuple_val.size(); ++i) {
py::handle k = tuple_val[i];
if (is_bool_dtype(k.ptr())) {
size_t ndim = getattr(k, "ndim").cast<size_t>();
if (ndim > 1) {
py::tuple ishape = _make_shape_tuple(py::handle(getattr(k, "shape")));
for (size_t j = 0; j < ndim; ++j) {
if (cur_shape[tdim + j - offset].cast<size_t>() !=
ishape[j].cast<size_t>()) {
std::string msg =
"boolean index did not match tensor along dimension " +
std::to_string(tdim + j) + "; dimension is " +
std::to_string(
cur_shape[tdim + j - offset].cast<size_t>()) +
" but corresponding boolean dimension is " +
std::to_string(ishape[j].cast<size_t>());
throw py::index_error(msg.c_str());
}
}
py::object new_k = getattr(k, "reshape")(-1);
py::object kshape = getattr(new_k, "shape");
py::list new_shape(0);
PyObject* sym = PyObject_CallObject(cpp_use_symbolic_shape, nullptr);
bool is_sym = (sym == Py_True);
Py_XDECREF(sym);
if (is_sym) {
py::object tshape = getattr(tensor, "shape");
for (size_t j = 0; j < i; ++j) {
new_shape.append(tshape[py::int_(j)]);
}
new_shape.append(kshape[py::int_(0)]);
for (size_t j = tdim + ndim - offset; j < cur_shape.size(); ++j) {
new_shape.append(cur_shape[j]);
}
py::tuple args = py::make_tuple(new_shape);
PyObject* shape_tensor =
PyObject_CallObject(cpp_astensor1d, args.ptr());
py::object reshape_func = getattr(tensor, "reshape");
Py_INCREF(shape_tensor);
PyObject* Args = PyTuple_New(1);
PyTuple_SetItem(Args, 0, shape_tensor);
PyObject* new_tensor =
PyObject_CallObject(reshape_func.ptr(), Args);
Py_XDECREF(Args);
tensor = py::reinterpret_steal<py::object>(new_tensor);
cur_shape = _make_shape_tuple(py::handle(shape_tensor));
Py_XDECREF(shape_tensor);
} else {
for (size_t j = 0; j < i; ++j) {
new_shape.append(cur_shape[j]);
}
new_shape.append(py::reinterpret_borrow<py::tuple>(kshape)[0]);
for (size_t j = tdim + ndim - offset; j < cur_shape.size(); ++j) {
new_shape.append(cur_shape[j]);
}
cur_shape = new_shape;
tensor = getattr(tensor, "reshape")(cur_shape);
}
offset++;
tdim += ndim;
}
new_tuple_val.append(k);
} else {
new_tuple_val.append(k);
tdim++;
}
}
return py::make_tuple(tensor, py::reinterpret_borrow<py::tuple>(new_tuple_val));
}

py::tuple _unpack_indexes(py::handle inp_hdl, py::handle idx_hdl) {
py::object inp = py::reinterpret_borrow<py::object>(inp_hdl);
py::tuple tuple_val;
if (py::isinstance<py::tuple>(idx_hdl)) {
tuple_val = py::reinterpret_borrow<py::tuple>(idx_hdl);
} else {
tuple_val = py::make_tuple(idx_hdl);
}

bool use_subtensor = true;
bool need_remove_ellipsis = false;
bool need_expand_bool_dim = false;
size_t idx_ndim = 0;
for (size_t i = 0; i < tuple_val.size(); ++i) {
py::object k = tuple_val[i];
if (k.ptr() == Py_None) {
throw py::index_error("newaxis is not allowed here");
} else if (k.ptr() == Py_Ellipsis) {
need_remove_ellipsis = true;
} else {
if (is_bool_dtype(k.ptr()) && hasattr(k, "ndim")) {
size_t ndim = getattr(k, "ndim").cast<size_t>();
idx_ndim += ndim;
if (ndim > 1) {
need_expand_bool_dim = true;
}
} else {
idx_ndim++;
}
}
}
try {
size_t inp_ndim = getattr(inp, "ndim").cast<size_t>();
if (idx_ndim > inp_ndim) {
std::string msg = "too many indices for tensor: tensor is " +
std::to_string(inp_ndim) + "-dimensional, but " +
std::to_string(idx_ndim) + " were indexed";
throw py::index_error(msg.c_str());
}
} catch (py::error_already_set& err) {
; // ignore
}
if (need_remove_ellipsis) {
tuple_val = _remove_ellipsis(inp, tuple_val);
}

if (need_expand_bool_dim) {
py::object shape = getattr(inp, "shape");
if (shape.ptr() != Py_None) {
py::tuple ret = _expand_bool_dim(inp, tuple_val);
inp = ret[0];
tuple_val = ret[1];
}
}

py::list items;
py::list tensors;
int cur_axis = -1;

for (size_t i = 0; i < tuple_val.size(); ++i) {
py::object handle = tuple_val[i];
cur_axis++;
if (!is_scalar(handle.ptr()) && !PySlice_Check(handle.ptr())) {
use_subtensor = false;
}
py::list item;
item.append(cur_axis);
auto push = [&](PyObject* v) {
if (v == Py_None) {
item.append(false);
} else {
item.append(true);
tensors.append(_get_index(py::reinterpret_borrow<py::object>(v), inp));
}
};

if (PySlice_Check(handle.ptr())) {
PySliceObject* s = (PySliceObject*)handle.ptr();
if (s->start == Py_None && s->stop == Py_None && s->step == Py_None) {
continue;
}
push(s->start);
push(s->stop);
push(s->step);
item.append(false);
} else {
for (size_t j = 0; j < 3; j++)
item.append(false);
push(handle.ptr());
}
items.append(item);
}

return py::make_tuple(inp, tensors, items, use_subtensor, need_expand_bool_dim);
}

py::object _getitem_cpp(py::handle inp_hdl, py::handle idx_hdl) {
py::tuple try_res = _try_cond_take(inp_hdl, idx_hdl);
if (try_res.size() == 2) {
return try_res[0];
}
py::tuple up = _unpack_indexes(inp_hdl, idx_hdl);
py::object tensor = py::reinterpret_borrow<py::object>(up[0]);
py::list tensors = py::reinterpret_borrow<py::list>(up[1]);
py::list py_items = py::reinterpret_borrow<py::list>(up[2]);
std::vector<std::tuple<int8_t, bool, bool, bool, bool>> cpp_items;
for (size_t i = 0; i < py_items.size(); ++i) {
py::list item = py::reinterpret_borrow<py::list>(py_items[i]);
cpp_items.push_back(
{item[0].cast<int8_t>(), item[1].cast<bool>(), item[2].cast<bool>(),
item[3].cast<bool>(), item[4].cast<bool>()});
}
static std::shared_ptr<OpDef> op;
if (up[3].cast<bool>()) {
op = Subtensor::make(cpp_items);
} else {
op = IndexingMultiAxisVec::make(cpp_items);
}
std::vector<PyObject*> p;
p.resize(tensors.size() + 2);
py::object Op = py::cast(op);
p[0] = Op.ptr();
p[1] = tensor.ptr();
for (size_t i = 0; i < tensors.size(); ++i) {
p[i + 2] = tensors[i].ptr();
}
py::tuple ret =
py::reinterpret_steal<py::object>(py_apply(NULL, p.data(), p.size()));
return ret[0];
}

py::object _setitem_cpp(py::handle inp_hdl, py::handle idx_hdl, py::handle val_hdl) {
py::object org_shape = getattr(inp_hdl, "shape");
py::object val = py::reinterpret_borrow<py::object>(val_hdl);
if (!TensorWrapper::try_cast(val.ptr()) && !py::isinstance<PySymbolVar>(val)) {
val =
_Const(val_hdl, getattr(inp_hdl, "dtype"), getattr(inp_hdl, "device"),
inp_hdl);
}

py::tuple up = _unpack_indexes(inp_hdl, idx_hdl);
py::object tensor = py::reinterpret_borrow<py::object>(up[0]);
py::list tensors = py::reinterpret_borrow<py::list>(up[1]);
py::list py_items = py::reinterpret_borrow<py::list>(up[2]);
std::vector<std::tuple<int8_t, bool, bool, bool, bool>> cpp_items;
for (size_t i = 0; i < py_items.size(); ++i) {
py::list item = py::reinterpret_borrow<py::list>(py_items[i]);
cpp_items.push_back(
{item[0].cast<int8_t>(), item[1].cast<bool>(), item[2].cast<bool>(),
item[3].cast<bool>(), item[4].cast<bool>()});
}
static std::shared_ptr<OpDef> op, set_op;
if (up[3].cast<bool>()) {
op = Subtensor::make(cpp_items);
} else {
op = IndexingMultiAxisVec::make(cpp_items);
}
std::vector<PyObject*> p;
p.resize(tensors.size() + 2);
py::object Op = py::cast(op);
p[0] = Op.ptr();
p[1] = tensor.ptr();
for (size_t i = 0; i < tensors.size(); ++i) {
p[i + 2] = tensors[i].ptr();
}
py::tuple ret =
py::reinterpret_steal<py::object>(py_apply(NULL, p.data(), p.size()));
py::object tmp_result = ret[0];

try {
py::object value_tuple_shape = val.attr("_tuple_shape");
py::object tmp_result_tuple_shape = tmp_result.attr("_tuple_shape");
py::tuple value_shape = py::reinterpret_borrow<py::tuple>(value_tuple_shape);
py::tuple tmp_result_shape =
py::reinterpret_borrow<py::tuple>(tmp_result_tuple_shape);
for (size_t i = 0; i < value_shape.size() && i < tmp_result_shape.size(); ++i) {
size_t vs = value_shape[value_shape.size() - i - 1].cast<size_t>();
size_t ts =
tmp_result_shape[tmp_result_shape.size() - i - 1].cast<size_t>();
if (vs != 1 && vs != ts) {
std::string lhs = "", rhs = "";
for (size_t j = 0; j < tmp_result_shape.size(); ++j) {
lhs += std::to_string(tmp_result_shape[j].cast<size_t>());
if (j)
lhs += ",";
}
for (size_t j = 0; j < value_shape.size(); ++j) {
rhs += std::to_string(value_shape[j].cast<size_t>());
if (j)
rhs += ",";
}
throw py::value_error(
"cannot copy tensor with shape (" + rhs +
") to subtensor with shape (" + lhs + ")");
}
}
} catch (py::error_already_set& err) {
;
}

py::object broadcast_func = getattr(val, "_broadcast");
PyObject* Args = PyTuple_New(1);
PyTuple_SetItem(Args, 0, getattr(tmp_result, "shape").release().ptr());
PyObject* new_val = PyObject_CallObject(broadcast_func.ptr(), Args);
Py_XDECREF(Args);
val = py::reinterpret_steal<py::object>(new_val);

if (up[3].cast<bool>()) {
set_op = SetSubtensor::make(cpp_items);
} else {
set_op = IndexingSetMultiAxisVec::make(cpp_items);
}

std::vector<PyObject*> q;
q.resize(tensors.size() + 3);
py::object Set_Op = py::cast(set_op);
q[0] = Set_Op.ptr();
q[1] = tensor.ptr();
q[2] = val.ptr();
for (size_t i = 0; i < tensors.size(); ++i) {
q[i + 3] = tensors[i].ptr();
}
py::tuple result =
py::reinterpret_steal<py::object>(py_apply(NULL, q.data(), q.size()));
py::object res = result[0];

if (up[4].cast<bool>()) {
py::object reshape_func = getattr(res, "reshape");
PyObject* Args = PyTuple_New(1);
PyTuple_SetItem(Args, 0, org_shape.release().ptr());
PyObject* new_tensor = PyObject_CallObject(reshape_func.ptr(), Args);
Py_XDECREF(Args);
res = py::reinterpret_steal<py::object>(new_tensor);
}

return res;
}

// Returns the dtype that would result from performing an arithmetic
// operation on the provided input tensors and scalars.
PyObject* dtype_promotion(PyObject* self, PyObject* const* args, size_t nargs) {
@@ -1126,30 +576,6 @@ PyObject* get_device(PyObject* self, PyObject* const* args, size_t nargs) {
    PYEXT17_TRANSLATE_EXC_RET(nullptr)
}


PyObject* make_shape_tuple(PyObject* self, PyObject* const* args, size_t nargs) {
try {
return _make_shape_tuple(py::handle(args[0])).release().ptr();
}
PYEXT17_TRANSLATE_EXC_RET(nullptr)
}

PyObject* getitem_cpp(PyObject* self, PyObject* const* args, size_t nargs) {
try {
return _getitem_cpp(py::handle(args[0]), py::handle(args[1])).release().ptr();
}
PYEXT17_TRANSLATE_EXC_RET(nullptr)
}

PyObject* setitem_cpp(PyObject* self, PyObject* const* args, size_t nargs) {
try {
return _setitem_cpp(
py::handle(args[0]), py::handle(args[1]), py::handle(args[2]))
.release()
.ptr();
}
PYEXT17_TRANSLATE_EXC_RET(nullptr)
}

#ifdef METH_FASTCALL
#define MGE_PY_INTERFACE(NAME, FUNC) \
    { #NAME, (PyCFunction)FUNC, METH_FASTCALL, nullptr }


imperative/python/src/tensor.h (+2, -0)

@@ -38,6 +38,8 @@ namespace mgb::imperative::python {


extern interpreter::Interpreter::Channel* interpreter_for_py;
extern PyTypeObject* py_tensor_type;
extern PyObject* cpp_use_symbolic_shape;
extern PyObject* cpp_astensor1d;

struct Tensor : NonCopyableObj {
private:


imperative/python/src/tensor_utils.cpp (+630, -0)

@@ -0,0 +1,630 @@
/**
* \file imperative/python/src/tensor.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/

#include "megbrain/common.h"
#include "megbrain/dtype.h"
#include "megbrain/imperative/ops/autogen.h"
#include "megbrain/imperative/ops/backward_graph.h"
#include "megbrain/imperative/ops/utility.h"
#include "megbrain/imperative/profiler.h"
#include "megbrain/imperative/transformations/eval.h"
#include "megbrain/imperative/transformations/lazy.h"
#include "megbrain/imperative/transformations/scalar.h"
#include "megbrain/imperative/transformations/symbol.h"
#include "megbrain/imperative/transformations/trace.h"
#include "megbrain/imperative/utils/map.h"
#include "megbrain/imperative/utils/stats.h"
#include "megbrain/opr/io.h"
#include "megbrain/plugin/profiler.h"

#include "./common.h"
#include "./grad.h"
#include "./graph_rt.h"
#include "./helper.h"
#include "./module_trace.h"
#include "./numpy_dtypes.h"
#include "./tensor.h"
#include "./tensor_utils.h"
#include "./transformation.h"

#include <object.h>
#include <pybind11/numpy.h>
#include <pybind11/operators.h>
#include <pybind11/pytypes.h>
#include <pyerrors.h>
#include <range/v3/all.hpp>
#include <string>

#include <unordered_map>

#include "../../src/impl/mgb_cg_impl.h"

namespace py = pybind11;
namespace views = ranges::views;

namespace mgb::imperative::python {

bool is_scalar(PyObject* tensor) {
if (py::isinstance<PySymbolVar>(py::handle(tensor))) {
auto var = py::handle(tensor).cast<PySymbolVar*>();
return var->is_scalar;
}
auto* tw = TensorWrapper::try_cast(tensor);
if (tw) {
return tw->m_tensor->is_scalar();
}
return PyArray_CheckAnyScalar(tensor);
}

bool is_bool_list(PyObject* arg) {
if (!PyList_Check(arg)) {
return false;
}
size_t sz = PyList_Size(arg);
if (!sz) {
return false;
}
for (size_t i = 0; i < sz; ++i) {
PyObject* handle = PyList_GetItem(arg, i);
if (!PyBool_Check(handle)) {
return false;
}
}
return true;
}

bool is_bool_dtype(PyObject* args) {
if (!PyObject_HasAttrString(args, "dtype"))
return false;
PyObject* dobj = PyObject_GetAttrString(args, "dtype");
PyArray_Descr* dtype;
PyArray_DescrConverter(dobj, &dtype);
bool ret = (dtype->kind == 'b');
Py_XDECREF(dtype);
Py_XDECREF(dobj);
return ret;
}

py::object _Const(
py::handle value, py::handle dtype, py::handle device, py::handle ref) {
py::object val = py::reinterpret_borrow<py::object>(value);
if (PyArray_Check(value.ptr())) {
py::tuple strides =
py::reinterpret_borrow<py::tuple>(getattr(value, "strides"));
bool need_squeeze = false;
for (size_t i = 0; i < strides.size(); ++i) {
if (strides[i].cast<ptrdiff_t>() == 0) {
need_squeeze = true;
}
}
if (need_squeeze) {
val = py::reinterpret_borrow<py::array>(value);
val = val.attr("squeeze")();
val = val.attr("reshape")(val.attr("shape"));
}
}
if (py::isinstance<PySymbolVar>(ref)) {
auto ref_var = ref.cast<PySymbolVar*>();
auto* graph = ref_var->m_node->owner_graph();
auto cn = device.cast<CompNode>();
OperatorNodeConfig config(cn);
auto hv = npy::np2tensor(
val.ptr(), npy::Meth::borrow(cn), dtype.cast<mgb::DType>());
auto typeobj = ref.get_type();
return typeobj(opr::ImmutableTensor::make(*graph, hv, config).node());
}
py::tuple tup = py::make_tuple(val, dtype, device, true, false, py::none());
return TensorWrapper::make(py_tensor_type, tup.ptr(), nullptr);
}

py::tuple _make_shape_tuple(py::handle shape) {
py::list orig;
py::list ret(0);
auto solve_one = [&](py::handle val) {
if (TensorWrapper::try_cast(val.ptr()) || py::isinstance<PySymbolVar>(val)) {
py::object np = getattr(val, "numpy")();
PyArrayObject* arr = (PyArrayObject*)np.ptr();
PyObject* maybe_list = PyArray_ToList(arr);
if (PyList_Check(maybe_list)) {
py::list may = py::reinterpret_steal<py::list>(maybe_list);
for (size_t i = 0; i < may.size(); ++i) {
ret.append(may[i]);
}
} else {
mgb_assert(PyLong_Check(maybe_list));
ret.append(PyLong_AsLong(maybe_list));
Py_XDECREF(maybe_list);
}
} else if (PyArray_Check(val.ptr())) {
ret.append(PyArray_PyIntAsInt(val.ptr()));
} else {
ret.append(PyLong_AsLong(val.ptr()));
}
};
if (PyArray_Check(shape.ptr()) && !PyArray_CheckAnyScalar(shape.ptr())) {
orig = py::reinterpret_steal<py::list>(
PyArray_ToList((PyArrayObject*)shape.ptr()));
for (size_t i = 0; i < orig.size(); ++i) {
solve_one(orig[i]);
}
} else if (PyList_Check(shape.ptr())) {
orig = py::reinterpret_borrow<py::list>(shape);
for (size_t i = 0; i < orig.size(); ++i) {
solve_one(orig[i]);
}
} else if (PyTuple_Check(shape.ptr())) {
py::tuple tup = py::reinterpret_borrow<py::tuple>(shape);
for (size_t i = 0; i < tup.size(); ++i) {
solve_one(tup[i]);
}
} else {
solve_one(shape);
}
return py::reinterpret_steal<py::tuple>(PyList_AsTuple(ret.ptr()));
}

py::object _get_index(py::object tensor, py::object src) {
if (!TensorWrapper::try_cast(tensor.ptr()) &&
!py::isinstance<PySymbolVar>(tensor)) {
auto get_const = [&](mgb::DType dtype) -> py::object {
return _Const(tensor, py::cast(dtype), src.attr("device"), src);
};
if (is_bool_list(tensor.ptr()) || is_bool_dtype(tensor.ptr())) {
tensor = get_const(dtype::Bool());
} else {
tensor = get_const(dtype::Int32());
}
if (!is_bool_dtype(tensor.ptr())) {
return tensor;
}
} else {
if (!is_bool_dtype(tensor.ptr())) {
return tensor;
}
}
static std::shared_ptr<OpDef> op = CondTake::make();
std::vector<PyObject*> p;
p.resize(3);
py::object Op = py::cast(op);
p[0] = Op.ptr();
p[1] = tensor.ptr();
p[2] = tensor.ptr();
py::tuple ret =
py::reinterpret_steal<py::object>(py_apply(NULL, p.data(), p.size()));
return ret[1];
}

py::tuple _try_cond_take(py::handle tensor, py::handle index) {
if (!hasattr(index, "dtype") || !hasattr(index, "shape")) {
return py::tuple();
}
if (!is_bool_dtype(index.ptr()) ||
_make_shape_tuple(getattr(index, "shape"))
.not_equal(_make_shape_tuple(getattr(tensor, "shape")))) {
return py::tuple();
}
py::object iobj;
if (PyArray_Check(index.ptr())) {
iobj =
_Const(index, py::cast((mgb::DType)dtype::Bool()),
getattr(tensor, "device"), tensor);
} else {
iobj = py::reinterpret_borrow<py::object>(index);
}
static std::shared_ptr<OpDef> op = CondTake::make();
std::vector<PyObject*> p;
p.resize(3);
py::object Op = py::cast(op);
p[0] = Op.ptr();
p[1] = tensor.ptr();
p[2] = iobj.ptr();
py::tuple ret =
py::reinterpret_steal<py::object>(py_apply(NULL, p.data(), p.size()));
return ret;
}

py::tuple _remove_ellipsis(py::object tensor, py::tuple tuple_val) {
size_t tuple_size = tuple_val.size();
size_t ndim_sum = 0, cur_sum = 0;
int pos = -1;
bool has_unknown_ndim_bool_index = false;
for (size_t i = 0; i < tuple_size; ++i) {
py::object handle = tuple_val[i];
if (handle.ptr() == Py_Ellipsis) {
pos = static_cast<int>(i);
for (size_t j = 0; j < i; ++j) {
py::object t = tuple_val[j];
if (t.ptr() == Py_Ellipsis) {
throw py::index_error("only one ellipsis is allowed.");
}
}
} else {
size_t ndim_incr = 1;
if (hasattr(handle, "dtype") && is_bool_dtype(handle.ptr()) &&
hasattr(handle, "ndim")) {
py::object ndim = getattr(handle, "ndim");
if (PyLong_Check(ndim.ptr())) {
ndim_incr = PyLong_AsLong(ndim.ptr());
} else {
has_unknown_ndim_bool_index = true;
}
}
cur_sum += ndim_incr;
}
}
if (pos == -1) {
return tuple_val;
} else {
if (has_unknown_ndim_bool_index) {
throw py::index_error(
"does not support bool index with unknown shape when using "
"Ellipsis.");
}
try {
ndim_sum = getattr(tensor, "ndim").cast<size_t>();
} catch (py::error_already_set& err) {
throw py::index_error(
"does not support Ellipsis when tensor's ndim is unknown.");
}
py::tuple ret(ndim_sum - cur_sum + tuple_size - 1);
size_t idx = 0;
for (size_t i = 0; i < tuple_size; ++i) {
if (i == pos) {
for (size_t j = cur_sum; j < ndim_sum; ++j) {
ret[idx++] = PySlice_New(NULL, NULL, NULL);
}
} else {
ret[idx++] = tuple_val[i];
}
}
return ret;
}
}

py::tuple _expand_bool_dim(py::object tensor, py::tuple tuple_val) {
py::tuple cur_shape = _make_shape_tuple(py::handle(getattr(tensor, "shape")));
py::list new_tuple_val(0);

size_t offset = 0;
size_t tdim = 0;
for (size_t i = 0; i < tuple_val.size(); ++i) {
py::handle k = tuple_val[i];
if (is_bool_dtype(k.ptr())) {
size_t ndim = getattr(k, "ndim").cast<size_t>();
if (ndim > 1) {
py::tuple ishape = _make_shape_tuple(py::handle(getattr(k, "shape")));
for (size_t j = 0; j < ndim; ++j) {
if (cur_shape[tdim + j - offset].cast<size_t>() !=
ishape[j].cast<size_t>()) {
std::string msg =
"boolean index did not match tensor along dimension " +
std::to_string(tdim + j) + "; dimension is " +
std::to_string(
cur_shape[tdim + j - offset].cast<size_t>()) +
" but corresponding boolean dimension is " +
std::to_string(ishape[j].cast<size_t>());
throw py::index_error(msg.c_str());
}
}
py::object new_k = getattr(k, "reshape")(-1);
py::object kshape = getattr(new_k, "shape");
py::list new_shape(0);
PyObject* sym = PyObject_CallObject(cpp_use_symbolic_shape, nullptr);
bool is_sym = (sym == Py_True);
Py_XDECREF(sym);
if (is_sym) {
py::object tshape = getattr(tensor, "shape");
for (size_t j = 0; j < i; ++j) {
new_shape.append(tshape[py::int_(j)]);
}
new_shape.append(kshape[py::int_(0)]);
for (size_t j = tdim + ndim - offset; j < cur_shape.size(); ++j) {
new_shape.append(cur_shape[j]);
}
py::tuple args = py::make_tuple(new_shape);
PyObject* shape_tensor =
PyObject_CallObject(cpp_astensor1d, args.ptr());
py::object reshape_func = getattr(tensor, "reshape");
Py_INCREF(shape_tensor);
PyObject* Args = PyTuple_New(1);
PyTuple_SetItem(Args, 0, shape_tensor);
PyObject* new_tensor =
PyObject_CallObject(reshape_func.ptr(), Args);
Py_XDECREF(Args);
tensor = py::reinterpret_steal<py::object>(new_tensor);
cur_shape = _make_shape_tuple(py::handle(shape_tensor));
Py_XDECREF(shape_tensor);
} else {
for (size_t j = 0; j < i; ++j) {
new_shape.append(cur_shape[j]);
}
new_shape.append(py::reinterpret_borrow<py::tuple>(kshape)[0]);
for (size_t j = tdim + ndim - offset; j < cur_shape.size(); ++j) {
new_shape.append(cur_shape[j]);
}
cur_shape = new_shape;
tensor = getattr(tensor, "reshape")(cur_shape);
}
offset++;
tdim += ndim;
}
new_tuple_val.append(k);
} else {
new_tuple_val.append(k);
tdim++;
}
}
return py::make_tuple(tensor, py::reinterpret_borrow<py::tuple>(new_tuple_val));
}

py::tuple _unpack_indexes(py::handle inp_hdl, py::handle idx_hdl) {
py::object inp = py::reinterpret_borrow<py::object>(inp_hdl);
py::tuple tuple_val;
if (py::isinstance<py::tuple>(idx_hdl)) {
tuple_val = py::reinterpret_borrow<py::tuple>(idx_hdl);
} else {
tuple_val = py::make_tuple(idx_hdl);
}

bool use_subtensor = true;
bool need_remove_ellipsis = false;
bool need_expand_bool_dim = false;
size_t idx_ndim = 0;
for (size_t i = 0; i < tuple_val.size(); ++i) {
py::object k = tuple_val[i];
if (k.ptr() == Py_None) {
throw py::index_error("newaxis is not allowed here");
} else if (k.ptr() == Py_Ellipsis) {
need_remove_ellipsis = true;
} else {
if (is_bool_dtype(k.ptr()) && hasattr(k, "ndim")) {
size_t ndim = getattr(k, "ndim").cast<size_t>();
idx_ndim += ndim;
if (ndim > 1) {
need_expand_bool_dim = true;
}
} else {
idx_ndim++;
}
}
}
try {
size_t inp_ndim = getattr(inp, "ndim").cast<size_t>();
if (idx_ndim > inp_ndim) {
std::string msg = "too many indices for tensor: tensor is " +
std::to_string(inp_ndim) + "-dimensional, but " +
std::to_string(idx_ndim) + " were indexed";
throw py::index_error(msg.c_str());
}
} catch (py::error_already_set& err) {
; // ignore
}
if (need_remove_ellipsis) {
tuple_val = _remove_ellipsis(inp, tuple_val);
}

if (need_expand_bool_dim) {
py::object shape = getattr(inp, "shape");
if (shape.ptr() != Py_None) {
py::tuple ret = _expand_bool_dim(inp, tuple_val);
inp = ret[0];
tuple_val = ret[1];
}
}

py::list items;
py::list tensors;
int cur_axis = -1;

for (size_t i = 0; i < tuple_val.size(); ++i) {
py::object handle = tuple_val[i];
cur_axis++;
if (!is_scalar(handle.ptr()) && !PySlice_Check(handle.ptr())) {
use_subtensor = false;
}
py::list item;
item.append(cur_axis);
auto push = [&](PyObject* v) {
if (v == Py_None) {
item.append(false);
} else {
item.append(true);
tensors.append(_get_index(py::reinterpret_borrow<py::object>(v), inp));
}
};

if (PySlice_Check(handle.ptr())) {
PySliceObject* s = (PySliceObject*)handle.ptr();
if (s->start == Py_None && s->stop == Py_None && s->step == Py_None) {
continue;
}
push(s->start);
push(s->stop);
push(s->step);
item.append(false);
} else {
for (size_t j = 0; j < 3; j++)
item.append(false);
push(handle.ptr());
}
items.append(item);
}

return py::make_tuple(inp, tensors, items, use_subtensor, need_expand_bool_dim);
}

py::object _getitem_cpp(py::handle inp_hdl, py::handle idx_hdl) {
py::tuple try_res = _try_cond_take(inp_hdl, idx_hdl);
if (try_res.size() == 2) {
return try_res[0];
}
py::tuple up = _unpack_indexes(inp_hdl, idx_hdl);
py::object tensor = py::reinterpret_borrow<py::object>(up[0]);
py::list tensors = py::reinterpret_borrow<py::list>(up[1]);
py::list py_items = py::reinterpret_borrow<py::list>(up[2]);
std::vector<std::tuple<int8_t, bool, bool, bool, bool>> cpp_items;
for (size_t i = 0; i < py_items.size(); ++i) {
py::list item = py::reinterpret_borrow<py::list>(py_items[i]);
cpp_items.push_back(
{item[0].cast<int8_t>(), item[1].cast<bool>(), item[2].cast<bool>(),
item[3].cast<bool>(), item[4].cast<bool>()});
}
static std::shared_ptr<OpDef> op;
if (up[3].cast<bool>()) {
op = Subtensor::make(cpp_items);
} else {
op = IndexingMultiAxisVec::make(cpp_items);
}
std::vector<PyObject*> p;
p.resize(tensors.size() + 2);
py::object Op = py::cast(op);
p[0] = Op.ptr();
p[1] = tensor.ptr();
for (size_t i = 0; i < tensors.size(); ++i) {
p[i + 2] = tensors[i].ptr();
}
py::tuple ret =
py::reinterpret_steal<py::object>(py_apply(NULL, p.data(), p.size()));
return ret[0];
}

py::object _setitem_cpp(py::handle inp_hdl, py::handle idx_hdl, py::handle val_hdl) {
py::object org_shape = getattr(inp_hdl, "shape");
py::object val = py::reinterpret_borrow<py::object>(val_hdl);
if (!TensorWrapper::try_cast(val.ptr()) && !py::isinstance<PySymbolVar>(val)) {
val =
_Const(val_hdl, getattr(inp_hdl, "dtype"), getattr(inp_hdl, "device"),
inp_hdl);
}

py::tuple up = _unpack_indexes(inp_hdl, idx_hdl);
py::object tensor = py::reinterpret_borrow<py::object>(up[0]);
py::list tensors = py::reinterpret_borrow<py::list>(up[1]);
py::list py_items = py::reinterpret_borrow<py::list>(up[2]);
std::vector<std::tuple<int8_t, bool, bool, bool, bool>> cpp_items;
for (size_t i = 0; i < py_items.size(); ++i) {
py::list item = py::reinterpret_borrow<py::list>(py_items[i]);
cpp_items.push_back(
{item[0].cast<int8_t>(), item[1].cast<bool>(), item[2].cast<bool>(),
item[3].cast<bool>(), item[4].cast<bool>()});
}
static std::shared_ptr<OpDef> op, set_op;
if (up[3].cast<bool>()) {
op = Subtensor::make(cpp_items);
} else {
op = IndexingMultiAxisVec::make(cpp_items);
}
std::vector<PyObject*> p;
p.resize(tensors.size() + 2);
py::object Op = py::cast(op);
p[0] = Op.ptr();
p[1] = tensor.ptr();
for (size_t i = 0; i < tensors.size(); ++i) {
p[i + 2] = tensors[i].ptr();
}
py::tuple ret =
py::reinterpret_steal<py::object>(py_apply(NULL, p.data(), p.size()));
py::object tmp_result = ret[0];

try {
py::object value_tuple_shape = val.attr("_tuple_shape");
py::object tmp_result_tuple_shape = tmp_result.attr("_tuple_shape");
py::tuple value_shape = py::reinterpret_borrow<py::tuple>(value_tuple_shape);
py::tuple tmp_result_shape =
py::reinterpret_borrow<py::tuple>(tmp_result_tuple_shape);
for (size_t i = 0; i < value_shape.size() && i < tmp_result_shape.size(); ++i) {
size_t vs = value_shape[value_shape.size() - i - 1].cast<size_t>();
size_t ts =
tmp_result_shape[tmp_result_shape.size() - i - 1].cast<size_t>();
if (vs != 1 && vs != ts) {
std::string lhs = "", rhs = "";
for (size_t j = 0; j < tmp_result_shape.size(); ++j) {
lhs += std::to_string(tmp_result_shape[j].cast<size_t>());
if (j)
lhs += ",";
}
for (size_t j = 0; j < value_shape.size(); ++j) {
rhs += std::to_string(value_shape[j].cast<size_t>());
if (j)
rhs += ",";
}
throw py::value_error(
"cannot copy tensor with shape (" + rhs +
") to subtensor with shape (" + lhs + ")");
}
}
} catch (py::error_already_set& err) {
;
}

py::object broadcast_func = getattr(val, "_broadcast");
PyObject* Args = PyTuple_New(1);
PyTuple_SetItem(Args, 0, getattr(tmp_result, "shape").release().ptr());
PyObject* new_val = PyObject_CallObject(broadcast_func.ptr(), Args);
Py_XDECREF(Args);
val = py::reinterpret_steal<py::object>(new_val);

if (up[3].cast<bool>()) {
set_op = SetSubtensor::make(cpp_items);
} else {
set_op = IndexingSetMultiAxisVec::make(cpp_items);
}

std::vector<PyObject*> q;
q.resize(tensors.size() + 3);
py::object Set_Op = py::cast(set_op);
q[0] = Set_Op.ptr();
q[1] = tensor.ptr();
q[2] = val.ptr();
for (size_t i = 0; i < tensors.size(); ++i) {
q[i + 3] = tensors[i].ptr();
}
py::tuple result =
py::reinterpret_steal<py::object>(py_apply(NULL, q.data(), q.size()));
py::object res = result[0];

if (up[4].cast<bool>()) {
py::object reshape_func = getattr(res, "reshape");
PyObject* Args = PyTuple_New(1);
PyTuple_SetItem(Args, 0, org_shape.release().ptr());
PyObject* new_tensor = PyObject_CallObject(reshape_func.ptr(), Args);
Py_XDECREF(Args);
res = py::reinterpret_steal<py::object>(new_tensor);
}

return res;
}

PyObject* make_shape_tuple(PyObject* self, PyObject* const* args, size_t nargs) {
try {
return _make_shape_tuple(py::handle(args[0])).release().ptr();
}
PYEXT17_TRANSLATE_EXC_RET(nullptr)
}

PyObject* getitem_cpp(PyObject* self, PyObject* const* args, size_t nargs) {
try {
return _getitem_cpp(py::handle(args[0]), py::handle(args[1])).release().ptr();
}
PYEXT17_TRANSLATE_EXC_RET(nullptr)
}

PyObject* setitem_cpp(PyObject* self, PyObject* const* args, size_t nargs) {
try {
return _setitem_cpp(
py::handle(args[0]), py::handle(args[1]), py::handle(args[2]))
.release()
.ptr();
}
PYEXT17_TRANSLATE_EXC_RET(nullptr)
}

} // namespace mgb::imperative::python

imperative/python/src/tensor_utils.h (+11, -0)

@@ -0,0 +1,11 @@
#pragma once

namespace mgb::imperative::python {

PyObject* make_shape_tuple(PyObject* self, PyObject* const* args, size_t nargs);

PyObject* getitem_cpp(PyObject* self, PyObject* const* args, size_t nargs);

PyObject* setitem_cpp(PyObject* self, PyObject* const* args, size_t nargs);

} // namespace mgb::imperative::python

imperative/src/impl/interpreter/interpreter_impl.cpp (+13, -3)

@@ -642,7 +642,7 @@ void ChannelImpl::produce_tensor(TensorInfo* dest, TensorPtr ptr) {
    m_dtr.update_used_time(dest);
    MGB_RECORD_EVENT(
            TensorProduceEvent, dest->id, ptr->layout(), ptr->comp_node(),
            ptr->dev_tensor().raw_ptr());
            ptr->dev_tensor(false).raw_ptr());
    // update tensor desc for static infer
    if (dest->desc.layout.ndim) {
        mgb_assert(
@@ -730,10 +730,20 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd, std::string reason) {
                    inputs, apply_functor, const_functor);
            return outputs;
        }
return OpDef::apply_on_physical_tensor(def, inputs, output_descs, validated);
// Check Input Layout
// Get the input layout constraints, and if the constraint is not satisfied
// inplace update the layout and blob to make the tensor contiguous
auto&& constraints = OpDef::get_input_layout_constraint(def, inputs);
for (size_t idx = 0; idx < inputs.size(); ++idx) {
auto&& layout_checker = constraints[idx];
if (layout_checker) {
inputs[idx]->to_contiguous_inplace(layout_checker);
}
}
return OpDef::apply_on_physical_tensor(
def, std::move(inputs), output_descs, validated);
    };
    MGB_RECORD_EVENT(OpExecuteEvent, apply_id, {}, reason);
    // Begin profiling operator
    SmallVector<std::pair<CompNode, uint64_t>> kernels;
    if (profiling_device) {
        // Collecting devices
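
For reference, the constraint loop added above treats each entry returned by get_input_layout_constraint as an optional predicate over the corresponding input's layout; an empty callback means any (possibly forwarded, non-contiguous) layout is acceptable. A rough sketch of the contract, assuming the callback type accepts a lambda as the checkers elsewhere in this commit do:

// The checker Reduce registers later in this commit: accept only contiguous inputs.
VarNode::LayoutConstraintCallback require_contiguous = [](const TensorLayout& layout) {
    return layout.is_contiguous();
};
// to_contiguous_inplace() re-materializes the tensor's blob only when the
// predicate rejects the current layout, so forwarded views stay zero-copy
// whenever the kernel can handle them. (`input` is a hypothetical TensorPtr.)
input->to_contiguous_inplace(require_contiguous);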


imperative/src/impl/mgb_cg_impl.h (+1, -0)

@@ -1 +1,2 @@
#include "../../../src/core/impl/graph/cg_impl.h" #include "../../../src/core/impl/graph/cg_impl.h"
#include "../../../src/core/impl/graph/var_node_mem_mgr.h"

imperative/src/impl/op_def.cpp (+5, -0)

@@ -60,6 +60,11 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> OpDef::infer_output_attrs_falli
    return def.trait()->infer_output_attrs_fallible(def, inputs);
}


SmallVector<VarNode::LayoutConstraintCallback> OpDef::get_input_layout_constraint(
const OpDef& def, const SmallVector<TensorPtr>& inputs) {
return def.trait()->get_input_layout_constraint(def, inputs);
}

EncodedSubgraph OpDef::make_backward_graph(
        const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs,
        const SmallVector<bool>& input_requires_grad,


imperative/src/impl/op_trait.cpp (+8, -0)

@@ -47,6 +47,10 @@ void OpMethFallbackByProxyGraph::impl(
        InferOutputAttrsFallible& func, op_meth_tag::InferOutputAttrsFallible) {
    func.Base::operator=(proxy_graph_detail::infer_output_attrs_fallible);
}
void OpMethFallbackByProxyGraph::impl(
GetInputLayoutConstraint& func, op_meth_tag::GetInputLayoutConstraint) {
func.Base::operator=(proxy_graph_detail::get_input_layout_constraint);
}
void OpMethFallbackByProxyGraph::impl(GradMaker& func, op_meth_tag::GradMaker) {
    func.Base::operator=(proxy_graph_detail::make_backward_graph);
}
@@ -63,6 +67,10 @@ void OpMethFallbackFromSubgraph::impl(
        InferOutputAttrsFallible& func, op_meth_tag::InferOutputAttrsFallible) {
    func.Base::operator=(subgraph_detail::infer_output_attrs_fallible);
}
void OpMethFallbackFromSubgraph::impl(
GetInputLayoutConstraint& func, op_meth_tag::GetInputLayoutConstraint) {
func.Base::operator=(subgraph_detail::get_input_layout_constraint);
}
void OpMethFallbackFromSubgraph::impl(GradMaker& func, op_meth_tag::GradMaker) {
    func.Base::operator=(subgraph_detail::make_backward_graph);
}


imperative/src/impl/op_trait.h (+9, -0)

@@ -73,6 +73,9 @@ OpMethType(ApplyOnVarNode,
OpMethType(InferOutputAttrsFallible,
        decltype(OpDef::infer_output_attrs_fallible));


OpMethType(GetInputLayoutConstraint,
decltype(OpDef::get_input_layout_constraint));

OpMethType(GradMaker,
        decltype(OpDef::make_backward_graph));


@@ -119,6 +122,8 @@ struct OpMethFallbackByProxyGraph : OpMethImplBase {
    static void impl(ApplyOnPhysicalTensor& func, op_meth_tag::ApplyOnPhysicalTensor);
    static void impl(
            InferOutputAttrsFallible& func, op_meth_tag::InferOutputAttrsFallible);
static void impl(
GetInputLayoutConstraint& func, op_meth_tag::GetInputLayoutConstraint);
    static void impl(GradMaker& func, op_meth_tag::GradMaker);
};


@@ -128,6 +133,8 @@ struct OpMethFallbackFromSubgraph : OpMethImplBase {
    static void impl(ApplyOnVarNode& func, op_meth_tag::ApplyOnVarNode);
    static void impl(
            InferOutputAttrsFallible& func, op_meth_tag::InferOutputAttrsFallible);
static void impl(
GetInputLayoutConstraint& func, op_meth_tag::GetInputLayoutConstraint);
    static void impl(GradMaker& func, op_meth_tag::GradMaker);
};


@@ -179,6 +186,7 @@ struct OpTrait {
    ApplyOnDeviceTensorND apply_on_device_tensornd;
    ApplyOnVarNode apply_on_var_node;
    InferOutputAttrsFallible infer_output_attrs_fallible;
    GetInputLayoutConstraint get_input_layout_constraint;
    GradMaker make_backward_graph;
    Props props;
    HashFunc hash;
@@ -199,6 +207,7 @@ struct OpTrait {
    cb(apply_on_device_tensornd) \
    cb(apply_on_var_node) \
    cb(infer_output_attrs_fallible) \
    cb(get_input_layout_constraint) \
    cb(make_backward_graph) \
    cb(props) \
    cb(hash) \


imperative/src/impl/opr_utility.cpp (+2, -2)

@@ -117,7 +117,7 @@ void InputCallback::scn_do_execute() {
        layout.init_contiguous_stride();
        dev_tensor.reset(dev_tensor.storage(), layout);
    }
    output(0)->reset_dev_tensor_from_tensor(dev_tensor);
    output(0)->force_assign_dev_tensor_from_tensor(dev_tensor);
}


cg::OperatorNodeBase* InputCallback::shallow_copy(
@@ -311,7 +311,7 @@ cg::OperatorNodeBase::NodeProp* MutableTensor::do_make_node_prop() const {
}


void MutableTensor::scn_do_execute() {
    output(0)->reset_dev_tensor_from_tensor(*m_dev_tensor);
    output(0)->force_assign_dev_tensor_from_tensor(*m_dev_tensor);
}


void MutableTensor::init_output_static_infer_desc() {


imperative/src/impl/ops/broadcast.cpp (+39, -27)

@@ -83,28 +83,18 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
SmallVector<TensorPtr> apply_on_physical_tensor(
        const OpDef& def, const SmallVector<TensorPtr>& inputs,
        SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
auto& input = inputs[0];
TensorShape target_shape;
if (validated) {
target_shape = output_descs[0].layout;
} else {
cg::copy_tensor_value_to_shape(
target_shape, inputs[1]->get_value().proxy_to_default_cpu());
}
TensorPtr output = Tensor::make(
TensorLayout(target_shape, input->dtype()), input->comp_node());
if (output->layout().is_empty()) {
return {output};
}
if (input->shape().eq_shape(output->shape())) {
mgb_assert(input->layout().eq_layout(output->layout()));
output->dev_tensor().copy_from_fixlayout(input->dev_tensor());
} else {
TensorLayout input_layout = input->layout().broadcast(output->shape());
output->dev_tensor().copy_from_fixlayout(
input->dev_tensor().sub(SubTensorSpec::make_from_layout(input_layout)));
}
return {output};
def.cast_final_safe<Broadcast>();
size_t nr_inp = inputs.size();
mgb_assert(nr_inp == 2, "Broadcast expects 2 inputs; got %lu actually", nr_inp);
auto&& src = inputs[0];
auto&& tshp_nd = inputs[1];
auto slayout = src->layout();

TensorShape tshp;
cg::copy_tensor_value_to_shape(tshp, tshp_nd->get_value().proxy_to_default_cpu());
TensorLayout tlayout = slayout.broadcast(tshp);
// memory forward
return {Tensor::make(src->blob(), src->offset(), tlayout)};
}

OP_TRAIT_REG(Broadcast, Broadcast, opr::Broadcast)
@@ -184,10 +174,6 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
    auto&& tshp_nd = inputs[1];
    auto slayout = src->layout();


if (validated) {
return {Tensor::make(src->blob(), 0, output_descs[0].layout)};
}

    TensorShape tshp;
    cg::copy_tensor_value_to_shape(tshp, tshp_nd->get_value().proxy_to_default_cpu());
    if (op_def.axis != opr::Reshape::Param::INVALID_AXIS) {
@@ -195,13 +181,39 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
        tshp[op_def.axis] = 1;
        tshp[op_def.axis] = src->layout().total_nr_elems() / tshp.total_nr_elems();
    }
return {Tensor::make(src->blob(), 0, slayout.reshape(tshp))};
TensorLayout tlayout;
mgb_assert(slayout.try_reshape(tlayout, tshp));
return {Tensor::make(src->blob(), src->offset(), tlayout)};
}

SmallVector<VarNode::LayoutConstraintCallback> get_input_layout_constraint(
const OpDef& def, const SmallVector<TensorPtr>& inputs) {
auto&& op_def = def.cast_final_safe<Reshape>();
SmallVector<VarNode::LayoutConstraintCallback> layout_checker(inputs.size());
layout_checker[0] = [&](const TensorLayout& layout) {
TensorShape tshp;
TensorLayout ret;
cg::copy_tensor_value_to_shape(
tshp, inputs[1]->get_value().proxy_to_default_cpu());
if (op_def.axis != opr::Reshape::Param::INVALID_AXIS) {
mgb_assert(tshp[op_def.axis] == -1);
tshp[op_def.axis] = 1;
tshp[op_def.axis] = layout.total_nr_elems() / tshp.total_nr_elems();
}
if (layout.try_reshape(ret, tshp)) {
return true;
} else {
return false;
}
};
return layout_checker;
}

OP_TRAIT_REG(Reshape, Reshape)
        .apply_on_var_node(apply_on_var_node)
        .infer_output_attrs_fallible(infer_output_attrs_fallible)
        .apply_on_physical_tensor(apply_on_physical_tensor)
        .get_input_layout_constraint(get_input_layout_constraint)
        .fallback();
} // namespace reshape
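
Note the interplay in the Reshape path above: the input-0 checker registered by get_input_layout_constraint runs essentially the same try_reshape test, so if a forwarded, non-contiguous source layout cannot be reshaped as a pure view, the interpreter's constraint loop makes it contiguous first and the mgb_assert(slayout.try_reshape(...)) then holds. A minimal standalone sketch of that test (hypothetical helper name):

// True when `shape` is expressible over `layout`'s existing strides,
// i.e. the reshape can be memory-forwarded without touching the data.
auto reshapable_as_view = [](const TensorLayout& layout, const TensorShape& shape) {
    TensorLayout out;
    return layout.try_reshape(out, shape);
};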




imperative/src/impl/ops/elemwise.cpp (+15, -5)

@@ -220,12 +220,22 @@ cg::OperatorNodeBase* apply_inplace_add_on_var_node(
SmallVector<TensorPtr> apply_inplace_add_on_physical_tensor(
        const OpDef& def, const SmallVector<TensorPtr>& inputs,
        SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
mgb_assert(
inputs[0]->blob().use_count() == 1 && inputs[0]->blob()->storage().unique(),
"This inplace modification may change the elements of other tensors. "
"Please set MEGENGINE_INPLACE_UPDATE to 0 to ensure the program runs "
"correctly.");
    auto dest = inputs[0], delta = inputs[1], alpha = inputs[2], beta = inputs[3];
if (!(inputs[0]->blob().unique() && inputs[0]->blob()->storage().unique())) {
mgb_log_warn(
"This inplace modification may change the elements of other tensors. "
"Fallback to non-inplace update.");

DeviceTensorStorage storage;
storage.reset(dest->comp_node(), dest->blob()->size(), dest->blob()->storage());
storage = storage.sub(dest->offset());
DeviceTensorND dv;
dv.reset(storage, dest->layout());

DeviceTensorND dv_new;
dv_new.copy_from(dv);
dest = Tensor::make(dv_new);
}
    auto tensor_to_scalar = [](const TensorPtr& tensor) -> float {
        return *tensor->get_value().ptr<float>();
    };


imperative/src/impl/ops/reduce.cpp (+12, -1)

@@ -54,7 +54,8 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
        const OpDef& def, const SmallVector<TensorPtr>& inputs,
        SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
    if (memory_forward_success(def, inputs)) {
return {Tensor::make(inputs[0]->blob(), 0, inputs[0]->layout())};
return {Tensor::make(
inputs[0]->blob(), inputs[0]->offset(), inputs[0]->layout())};
    }
    return proxy_graph_detail::apply_on_physical_tensor(
            def, inputs, output_descs, validated);
@@ -73,11 +74,21 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
    return {output_descs, validated};
}


SmallVector<VarNode::LayoutConstraintCallback> get_input_layout_constraint(
const OpDef& def, const SmallVector<TensorPtr>& inputs) {
SmallVector<VarNode::LayoutConstraintCallback> layout_checker(inputs.size());
layout_checker[0] = [](const TensorLayout& layout) {
return layout.is_contiguous();
};
return layout_checker;
}

OP_TRAIT_REG(Reduce, Reduce, opr::Reduce)
        .make_from_op_node(make_from_op_node)
        .apply_on_var_node(apply_on_var_node)
        .apply_on_physical_tensor(apply_on_physical_tensor)
        .infer_output_attrs_fallible(infer_output_attrs_fallible)
        .get_input_layout_constraint(get_input_layout_constraint)
        .fallback();
} // namespace reduce
} // namespace


imperative/src/impl/ops/rng.cpp (+8, -0)

@@ -594,6 +594,13 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible<Dro
    return {dests, true};
}


template <typename Op>
SmallVector<VarNode::LayoutConstraintCallback> get_input_layout_constraint(
const OpDef& def, const SmallVector<TensorPtr>& inputs) {
SmallVector<VarNode::LayoutConstraintCallback> layout_checker(inputs.size());
return layout_checker;
}

} // anonymous namespace

Handle new_handle(CompNode comp_node, uint64_t seed) {
@@ -622,6 +629,7 @@ CompNode get_rng_handle_compnode(Handle handle) {
.apply_on_var_node(apply_on_var_node<NAME, Output>) \
.apply_on_physical_tensor(apply_on_physical_tensor<NAME>) \
.infer_output_attrs_fallible(infer_output_attrs_fallible<NAME>) \
.get_input_layout_constraint(get_input_layout_constraint<NAME>) \
.fallback(); \
}




+ 95
- 2
imperative/src/impl/ops/specializations.cpp View File

@@ -60,9 +60,55 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
return opr::Dimshuffle::make(inputs[0], ds.pattern, 0UL, config);
}


SmallVector<TensorPtr> apply_on_physical_tensor(
const OpDef& def, const SmallVector<TensorPtr>& inputs,
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
auto&& ds = static_cast<const Dimshuffle&>(def);
mgb_assert(
ds.pattern.size() <= TensorShape::MAX_NDIM,
"Dimshuffle pattern exceeds max length of %zd", TensorShape::MAX_NDIM);
size_t nr_inp = inputs.size();
mgb_assert(nr_inp == 1, "Dimshuffle expects 1 input; got %lu actually", nr_inp);
auto&& src = inputs[0];
auto inp_layout = src->layout();
size_t pattern_ndim = *std::max_element(ds.pattern.begin(), ds.pattern.end()) + 1;
mgb_assert(
inp_layout.ndim == pattern_ndim,
"input ndim mismatch for Dimshuffle: expect=%zd actual=%zd", pattern_ndim,
inp_layout.ndim);
TensorLayout out_layout{inp_layout.dtype};
out_layout.ndim = ds.pattern.size();

size_t idx = 0;
bool input_used[TensorLayout::MAX_NDIM] = {0};
for (auto i : ds.pattern) {
if (i < 0) {
out_layout.shape[idx] = 1;
out_layout.stride[idx] = 1;
} else {
input_used[i] = true;
out_layout.shape[idx] = inp_layout.shape[i];
out_layout.stride[idx] = inp_layout.stride[i];
}
++idx;
}
if (out_layout.is_contiguous()) {
out_layout.init_contiguous_stride();
}
for (size_t i = 0; i < pattern_ndim; ++i) {
mgb_assert(
input_used[i] || inp_layout.shape[i] == 1,
"non-1 dim discarded in Dimshuffle: ishp=%s dim=%zd",
inp_layout.megdnn::TensorShape::to_string().c_str(), i);
}
// memory forward
return {Tensor::make(src->blob(), src->offset(), out_layout)};
}

OP_TRAIT_REG(Dimshuffle, Dimshuffle, opr::Dimshuffle)
.make_from_op_node(make_from_op_node)
.apply_on_var_node(apply_on_var_node)
.apply_on_physical_tensor(apply_on_physical_tensor)
.fallback();
} // namespace dimshuffle
} // namespace
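
The branch above is pure memory forwarding: the output tensor reuses the input blob and only the (shape, stride) metadata is permuted by the Dimshuffle pattern, with -1 entries inserting broadcast axes of extent 1. A toy, self-contained illustration of that layout permutation over plain vectors instead of megdnn::TensorLayout (ToyLayout and dimshuffle_layout are illustrative names):

#include <cstddef>
#include <cstdio>
#include <vector>

// Permute (shape, stride) pairs with a Dimshuffle-like pattern. pattern[i] == -1
// inserts a new axis of extent 1; otherwise input axis pattern[i] becomes output
// axis i. The data itself never moves.
struct ToyLayout {
    std::vector<long> shape, stride;
};

ToyLayout dimshuffle_layout(const ToyLayout& in, const std::vector<int>& pattern) {
    ToyLayout out;
    for (int p : pattern) {
        if (p < 0) {
            out.shape.push_back(1);
            out.stride.push_back(1);
        } else {
            out.shape.push_back(in.shape[p]);
            out.stride.push_back(in.stride[p]);
        }
    }
    return out;
}

int main() {
    ToyLayout in{{2, 3}, {3, 1}};                   // contiguous 2x3 tensor
    ToyLayout out = dimshuffle_layout(in, {1, 0});  // pattern {1, 0}: transpose
    for (std::size_t i = 0; i < out.shape.size(); ++i)
        std::printf("dim %zu: shape=%ld stride=%ld\n", i, out.shape[i], out.stride[i]);
    // prints (3, 1) and (2, 3): a valid but non-contiguous view of the same blob
}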
@@ -80,7 +126,25 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
return opr::AxisAddRemove::make(inputs[0], param, config);
}


OP_TRAIT_REG(AddAxis, AddAxis).apply_on_var_node(apply_on_var_node).fallback();
SmallVector<TensorPtr> apply_on_physical_tensor(
const OpDef& def, const SmallVector<TensorPtr>& inputs,
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
auto&& op_def = def.cast_final_safe<AddAxis>();
size_t nr_inp = inputs.size();
mgb_assert(nr_inp == 1, "AddAxis expects 1 input; got %lu actually", nr_inp);
auto&& src = inputs[0];
auto tlayout = src->layout();
for (auto&& i : op_def.axis) {
tlayout.add_axis_cont_inplace(i);
}
// memory forward
return {Tensor::make(src->blob(), src->offset(), tlayout)};
}

OP_TRAIT_REG(AddAxis, AddAxis)
.apply_on_var_node(apply_on_var_node)
.apply_on_physical_tensor(apply_on_physical_tensor)
.fallback();
} // namespace add_axis
} // namespace


@@ -97,7 +161,36 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
return opr::AxisAddRemove::make(inputs[0], param, config);
}


OP_TRAIT_REG(RemoveAxis, RemoveAxis).apply_on_var_node(apply_on_var_node).fallback();
SmallVector<TensorPtr> apply_on_physical_tensor(
const OpDef& def, const SmallVector<TensorPtr>& inputs,
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
auto&& op_def = def.cast_final_safe<RemoveAxis>();
size_t nr_inp = inputs.size();
mgb_assert(nr_inp == 1, "RemoveAxis expects 1 input; got %lu actually", nr_inp);
auto&& src = inputs[0];
auto tlayout = src->layout();
for (auto&& i : op_def.axis) {
if (tlayout.ndim == 1) {
mgb_assert(
tlayout.shape[0] == 1 && i == 0,
"can not remove axis %u from tensor of shape=%s", i,
tlayout.megdnn::TensorShape::to_string().c_str());
} else {
mgb_assert(
i < tlayout.ndim && tlayout.shape[i] == 1,
"can not remove axis %u from tensor of shape=%s", i,
tlayout.megdnn::TensorShape::to_string().c_str());
tlayout.remove_axis_inplace(i);
}
}
// memory forward
return {Tensor::make(src->blob(), src->offset(), tlayout)};
}

OP_TRAIT_REG(RemoveAxis, RemoveAxis)
.apply_on_var_node(apply_on_var_node)
.apply_on_physical_tensor(apply_on_physical_tensor)
.fallback();
} // namespace remove_axis
} // namespace
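
Both AxisAddRemove forwarding paths work the same way: inserting an axis of extent 1 or removing one never moves data, so the output shares the input blob and only the layout is edited. A small self-contained sketch of those metadata edits (ToyLayout, add_axis and remove_axis are illustrative names, not the megdnn helpers used above):

#include <cassert>
#include <cstddef>
#include <vector>

struct ToyLayout {
    std::vector<long> shape, stride;
};

// Insert an axis of extent 1 at `axis`; any stride is valid for a size-1 dim,
// 1 simply keeps the description tidy.
void add_axis(ToyLayout& l, std::size_t axis) {
    l.shape.insert(l.shape.begin() + axis, 1);
    l.stride.insert(l.stride.begin() + axis, 1);
}

// Drop an axis, which is only legal when its extent is 1 (as asserted above
// for RemoveAxis).
void remove_axis(ToyLayout& l, std::size_t axis) {
    assert(l.shape[axis] == 1);
    l.shape.erase(l.shape.begin() + axis);
    l.stride.erase(l.stride.begin() + axis);
}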




+ 1
- 1
imperative/src/impl/ops/utility.cpp View File

@@ -411,7 +411,7 @@ struct ComputingGraphHolder {
executable->wait();
size_t nr_inputs = inputs.size();
for (size_t i = 0; i < nr_inputs; ++i) {
auto input_dev_tensor = input_tensors[i]->dev_tensor();
auto input_dev_tensor = input_tensors[i]->dev_tensor(false);
inputs[i].device_value->reset(
input_dev_tensor.storage(), input_dev_tensor.layout());
if (inputs[i].host_value) {


+ 78
- 15
imperative/src/impl/physical_tensor.cpp View File

@@ -95,7 +95,13 @@ const Blob::RawStorage& Blob::storage() {


Tensor::Tensor(
BlobPtr blob, const TensorLayout& layout, size_t offset, const HostTensorND& hv)
: m_layout(layout), m_blob(std::move(blob)), m_offset(offset), m_value(hv) {}
: m_cn(blob->comp_node()),
m_shape(layout),
m_dtype(layout.dtype),
m_layout(layout),
m_blob(std::move(blob)),
m_offset(offset),
m_value(hv) {}


Tensor::Tensor(const HostTensorND& hv) : Tensor(hv.layout(), hv.comp_node()) {
constexpr int size_threshold = TensorShape::MAX_NDIM;
@@ -107,7 +113,12 @@ Tensor::Tensor(const HostTensorND& hv) : Tensor(hv.layout(), hv.comp_node()) {
MGB_RECORD_EVENT(
profiler::HostToDeviceEvent, hv.layout(), hv.comp_node(), hv.raw_ptr(),
dev_tensor().raw_ptr());
dev_tensor().copy_from_fixlayout(hv);
DeviceTensorStorage storage;
storage.reset(m_cn, m_blob->size(), m_blob->storage());
storage = storage.sub(m_offset);
DeviceTensorND dv;
dv.reset(storage, m_layout);
dv.copy_from_fixlayout(hv);
// even though hv is saved in m_value, Tensor itself could be
// released before copy completes
MGB_RECORD_EVENT(
@@ -117,25 +128,36 @@ Tensor::Tensor(const HostTensorND& hv) : Tensor(hv.layout(), hv.comp_node()) {
}
}


Tensor::Tensor(const DeviceTensorND& dv, const HostTensorND& hv) {
Tensor::Tensor(const DeviceTensorND& dv, const HostTensorND& hv)
: m_offset(dv.storage().offset()),
m_cn(dv.comp_node()),
m_shape(dv.layout()),
m_dtype(dv.layout().dtype),
m_blob(Blob::make(dv.storage())),
m_layout(dv.layout()) {
if (!hv.empty()) {
mgb_assert(dv.comp_node() == hv.comp_node());
mgb_assert(dv.dtype() == hv.dtype());
mgb_assert(dv.shape().eq_shape(hv.shape()));
m_value = hv;
}
m_layout = dv.layout();
m_blob = Blob::make(dv.storage());
m_offset = dv.storage().offset();
}


Tensor::Tensor(const TensorLayout& layout, const CompNode& cn)
: m_layout{layout},
m_blob{Blob::make(cn, layout.span().dist_byte())},
m_offset{0} {}
m_offset{0},
m_cn(cn),
m_shape(layout),
m_dtype(layout.dtype) {}


Tensor::Tensor(const BlobPtr blob, const size_t offset, const TensorLayout& layout)
: m_layout{layout}, m_blob{blob}, m_offset{offset} {}
: m_layout{layout},
m_blob{blob},
m_offset{offset},
m_cn(blob->comp_node()),
m_shape(layout),
m_dtype(layout.dtype) {}


TensorPtr Tensor::make(const HostTensorND& hv) {
auto&& blob = MultiCNConstTensorCache::inst().lookup(hv);
@@ -145,10 +167,45 @@ TensorPtr Tensor::make(const HostTensorND& hv) {
return std::make_shared<Tensor>(hv);
}


DeviceTensorND Tensor::dev_tensor() {
void Tensor::to_contiguous_inplace(VarNode::LayoutConstraintCallback& layout_checker) {
MGB_LOCK_GUARD(m_blob_mtx);
if (!m_layout.is_empty() && !layout_checker(m_layout)) {
DeviceTensorStorage storage;
storage.reset(m_cn, m_blob->size(), m_blob->storage());
storage = storage.sub(m_offset);
DeviceTensorND dv;
dv.reset(storage, m_layout);

DeviceTensorND dv_contig;
dv_contig.copy_from(dv);
m_layout = dv_contig.layout();
std::atomic_store(&m_blob, Blob::make(dv_contig.storage()));
mgb_assert(m_layout.is_contiguous());
m_offset = 0;
}
}

void Tensor::to_contiguous_inplace() {
static VarNode::LayoutConstraintCallback default_cb =
[](const TensorLayout& layout) { return layout.is_contiguous(); };
to_contiguous_inplace(default_cb);
}

void Tensor::assign_from_dev_tensor(DeviceTensorND dv) {
MGB_LOCK_GUARD(m_blob_mtx);
std::atomic_store(&m_blob, Blob::make(dv.storage()));
m_offset = dv.storage().offset();
m_layout = dv.layout();
}

DeviceTensorND Tensor::dev_tensor(bool contiguous) {
mgb_assert(m_blob, "uninitialized tensor.");
if (contiguous) {
to_contiguous_inplace();
}
MGB_LOCK_GUARD(m_blob_mtx);
DeviceTensorStorage storage;
storage.reset(m_blob->comp_node(), m_blob->size(), m_blob->storage());
storage.reset(m_cn, m_blob->size(), m_blob->storage());
storage = storage.sub(m_offset);
DeviceTensorND ret;
ret.reset(storage, m_layout);
@@ -156,16 +213,22 @@ DeviceTensorND Tensor::dev_tensor() {
}


void Tensor::fetch_value() {
MGB_LOCK_GUARD(m_mtx);
MGB_LOCK_GUARD(m_blob_mtx);
MGB_LOCK_GUARD(m_value_mtx);
if (m_value.empty()) {
m_value.copy_from(dev_tensor());
DeviceTensorStorage storage;
storage.reset(m_cn, m_blob->size(), m_blob->storage());
storage = storage.sub(m_offset);
DeviceTensorND dv;
dv.reset(storage, m_layout);
m_value.copy_from(dv);
m_value_ready.reset(EventPool::without_timer().alloc(comp_node()));
m_value_ready->record();
}
}


bool Tensor::value_fetched() {
MGB_LOCK_GUARD(m_mtx);
MGB_LOCK_GUARD(m_value_mtx);
return m_value.layout().ndim != 0;
}


@@ -178,7 +241,7 @@ const HostTensorND& Tensor::get_value() {
}


const HostTensorND* Tensor::try_get_value() {
MGB_LOCK_GUARD(m_mtx);
MGB_LOCK_GUARD(m_value_mtx);
if (!m_value.empty() && (!m_value_ready || m_value_ready->finished())) {
return &m_value;
}
@@ -193,7 +256,7 @@ TensorPtr Tensor::make_scalar(DTypeScalar value, CompNode cn) {
}


TensorPtr Tensor::sub(size_t offset, TensorShape shape) {
TensorLayout layout(shape, m_layout.dtype);
TensorLayout layout(shape, m_dtype);
return Tensor::make(m_blob, offset + m_offset, layout);
}
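
The reworked accessor is the central contract of this change: dev_tensor() now defaults to returning a contiguous view (it calls to_contiguous_inplace() first, which may copy once and replace the blob), while callers that can digest an arbitrary, possibly forwarded layout pass false and skip the copy. A hedged usage sketch against the Tensor API shown in this file (it assumes the MegEngine imperative headers and an already-constructed tensor; not a standalone program):

// `t` may carry a forwarded layout, e.g. a transposed Dimshuffle output that
// aliases its input blob.
TensorPtr t = /* obtained elsewhere */;

// Default: always contiguous; may contiguize (allocate + copy) once.
DeviceTensorND contig = t->dev_tensor();

// Opt out of the copy when the caller handles strided layouts itself, as the
// proxy-graph code in this commit does.
DeviceTensorND raw = t->dev_tensor(false);

// Apply an explicit constraint: the tensor only copies when its current
// layout fails the check.
VarNode::LayoutConstraintCallback checker = [](const TensorLayout& layout) {
    return layout.is_contiguous();
};
t->to_contiguous_inplace(checker);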




+ 2
- 5
imperative/src/impl/proxy_graph.cpp View File

@@ -73,7 +73,7 @@ public:
static SymbolVar make(ComputingGraph& graph, Tensor& tensor) {
auto opr = graph.insert_opr(std::make_unique<InputPlaceholder>(graph, &tensor));
auto var = opr->output(0);
auto&& dev_tensor = tensor.dev_tensor();
auto&& dev_tensor = tensor.dev_tensor(false);
var->m_comp_node = dev_tensor.comp_node();
var->m_shape = dev_tensor.shape();
if (dev_tensor.empty()) {
@@ -81,10 +81,7 @@ public:
layout.init_contiguous_stride();
dev_tensor.reset(dev_tensor.storage(), layout);
}
var->m_dev_tensor = dev_tensor;
var->m_mem_plan.reset_from_owner_var()
.chunk()
.mem_alloc_status.set_from_owner_var();
var->force_assign_dev_tensor_from_tensor(dev_tensor);
return var;
}




+ 15
- 16
imperative/src/impl/proxy_graph/mini_graph.h View File

@@ -314,15 +314,11 @@ public:
size_t idx = 0;
for (auto&& input : opr_inputs) {
mgb_assert(input->owner_opr()->same_type<InputPlaceholder>());
input->m_dev_tensor.storage({});
auto&& dev_tensor = inputs[input_remap[idx]]->dev_tensor();
auto&& dev_tensor = inputs[input_remap[idx]]->dev_tensor(false);
auto&& layout = dev_tensor.layout();
input->shape(dev_tensor.shape());


auto&& chk = input->m_mem_plan.reset_from_owner_var().chunk();
input->m_dev_tensor.reset(dev_tensor.storage(), layout);
input->m_mem_plan.layout(layout);
chk.mem_alloc_status.set_from_owner_var();
input->force_assign_dev_tensor_from_tensor(dev_tensor);


mgb_assert(input->comp_node() == dev_tensor.comp_node());
mgb_assert(input->shape().eq_shape(layout));
@@ -335,9 +331,14 @@ public:
mgb_assert(m_opr->usable_output().size() == outputs.size());
::mgb::opr::intl::WorkspaceLimitHook::set_impl(
m_opr->owner_graph(), get_workspace_limit);
size_t j = 0;
for (auto&& var : m_opr->output()) {
auto&& chk = var->m_mem_plan.reset_from_owner_var().chunk();
chk.mem_alloc_status.set_from_owner_var();
}
m_opr->mem_plan_fwd_in2out_readonly();
size_t j = 0;
for (auto&& var : m_opr->output()) {
if (var->contain_flag(VarNode::Flag::VOLATILE_CONTENT)) {
TensorLayout layout{var->shape(), var->dtype(), var->format()};
var->m_dev_tensor = BlobManager::inst()->alloc_workspace_with_defrag(
@@ -349,18 +350,16 @@ public:
mgb_assert(var->comp_node() == tensor->comp_node());
mgb_assert(var->shape().eq_shape(layout));
mgb_assert(var->dtype() == layout.dtype);
var->assign_dev_tensor_from_tensor(tensor->dev_tensor());
if (var->m_mem_plan.chunk().owner_var != var) {
tensor->assign_from_dev_tensor(
var->m_dev_tensor); // memory forwarding
} else {
var->assign_dev_tensor_from_tensor(tensor->dev_tensor());
}
++j;
}
chk.mem_alloc_status.set_from_owner_var();
}
mgb_assert(j == outputs.size());

// Memory forwarding was bypassed in megbrain with graph option
// imerative_proxy_graph on, here we call mem_plan_fwd_in2out_readonly
// to initialize some opr(e.g. Subtensor)'s internal state
// TODO: implement memory forwarding
m_opr->mem_plan_fwd_in2out_readonly();
{
// some opr (e.g. Reduce) rely on on_mem_status_changed to set
// input/output tensor corretly, since we bypass var_node_mem_mgr
@@ -840,7 +839,7 @@ public:
Tensor::make(output_descs[i].layout, output_descs[i].comp_node);
}


auto raw_outputs = to_raw_ptr_array(outputs);
auto raw_outputs = to_raw_ptr_array(outputs, false);
CompNode::UnorderedSet used_cns;
for (auto&& out : raw_outputs) {
auto cn = out->comp_node();


+ 81
- 0
imperative/src/impl/proxy_graph/proxy_graph.cpp View File

@@ -9,8 +9,12 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/


#include "../mgb_cg_impl.h"
#include "./mini_graph.h"
#include "megbrain/opr/io.h"


using LayoutConstraintLevel = mgb::cg::VarNodeMemManager::LayoutConstraintLevel;
using LayoutConstraintCallback = mgb::VarNode::LayoutConstraintCallback;
namespace mgb::imperative::proxy_graph {
MGB_DYN_TYPE_OBJ_FINAL_IMPL(ProxyGraph::InputPlaceholder);


@@ -34,4 +38,81 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
return ret;
}


std::unordered_map<size_t, SmallVector<LayoutConstraintCallback>>
input_layout_constraints_cache;

SmallVector<LayoutConstraintCallback> get_input_layout_constraint(
const OpDef& def, const SmallVector<TensorPtr>& inputs) {
auto get_input_layout_constraint_hash_key =
[](const OpDef& def, const SmallVector<TensorPtr>& inputs) {
XXHash state;
size_t length = 0, data[1 + inputs.size()];
data[length++] = def.hash();
for (auto&& i : inputs) {
data[length++] = mgb::hash(i->comp_node());
}
state.update(data, length * sizeof(size_t));
return state.digest();
};
auto hash_key = get_input_layout_constraint_hash_key(def, inputs);
auto&& iter = input_layout_constraints_cache.find(hash_key);
if (iter != input_layout_constraints_cache.end()) {
return iter->second;
}
static cg::ComputingGraphImpl* graph =
imperative::ResourceManager::create_global<cg::ComputingGraphImpl>();
VarNodeArray vinputs(inputs.size());
for (size_t i = 0; i < inputs.size(); ++i) {
OperatorNodeConfig config;
auto&& layout = inputs[i]->layout();
layout.init_contiguous_stride();
vinputs[i] = graph->insert_opr(std::make_unique<mgb::opr::SharedDeviceTensor>(
*graph,
std::make_shared<DeviceTensorND>(
inputs[i]->comp_node(), layout),
false, config))
->output(0);
}
auto&& opr = OpDef::apply_on_var_node(def, vinputs)[0]->owner_opr();
opr->add_input_layout_constraint();

SmallVector<LayoutConstraintCallback> res(inputs.size());
auto& mem_mgr = graph->var_node_mem_manager();
for (size_t i = 0; i < vinputs.size(); ++i) {
auto& trait = mem_mgr.get_var_node_mem_trait(vinputs[i]);
switch (trait.layout_constraint.level) {
case LayoutConstraintLevel::CONTIG:
res[i] = [](const TensorLayout& layout) {
return layout.is_contiguous();
};
break;
case LayoutConstraintLevel::MONOTONE:
res[i] = [&trait](const TensorLayout& layout) {
if (!layout.is_abs_monotonous_allow_brdcst()) {
return false;
}
for (auto&& i : trait.layout_constraint.custom)
if (!i(layout))
return false;
return true;
};
break;
case LayoutConstraintLevel::NONE:
if (!trait.layout_constraint.custom.empty()) {
res[i] = [&trait](const TensorLayout& layout) {
for (auto&& i : trait.layout_constraint.custom)
if (!i(layout))
return false;
return true;
};
}
break;
default:
mgb_throw(InternalError, "invalid layout_constraint_level");
}
}
input_layout_constraints_cache.emplace(hash_key, res);
return res;
}

} // namespace mgb::imperative::proxy_graph_detail
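
The lookup above memoizes the constraint callbacks: the key hashes the OpDef together with the comp_node of every input, so the dummy-graph construction only runs once per (operator, device placement) combination. The same memoization pattern restated as a minimal self-contained sketch (std::hash and the names below are stand-ins for the XXHash-based code above):

#include <cstddef>
#include <functional>
#include <unordered_map>
#include <vector>

// A constraint checks a layout (an int stands in for TensorLayout here).
using Constraint = std::function<bool(int)>;

// Placeholder for the costly step (upstream: build a dummy graph and query the
// operator's add_input_layout_constraint()).
static std::vector<Constraint> analyze(std::size_t, const std::vector<std::size_t>& devices) {
    return std::vector<Constraint>(devices.size());
}

std::vector<Constraint> cached_constraints(
        std::size_t op_hash, const std::vector<std::size_t>& devices) {
    static std::unordered_map<std::size_t, std::vector<Constraint>> cache;
    std::size_t key = op_hash;
    for (std::size_t d : devices)  // fold every device id into the key
        key ^= std::hash<std::size_t>{}(d) + 0x9e3779b97f4a7c15ull + (key << 6) + (key >> 2);
    auto it = cache.find(key);
    if (it != cache.end())
        return it->second;  // hit: reuse previously computed callbacks
    auto res = analyze(op_hash, devices);
    cache.emplace(key, res);
    return res;
}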

+ 15
- 0
imperative/src/impl/subgraph_detail.cpp View File

@@ -17,6 +17,8 @@


#include "./op_trait.h"


using LayoutConstraintCallback = mgb::VarNode::LayoutConstraintCallback;

namespace mgb {
namespace imperative {
namespace subgraph_detail {
@@ -73,6 +75,13 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
const std::shared_ptr<OpDef>& op,
const SmallVector<TensorPtr>& inputs,
size_t nr_outputs) {
auto&& constraints = OpDef::get_input_layout_constraint(*op, inputs);
for (size_t idx = 0; idx < inputs.size(); ++idx) {
auto&& layout_checker = constraints[idx];
if (layout_checker) {
inputs[idx]->to_contiguous_inplace(layout_checker);
}
}
// do not use infered output_desc in subgraph
return OpDef::apply_on_physical_tensor(*op, inputs, output_descs, false);
};
@@ -81,6 +90,12 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
return outputs;
}


SmallVector<LayoutConstraintCallback> get_input_layout_constraint(
const OpDef& def, const SmallVector<TensorPtr>& inputs) {
SmallVector<LayoutConstraintCallback> res(inputs.size());
return res;
}

static EncodedSubgraph make_backward_graph_from_forward(
const SmallVector<LogicalTensorDesc>& inputs,
const SmallVector<bool>& input_requires_grad,


+ 3
- 0
imperative/src/include/megbrain/imperative/op_def.h View File

@@ -78,6 +78,9 @@ public:
static EncodedSubgraph make_forward_graph(
const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs);


static SmallVector<VarNode::LayoutConstraintCallback> get_input_layout_constraint(
const OpDef& def, const SmallVector<TensorPtr>& inputs);

const OpTrait* trait() const;


std::string to_string() const;


+ 21
- 8
imperative/src/include/megbrain/imperative/physical_tensor.h View File

@@ -14,6 +14,7 @@
#include <memory>
#include <mutex>


#include "megbrain/graph.h"
#include "megbrain/imperative/resource_manager.h"
#include "megbrain/tensor.h"


@@ -90,18 +91,24 @@ public:


CompNode comp_node() const {
mgb_assert(m_blob, "uninitialized tensor.");
return m_blob->comp_node();
return m_cn;
}


DType dtype() const { return m_layout.dtype; }
DType dtype() const { return m_dtype; }


TensorLayout layout() const { return m_layout; }


const TensorShape& shape() const { return m_layout; }
const TensorShape& shape() const { return m_shape; }


size_t offset() const { return m_offset; }


DeviceTensorND dev_tensor();
void to_contiguous_inplace(VarNode::LayoutConstraintCallback&);

void to_contiguous_inplace();

DeviceTensorND dev_tensor(bool contiguous = true);

void assign_from_dev_tensor(DeviceTensorND);


static TensorPtr make_scalar(DTypeScalar value, CompNode cn);


@@ -110,7 +117,7 @@ public:
return make_scalar(value, m_blob->comp_node());
}


BlobPtr& blob() { return m_blob; }
BlobPtr blob() { return m_blob; }


void fetch_value();
bool value_fetched();
@@ -131,10 +138,16 @@ public:
static void static_initialize();


private:
TensorLayout m_layout;
BlobPtr m_blob;
size_t m_offset;
std::mutex m_mtx;
const CompNode m_cn;
const TensorShape m_shape;
const DType m_dtype;

std::mutex m_blob_mtx;
BlobPtr m_blob;
TensorLayout m_layout;

std::mutex m_value_mtx;
HostTensorND m_value;
EventPtr m_value_ready = nullptr;
};


+ 3
- 0
imperative/src/include/megbrain/imperative/proxy_graph_detail.h View File

@@ -33,6 +33,9 @@ EncodedSubgraph make_backward_graph(
const SmallVector<bool>& input_requires_grad,
const SmallVector<bool>& output_has_grad);


SmallVector<VarNode::LayoutConstraintCallback> get_input_layout_constraint(
const OpDef& def, const SmallVector<TensorPtr>& inputs);

} // namespace proxy_graph_detail
} // namespace imperative
} // namespace mgb


+ 3
- 0
imperative/src/include/megbrain/imperative/subgraph_detail.h View File

@@ -36,6 +36,9 @@ EncodedSubgraph make_backward_graph(
const SmallVector<bool>& input_requires_grad,
const SmallVector<bool>& output_has_grad);


SmallVector<VarNode::LayoutConstraintCallback> get_input_layout_constraint(
const OpDef& def, const SmallVector<TensorPtr>& inputs);

} // namespace subgraph_detail
} // namespace imperative
} // namespace mgb

+ 1
- 1
src/core/impl/graph/cg_impl.cpp View File

@@ -322,7 +322,7 @@ void ComputingGraphImpl::free_varnode_storage(void* ptr) {
m_var_node_pool.free_raw(ptr);
};


OperatorNodeBase* ComputingGraphImpl::insert_opr(
MGE_WIN_DECLSPEC_FUC OperatorNodeBase* ComputingGraphImpl::insert_opr(
std::unique_ptr<OperatorNodeBase> opr_uniqp) {
auto opr = opr_uniqp.get();




+ 4
- 3
src/core/impl/graph/cg_impl.h View File

@@ -148,8 +148,8 @@ class ComputingGraphImpl final : public ComputingGraph {
public:
class ComputingSequence;


ComputingGraphImpl();
~ComputingGraphImpl();
MGE_WIN_DECLSPEC_FUC ComputingGraphImpl();
MGE_WIN_DECLSPEC_FUC ~ComputingGraphImpl();


template <typename T>
static ComputingGraphImpl* downcast(T* ptr) = delete;
@@ -166,7 +166,8 @@ public:
SmallVector<std::unique_ptr<AsyncExecutable>> compile_multi_part(
const SmallVector<OutputSpec>& out_specs) override;


OperatorNodeBase* insert_opr(std::unique_ptr<OperatorNodeBase> opr) override;
MGE_WIN_DECLSPEC_FUC OperatorNodeBase* insert_opr(
std::unique_ptr<OperatorNodeBase> opr) override;


void* alloc_varnode_storage() override;




+ 30
- 1
src/core/impl/graph/var_node.cpp View File

@@ -93,6 +93,23 @@ MemAllocPlan& MemAllocPlan::assign_for_forward(
return *this;
}


MemAllocPlan& MemAllocPlan::force_assign_for_forward(
const MemAllocPlan& src, const SubTensorSpec& sub) {
mgb_assert(valid() && src.valid() && m_layout.eq_shape(sub.layout()));
++(m_chunk = src.m_chunk)->m_refcnt;
m_layout = sub.layout();
// make layout strong-contig
for (int i = static_cast<int>(m_layout.ndim) - 1; i >= 0; --i) {
if (m_layout.shape[i] == 1) {
m_layout.stride[i] = i + 1 < static_cast<int>(m_layout.ndim)
? m_layout.stride[i + 1] * m_layout.shape[i + 1]
: 1;
}
}
m_layout.dtype = dtype();
return *this;
}
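
After a readonly forward the sub-layout may carry arbitrary strides on extent-1 axes; the loop above rewrites those strides so the forwarded layout still compares as contiguous. That stride fix-up in isolation, as a self-contained sketch over plain arrays (make_strong_contig is an illustrative name):

#include <cstddef>

// Rewrite strides of extent-1 dims so an otherwise-contiguous layout also
// looks contiguous, mirroring the loop in force_assign_for_forward above.
void make_strong_contig(std::size_t ndim, const long* shape, long* stride) {
    for (int i = static_cast<int>(ndim) - 1; i >= 0; --i) {
        if (shape[i] == 1) {
            stride[i] = (i + 1 < static_cast<int>(ndim))
                              ? stride[i + 1] * shape[i + 1]
                              : 1;
        }
    }
}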

MemAllocPlan& MemAllocPlan::reset_from_owner_var() {
auto owner_var = m_chunk_storage.owner_var;
m_layout.dtype = dtype();
@@ -223,7 +240,12 @@ VarNode& VarNode::format(TensorFormat format) {


bool VarNode::set_fwd_in2out_readonly(VarNode* input, const SubTensorSpec& sub) {
if (owner_graph()->options().imperative_proxy_graph) {
return false;
if (input->comp_node() != comp_node()) {
return false;
}
m_mem_plan.force_assign_for_forward(input->m_mem_plan, sub);
m_dev_tensor = input->dev_tensor().sub(sub);
return true;
}
return ComputingGraphImpl::downcast(owner_graph())
->var_node_mem_manager()
@@ -361,6 +383,13 @@ VarNode& VarNode::reset_dev_tensor_from_tensor(const DeviceTensorND& value) {
return *this;
}


void VarNode::force_assign_dev_tensor_from_tensor(const DeviceTensorND& value) {
m_dev_tensor = value;
shape(value.shape());
m_mem_plan.reset_from_owner_var().chunk().mem_alloc_status.set_from_owner_var();
m_mem_plan.layout(value.layout());
}

void VarNode::assign_dev_tensor_from_tensor(const DeviceTensorND& value) {
mgb_assert(
(value.layout().is_contiguous() || value.empty()) &&


+ 1
- 1
src/core/impl/tensor.cpp View File

@@ -475,7 +475,7 @@ DEF(CompNode node, const TensorShape& shape, DType dtype, TensorFormat format)
DEF(CompNode node, const TensorLayout& layout)
: TensorND(node, layout, layout.dtype, layout.format) {
mgb_assert(
layout.is_contiguous(),
layout.is_contiguous() || layout.is_empty(),
"non-contiguous layout used for initializing a tensor: %s",
layout.to_string().c_str());
}


+ 2
- 1
src/core/include/megbrain/graph/cg.h View File

@@ -241,7 +241,8 @@ public:
* \return the node in the graph (maybe another node due to
* deduplication)
*/
virtual OperatorNodeBase* insert_opr(std::unique_ptr<OperatorNodeBase> opr) = 0;
MGE_WIN_DECLSPEC_FUC virtual OperatorNodeBase* insert_opr(
std::unique_ptr<OperatorNodeBase> opr) = 0;


/*!
* \brief used by OperatorNodeBase to allocate its outputs


+ 7
- 0
src/core/include/megbrain/graph/var_node.h View File

@@ -194,6 +194,10 @@ public:
MGE_WIN_DECLSPEC_FUC MemAllocPlan& assign_for_forward(
const MemAllocPlan& src, const SubTensorSpec& sub);


//! force assign for readonly forward
MGE_WIN_DECLSPEC_FUC MemAllocPlan& force_assign_for_forward(
const MemAllocPlan& src, const SubTensorSpec& sub);

/*!
* \brief next readonly-forward reader of this MemAllocPlan
*
@@ -509,6 +513,9 @@ public:
//! NO_SYS_MEM_ALLOC can be modified.
MGE_WIN_DECLSPEC_FUC bool is_graph_dest_varnode();


MGE_WIN_DECLSPEC_FUC void force_assign_dev_tensor_from_tensor(
const DeviceTensorND& value);

private:
//! whether its memory should be allocated by mgb system during graph
//! execution; initialized in VarNodeMemManager::reset_opr_seq()


+ 13
- 11
src/opr/include/megbrain/opr/io.h View File

@@ -24,7 +24,7 @@ namespace intl {
* \brief base class for IO nodes between device and host
*/
class HostIONodeBase : public cg::SingleCNOperatorNodeBase {
void init_output_static_infer_desc() override final;
MGE_WIN_DECLSPEC_FUC void init_output_static_infer_desc() override final;


protected:
using cg::SingleCNOperatorNodeBase::SingleCNOperatorNodeBase;
@@ -32,9 +32,10 @@ protected:
/*!
* \brief src_type for static shape and value infer
*/
virtual cg::static_infer::SourceType static_infer_src_type() const;
MGE_WIN_DECLSPEC_FUC virtual cg::static_infer::SourceType static_infer_src_type()
const;


virtual const TensorShape& get_output_shape() = 0;
MGE_WIN_DECLSPEC_FUC virtual const TensorShape& get_output_shape() = 0;


/*!
* \brief fill value in *dest* for static inference
@@ -52,10 +53,10 @@ protected:
class DeviceTensorHolder : public HostIONodeBase {
class DevValueExecDep;


void init_output_format() override;
void init_output_mem_plan(bool dynamic) override final;
void scn_do_execute() override final;
void record_execute_deps(ExecDependencyArray& deps) override;
MGE_WIN_DECLSPEC_FUC void init_output_format() override;
MGE_WIN_DECLSPEC_FUC void init_output_mem_plan(bool dynamic) override final;
MGE_WIN_DECLSPEC_FUC void scn_do_execute() override final;
MGE_WIN_DECLSPEC_FUC void record_execute_deps(ExecDependencyArray& deps) override;


protected:
using HostIONodeBase::HostIONodeBase;
@@ -77,20 +78,20 @@ MGB_DEFINE_CLS_WITH_SUPER(SharedDeviceTensorBase, DeviceTensorHolder) // {
std::shared_ptr<DeviceTensorND> m_dev_data;
bool m_const_value;


const TensorShape& get_output_shape() override;
MGE_WIN_DECLSPEC_FUC const TensorShape& get_output_shape() override;


bool fill_in_static_infer(DeviceTensorND* dest) override {
MGB_MARK_USED_VAR(dest);
return false;
}


void init_output_comp_node() override;
MGE_WIN_DECLSPEC_FUC void init_output_comp_node() override;


public:
//! const_value marks whether the device value of this operator should
//! be treated as constant during graph execution. Should be false in
//! most cases.
SharedDeviceTensorBase(
MGE_WIN_DECLSPEC_FUC SharedDeviceTensorBase(
ComputingGraph& graph, const std::shared_ptr<DeviceTensorND>& dev_data,
bool const_value, const OperatorNodeConfig& config);


@@ -248,7 +249,8 @@ private:
*/
MGB_DEFINE_OPR_CLASS_WITH_EXPORT(
SharedDeviceTensor, intl::SharedDeviceTensorBase) // {
cg::static_infer::SourceType static_infer_src_type() const override;
MGE_WIN_DECLSPEC_FUC cg::static_infer::SourceType static_infer_src_type()
const override;


public:
using Super::Super;

