GitOrigin-RevId: 8ef62baf79
release-1.6
@@ -166,6 +166,13 @@ void aarch64::RelayoutForwardImpl::exec(_megdnn_tensor_in src0,
    TensorND src = src0, dst = dst0;
    check_layout_and_canonize(src.layout, dst.layout);
    // FIXME: optimize for lowbit cases
    if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS4 ||
        src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
        fallback::RelayoutForwardImpl::exec(src0, dst0, src_handle);
        return;
    }
    relayout::TransposeParam trans_param;
    bool trans = relayout::is_transpose(src.layout, dst.layout, trans_param);
    if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 1) {
@@ -134,6 +134,13 @@ void armv7::RelayoutForwardImpl::exec(_megdnn_tensor_in src0,
    TensorND src = src0, dst = dst0;
    check_layout_and_canonize(src.layout, dst.layout);
    // FIXME: optimize for lowbit cases
    if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS4 ||
        src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
        fallback::RelayoutForwardImpl::exec(src0, dst0, src_handle);
        return;
    }
    relayout::TransposeParam trans_param;
    bool trans = relayout::is_transpose(src.layout, dst.layout, trans_param);
    if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 1) {
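// Both hunks above add the same guard: QuantizedS4 and Quantized4Asymm pack
// two 4-bit elements into one byte, so the ARM transpose fast path (which
// assumes one byte per element) must not see them; the generic fallback
// implementation handles the relayout instead. A minimal sketch of the
// shared predicate (hypothetical helper, not part of this patch):
#if 0
static inline bool is_4bit_quantized(const megdnn::DType& dtype) {
    using megdnn::DTypeEnum;
    // two elements share one byte, so byte-wise transpose kernels break
    return dtype.enumv() == DTypeEnum::QuantizedS4 ||
           dtype.enumv() == DTypeEnum::Quantized4Asymm;
}
#endif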
@@ -0,0 +1,313 @@
/**
 * \file src/gopt/impl/opr_format_modifier.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#include "./opr_format_modifier.h" | |||
#include "megbrain/opr/dnn/convolution.h" | |||
#include "megbrain/opr/dnn/pooling.h" | |||
#include "megbrain/opr/imgproc.h" | |||
#include "megbrain/opr/io.h" | |||
#include "megbrain/serialization/sereg.h" | |||
#include "midout.h" | |||
MIDOUT_DECL(megbrain_opr_format_modifier) | |||
#define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_format_modifier, __VA_ARGS__) { | |||
#define MIDOUT_E \ | |||
} \ | |||
MIDOUT_END(); | |||
using namespace mgb; | |||
using namespace opr; | |||
namespace { | |||
template <class MegDNNConv = megdnn::Convolution> | |||
struct MakeConvCaller2 { | |||
template <typename Opr> | |||
static VarNode* make(const cg::VarNodeArray& inputs, | |||
const typename MegDNNConv::Param& param, | |||
const megdnn::param::ExecutionPolicy& execution_policy, | |||
const OperatorNodeConfig& config) { | |||
if (inputs.size() == 2) { | |||
return Opr::make(inputs[0], inputs[1], param, execution_policy, | |||
config) | |||
.node(); | |||
} | |||
return nullptr; | |||
} | |||
}; | |||
template <class MegDNNConv = megdnn::Convolution> | |||
struct MakeConvCaller3 { | |||
template <typename Opr> | |||
static VarNode* make(const cg::VarNodeArray& inputs, | |||
const typename MegDNNConv::Param& param, | |||
const megdnn::param::ExecutionPolicy& execution_policy, | |||
const OperatorNodeConfig& config) { | |||
if (inputs.size() == 3) { | |||
return Opr::make(inputs[0], inputs[1], inputs[2], param, | |||
execution_policy, config) | |||
.node(); | |||
} | |||
return nullptr; | |||
} | |||
}; | |||
template <class MegDNNConv = megdnn::Convolution> | |||
struct MakeConvCaller4 { | |||
template <typename Opr> | |||
static VarNode* make(const cg::VarNodeArray& inputs, | |||
const typename MegDNNConv::Param& param, | |||
const megdnn::param::ExecutionPolicy& execution_policy, | |||
const OperatorNodeConfig& config) { | |||
if (inputs.size() == 4) { | |||
return Opr::make(inputs[0], inputs[1], inputs[2], inputs[3], param, | |||
execution_policy, config) | |||
.node(); | |||
} | |||
return nullptr; | |||
} | |||
}; | |||
template <class MegDNNConv = megdnn::Convolution> | |||
struct MakeConvCaller5 { | |||
template <typename Opr> | |||
static VarNode* make(const cg::VarNodeArray& inputs, | |||
const typename MegDNNConv::Param& param, | |||
const megdnn::param::ExecutionPolicy& execution_policy, | |||
const OperatorNodeConfig& config) { | |||
if (inputs.size() == 5) { | |||
return Opr::make(inputs[0], inputs[1], inputs[2], inputs[3], | |||
inputs[4], param, execution_policy, config) | |||
.node(); | |||
} | |||
return nullptr; | |||
} | |||
}; | |||
template <class MegDNNConv = megdnn::Convolution> | |||
struct MakeConvCallerEmpty { | |||
template <typename Opr> | |||
static VarNode* make(const cg::VarNodeArray&, | |||
const typename MegDNNConv::Param&, | |||
const megdnn::param::ExecutionPolicy&, | |||
const OperatorNodeConfig&) { | |||
return nullptr; | |||
} | |||
}; | |||
template <class Opr, class Maker0, class MegDNNConv, | |||
class Maker1 = MakeConvCallerEmpty<MegDNNConv>, | |||
class Maker2 = MakeConvCallerEmpty<MegDNNConv>, | |||
typename ConvParam = megdnn::param::Convolution> | |||
struct ConvMakerImpl { | |||
static VarNode* make(const cg::VarNodeArray& inputs, const ConvParam& param, | |||
const megdnn::param::ExecutionPolicy& execution_policy, | |||
const OperatorNodeConfig& config) { | |||
VarNode* ret = Maker0::template make<Opr>(inputs, param, | |||
execution_policy, config); | |||
if (!ret) { | |||
ret = Maker1::template make<Opr>(inputs, param, execution_policy, | |||
config); | |||
} | |||
if (!ret) { | |||
ret = Maker2::template make<Opr>(inputs, param, execution_policy, | |||
config); | |||
} | |||
mgb_assert(ret); | |||
return ret; | |||
} | |||
}; | |||
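// The callers above form a chain of responsibility over input arity: each
// MakeConvCallerN returns nullptr unless inputs.size() == N, and
// ConvMakerImpl tries Maker0, Maker1, Maker2 in turn, asserting if none of
// them accepts the input count. A hedged usage sketch (variable names are
// assumptions):
#if 0
// Rebuild a conv-bias with 2, 3 or 4 inputs (src, filter[, bias[, z]]):
VarNode* out = ConvMaker<opr::ConvBiasForward>::make(
        inputs, param, execution_policy, config);
#endif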
template <typename Opr>
struct ConvMaker;

template <>
struct ConvMaker<opr::Convolution>
        : public ConvMakerImpl<opr::Convolution,
                               MakeConvCaller2<megdnn::Convolution>,
                               megdnn::Convolution> {};

template <>
struct ConvMaker<opr::ConvolutionBackwardData>
        : public ConvMakerImpl<opr::ConvolutionBackwardData,
                               MakeConvCaller2<megdnn::Convolution>,
                               megdnn::Convolution,
                               MakeConvCaller3<megdnn::Convolution>> {};

template <>
struct ConvMaker<opr::ConvBiasForward>
        : public ConvMakerImpl<opr::ConvBiasForward,
                               MakeConvCaller2<megdnn::ConvBiasForward>,
                               megdnn::ConvBiasForward,
                               MakeConvCaller3<megdnn::ConvBiasForward>,
                               MakeConvCaller4<megdnn::ConvBiasForward>,
                               megdnn::param::ConvBias> {};

template <>
struct ConvMaker<opr::BatchConvBiasForward>
        : public ConvMakerImpl<opr::BatchConvBiasForward,
                               MakeConvCaller2<megdnn::BatchConvBiasForward>,
                               megdnn::BatchConvBiasForward,
                               MakeConvCaller3<megdnn::BatchConvBiasForward>,
                               MakeConvCaller4<megdnn::BatchConvBiasForward>,
                               megdnn::param::BatchConvBias> {};

#if 0
#include "../../opr/impl/internal/invoke.h"
template <typename Opr>
struct MultiAlgoOprTrait;

#define APPLY(statement, ...)                                  \
    mgb::apply([&](const auto&... args) { return statement; }, \
               std::tuple_cat(__VA_ARGS__))

#define INST(_Opr)                                                          \
    template <>                                                             \
    struct MultiAlgoOprTrait<_Opr> {                                        \
        static constexpr bool has_algo = true;                              \
        using MegDNNOpr = megdnn::_Opr;                                     \
        static constexpr int arity = OprArityTrait<MegDNNOpr>::arity;       \
        using FixedTensorLayouts = std::array<TensorLayout, arity>;         \
        static bool has_available_algo(const VarNodeArray& i,               \
                                       const cg::OperatorNodeBase* opr_) {  \
            MIDOUT_B(midout_iv(MGB_HASH_STR(#_Opr)),                        \
                     midout_iv(MGB_HASH_STR("has_available_algo")))         \
            auto&& opr = opr_->cast_final_safe<_Opr>();                     \
            auto&& megdnn_opr =                                             \
                    reinterpret_cast<MegDNNOpr*>(opr.megdnn_opr());         \
            FixedTensorLayouts array_layouts;                               \
            size_t in = i.size() - 1;                                       \
            for (size_t idx = 0; idx < in; idx++) {                         \
                const auto& v = i[idx];                                     \
                array_layouts[idx] =                                        \
                        TensorLayout{v->shape(), v->dtype(), v->format()};  \
            }                                                               \
            const auto& v = i[in];                                          \
            array_layouts[arity - 1] =                                      \
                    TensorLayout{v->shape(), v->dtype(), v->format()};      \
            return APPLY(::megdnn::has_available_algo(megdnn_opr, args...), \
                         array_layouts);                                    \
            MIDOUT_E                                                        \
        }                                                                   \
    };
INST(Convolution)
INST(ConvBiasForward)
INST(ConvolutionBackwardData)
INST(PoolingForward)
#undef APPLY
#undef INST
#endif
}  // namespace

namespace mgb {
namespace gopt {
namespace intl {

template <typename Opr>
struct OprFormatModifier;

#define INST(_Opr)                                                            \
    template <>                                                               \
    struct OprFormatModifier<_Opr> {                                          \
        using OprFormat = typename _Opr::Param::Format;                       \
        static VarNode* make(OprFormat opr_format, const VarNodeArray& i,     \
                             const cg::OperatorNodeBase* opr_) {              \
            MIDOUT_B(_Opr)                                                    \
            auto&& opr = opr_->cast_final_safe<_Opr>();                       \
            auto param = opr.param();                                         \
            param.format = opr_format;                                        \
            return ConvMaker<_Opr>::make(i, param, opr.execution_policy(),    \
                                         opr.config());                       \
            MIDOUT_E                                                          \
        }                                                                     \
    };
INST(Convolution);
INST(ConvBiasForward);
INST(ConvolutionBackwardData);
INST(BatchConvBiasForward);
#undef INST

template <>
struct OprFormatModifier<WarpPerspective> {
    using Opr = opr::WarpPerspective;
    using OprFormat = typename Opr::Param::Format;
    static VarNode* make(OprFormat opr_format, const VarNodeArray& i,
                         const cg::OperatorNodeBase* opr_) {
        MIDOUT_B(Opr)
        auto&& opr = opr_->cast_final_safe<Opr>();
        auto param = opr.param();
        param.format = opr_format;
        if (i.size() == 3) {
            return Opr::make(i[0], i[1], i[2], param, opr.config()).node();
        } else {
            mgb_assert(i.size() == 4);
            return Opr::make(i[0], i[1], i[2], i[3], param, opr.config())
                    .node();
        }
        MIDOUT_E
    }
};

#define INST(_Opr, _arity)                                                \
    template <>                                                           \
    struct OprFormatModifier<_Opr> {                                      \
        using OprFormat = typename _Opr::Param::Format;                   \
        static VarNode* make(OprFormat opr_format, const VarNodeArray& i, \
                             const cg::OperatorNodeBase* opr_) {          \
            MIDOUT_B(_Opr)                                                \
            auto&& opr = opr_->cast_final_safe<_Opr>();                   \
            auto param = opr.param();                                     \
            param.format = opr_format;                                    \
            return serialization::OprMaker<_Opr, _arity>::make(           \
                           param, i, *i[0]->owner_graph(), opr.config())  \
                    ->output(0);                                          \
            MIDOUT_E                                                      \
        }                                                                 \
    };
INST(PoolingForward, 1);
INST(Resize, 2);
#undef INST

VarNode* modify_opr_format(opr::ConvBias::Param::Format opr_format,
                           const VarNodeArray& i,
                           const cg::OperatorNodeBase* opr) {
#define cb(_Opr)                                                  \
    if (opr->dyn_typeinfo() == _Opr::typeinfo()) {                \
        return OprFormatModifier<_Opr>::make(opr_format, i, opr); \
    } else
    FOREACH_FORMAT_AWARE_OPR(cb) {
        mgb_throw(InternalError, "invalid format aware operator(got:%s)",
                  opr->dyn_typeinfo()->name);
    }
#undef cb
}
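// modify_opr_format rebuilds a format-aware operator with identical inputs
// and parameters except for Param::Format. Illustrative call (the
// surrounding variable names are assumptions):
#if 0
// re-create `opr` so that it computes in NCHW4; `new_inps` holds the
// already reformatted input var nodes
VarNode* y = mgb::gopt::intl::modify_opr_format(
        opr::ConvBias::Param::Format::NCHW4, new_inps, opr);
#endif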
#if 0
bool has_available_algo(const VarNodeArray& i,
                        const cg::OperatorNodeBase* opr) {
#define cb(_Opr)                                                    \
    if (opr->dyn_typeinfo() == _Opr::typeinfo()) {                  \
        MGB_MARK_USED_VAR(MultiAlgoOprTrait<_Opr>::has_algo);       \
        VarNodeArray _ = i;                                         \
        _.emplace_back(opr->output(0));                             \
        return MultiAlgoOprTrait<_Opr>::has_available_algo(_, opr); \
    } else
    cb(Convolution) cb(ConvBiasForward) cb(ConvolutionBackwardData)
            cb(PoolingForward) {
        mgb_throw(InternalError, "invalid multi-algo operator(got:%s)",
                  opr->dyn_typeinfo()->name);
    }
}
#endif
}  // namespace intl
}  // namespace gopt
}  // namespace mgb

// vim: syntax=cpp.doxygen
@@ -0,0 +1,36 @@
/**
 * \file src/gopt/impl/opr_format_modifier.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#pragma once
#include "megbrain/graph.h"
#include "megbrain/opr/dnn/convolution.h"

namespace mgb {
namespace gopt {
namespace intl {

#define FOREACH_FORMAT_AWARE_OPR(cb)                                \
    cb(Convolution) cb(ConvBiasForward) cb(ConvolutionBackwardData) \
            cb(PoolingForward) cb(WarpPerspective) cb(Resize)
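// For reference, FOREACH_FORMAT_AWARE_OPR(cb) expands to
//     cb(Convolution) cb(ConvBiasForward) cb(ConvolutionBackwardData)
//     cb(PoolingForward) cb(WarpPerspective) cb(Resize)
// so a caller such as modify_opr_format stamps out one if-branch per
// format-aware operator type.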
#if 0
bool has_available_algo(const VarNodeArray& i, const cg::OperatorNodeBase* opr);
#endif

VarNode* modify_opr_format(opr::ConvBias::Param::Format opr_format,
                           const VarNodeArray& i,
                           const cg::OperatorNodeBase* opr);

}  // namespace intl
}  // namespace gopt
}  // namespace mgb

// vim: syntax=cpp.doxygen
@@ -0,0 +1,582 @@
/**
 * \file src/gopt/impl/opr_tensor_formats_config.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#include "./utils.h" | |||
#include "megbrain/gopt/global_layout_transform.h" | |||
#include "megbrain/opr/dnn/pooling.h" | |||
#include "megbrain/opr/imgproc.h" | |||
#include "midout.h" | |||
MIDOUT_DECL(megbrain_opr_tensor_formats_config) | |||
#define MIDOUT_B(...) \ | |||
MIDOUT_BEGIN(megbrain_opr_tensor_formats_config, __VA_ARGS__) { | |||
#define MIDOUT_E \ | |||
} \ | |||
MIDOUT_END(); | |||
using namespace mgb; | |||
using namespace cg; | |||
using namespace gopt; | |||
using OprFormat = opr::ConvBias::Param::Format; | |||
namespace { | |||
template <typename Opr> | |||
struct ConvParamTrait; | |||
#define INST(_conv, _weight_idx, _bias_idx, _has_bias) \ | |||
template <> \ | |||
struct ConvParamTrait<opr::_conv> { \ | |||
static constexpr int weight_idx = _weight_idx; \ | |||
static constexpr int bias_idx = _bias_idx; \ | |||
static constexpr bool has_bias = _has_bias; \ | |||
} | |||
INST(ConvBias, 1, 2, true); | |||
INST(ConvolutionForward, 1, 0, false); | |||
INST(ConvolutionBackwardData, 0, 0, false); | |||
template <typename Opr, size_t weight_idx = ConvParamTrait<Opr>::weight_idx> | |||
static bool is_channel_wise_conv(const OperatorNodeBase* opr) { | |||
MGB_MARK_USED_VAR(ConvParamTrait<Opr>::has_bias); | |||
MGB_MARK_USED_VAR(ConvParamTrait<Opr>::bias_idx); | |||
auto&& conv = opr->cast_final_safe<Opr>(); | |||
auto format = conv.param().format; | |||
auto weight = opr->input(weight_idx); | |||
auto weight_shp = weight->shape(); | |||
if (conv.param().sparse == Opr::Param::Sparse::DENSE) | |||
return false; | |||
size_t ocpg, icpg; | |||
if (format == Opr::Param::Format::NCHW) { | |||
ocpg = weight_shp[1], icpg = weight_shp[2]; | |||
return ocpg == 1 && icpg == 1; | |||
} | |||
return false; | |||
} | |||
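// For a grouped convolution in NCHW, the weight tensor is laid out as
// (G, OCPG, ICPG, FH, FW); channel-wise (depthwise) convolution is the
// special case OCPG == ICPG == 1, which is what the predicate above checks
// via weight_shp[1] and weight_shp[2]. Illustrative shapes (values are
// assumptions):
#if 0
TensorShape channel_wise_weight{32, 1, 1, 3, 3};    // 32 groups, ocpg==icpg==1
TensorShape ordinary_group_weight{4, 16, 8, 3, 3};  // not channel-wise
#endif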
template <OprFormat opr_format_>
struct OprSingleInOutTensorFormatsDispatcherImpl;

template <>
struct OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::NCHW> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        OprTensorFormatsConfiguration config;
        config.typeinfo = opr->dyn_typeinfo();
        config.opr_format = OprFormat::NCHW;
        config.input_dtypes = {opr->input(0)->dtype().enumv()};
        config.input_tensor_types = {TensorType::FEATURE};
        config.output_dtypes = {opr->output(0)->dtype().enumv()};
        config.input_tensor_formats = {TensorFormats::NCHW};
        config.output_tensor_formats = {TensorFormats::NCHW};
        return config;
    }
};

template <>
struct OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::NCHW4> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        OprTensorFormatsConfiguration config;
        config.typeinfo = opr->dyn_typeinfo();
        config.opr_format = OprFormat::NCHW4;
        bool available = true;
        available &= opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
        config.input_dtypes = {opr->input(0)->dtype().enumv()};
        config.input_tensor_types = {TensorType::FEATURE};
        available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
        config.output_dtypes = {opr->output(0)->dtype().enumv()};
        config.input_tensor_formats = {TensorFormats::NCHWc4};
        config.output_tensor_formats = {TensorFormats::NCHWc4};
        if (available)
            return config;
        return None;
    }
};

template <>
struct OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::CHWN4> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        OprTensorFormatsConfiguration config;
        config.typeinfo = opr->dyn_typeinfo();
        config.opr_format = OprFormat::CHWN4;
        bool available = true;
        available &= opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
        config.input_dtypes = {opr->input(0)->dtype().enumv()};
        config.input_tensor_types = {TensorType::FEATURE};
        available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
        config.output_dtypes = {opr->output(0)->dtype().enumv()};
        config.input_tensor_formats = {TensorFormats::CHWNc4};
        config.output_tensor_formats = {TensorFormats::CHWNc4};
        if (available)
            return config;
        return None;
    }
};

template <>
struct OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::NCHW32> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        OprTensorFormatsConfiguration config;
        config.typeinfo = opr->dyn_typeinfo();
        config.opr_format = OprFormat::NCHW32;
        bool available = true;
        available &= opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
        config.input_dtypes = {opr->input(0)->dtype().enumv()};
        config.input_tensor_types = {TensorType::FEATURE};
        available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
        config.output_dtypes = {opr->output(0)->dtype().enumv()};
        config.input_tensor_formats = {TensorFormats::NCHWc32};
        config.output_tensor_formats = {TensorFormats::NCHWc32};
        if (available)
            return config;
        return None;
    }
};

template <>
struct OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::NHWC> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        OprTensorFormatsConfiguration config;
        config.typeinfo = opr->dyn_typeinfo();
        config.opr_format = OprFormat::NHWC;
        bool available = true;
        available &=
                opr->input(0)->dtype().enumv() == DTypeEnum::Quantized4Asymm ||
                opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS4;
        config.input_dtypes = {opr->input(0)->dtype().enumv()};
        config.input_tensor_types = {TensorType::FEATURE};
        available &= opr->output(0)->dtype().enumv() ==
                     opr->input(0)->dtype().enumv();
        config.output_dtypes = {opr->output(0)->dtype().enumv()};
        config.input_tensor_formats = {TensorFormats::NHWC};
        config.output_tensor_formats = {TensorFormats::NHWC};
        if (available)
            return config;
        return None;
    }
};

template <>
struct OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::NCHW64> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        OprTensorFormatsConfiguration config;
        config.typeinfo = opr->dyn_typeinfo();
        config.opr_format = OprFormat::NCHW64;
        bool available = true;
        available &=
                opr->input(0)->dtype().enumv() == DTypeEnum::Quantized4Asymm ||
                opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS4;
        config.input_dtypes = {opr->input(0)->dtype().enumv()};
        config.input_tensor_types = {TensorType::FEATURE};
        available &= opr->output(0)->dtype().enumv() ==
                     opr->input(0)->dtype().enumv();
        config.output_dtypes = {opr->output(0)->dtype().enumv()};
        config.input_tensor_formats = {TensorFormats::NCHWc64};
        config.output_tensor_formats = {TensorFormats::NCHWc64};
        if (available)
            return config;
        return None;
    }
};
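// Each dispatcher either returns a fully populated configuration or None
// when the operator's dtypes rule the format out, so callers can probe a
// format without special-casing operator kinds. A hedged usage sketch
// (use_config is a hypothetical consumer):
#if 0
auto maybe_config =
        OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::NCHW4>::dispatch(
                opr);
if (maybe_config.valid()) {
    // NCHW4 is admissible: the opr consumes and produces QuantizedS8
    use_config(maybe_config.val());
}
#endif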
template <typename Opr, OprFormat opr_format_>
struct ConvTensorFormatsDispatcherImpl;

template <typename Opr>
struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::NCHW> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        const auto& conv = opr->cast_final_safe<Opr>();
        OprTensorFormatsConfiguration config;
        config.typeinfo = opr->dyn_typeinfo();
        config.opr_format = OprFormat::NCHW;
        // setup dtypes
        for (size_t i = 0; i < opr->input().size(); ++i) {
            config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv());
            TensorType tensor_type =
                    i == 1 ? TensorType::WEIGHT : TensorType::FEATURE;
            config.input_tensor_types.emplace_back(tensor_type);
        }
        config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
        // setup tensor formats
        if (conv.param().sparse == Opr::Param::Sparse::DENSE) {
            config.input_tensor_formats = {
                    TensorFormats::NCHW, TensorFormats::NCHW,
                    TensorFormats::NCHW, TensorFormats::NCHW};
        } else {
            mgb_assert(conv.param().sparse == Opr::Param::Sparse::GROUP);
            if (is_channel_wise_conv<Opr>(opr)) {
                config.input_tensor_formats = {
                        TensorFormats::NCHW, TensorFormats::C11RS,
                        TensorFormats::NCHW, TensorFormats::NCHW};
            } else {
                config.input_tensor_formats = {
                        TensorFormats::NCHW, TensorFormats::GKCRS,
                        TensorFormats::NCHW, TensorFormats::NCHW};
            }
        }
        config.output_tensor_formats = {TensorFormats::NCHW};
        return config;
    }
};

template <typename Opr>
struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::NHWC> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        const auto& conv = opr->cast_final_safe<Opr>();
        OprTensorFormatsConfiguration config;
        config.typeinfo = opr->dyn_typeinfo();
        config.opr_format = OprFormat::NHWC;
        bool available = true;
        for (size_t i = 0; i < opr->input().size(); ++i) {
            if (i == 2)
                available &= opr->input(i)->dtype().enumv() ==
                             DTypeEnum::QuantizedS32;
            else
                available &= opr->input(i)->dtype().enumv() ==
                                     DTypeEnum::Quantized4Asymm ||
                             opr->input(i)->dtype().enumv() ==
                                     DTypeEnum::QuantizedS4;
            config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv());
            TensorType tensor_type =
                    i == 1 ? TensorType::WEIGHT : TensorType::FEATURE;
            config.input_tensor_types.emplace_back(tensor_type);
        }
        available &=
                opr->output(0)->dtype().enumv() == DTypeEnum::Quantized4Asymm ||
                opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS4;
        config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
        available &= conv.param().sparse == Opr::Param::Sparse::DENSE;
        config.input_tensor_formats = {TensorFormats::NHWC, TensorFormats::NHWC,
                                       TensorFormats::NHWC,
                                       TensorFormats::NHWC};
        config.output_tensor_formats = {TensorFormats::NHWC};
        if (available)
            return config;
        return None;
    }
};

template <typename Opr>
struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::NCHW4> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        const auto& conv = opr->cast_final_safe<Opr>();
        OprTensorFormatsConfiguration config;
        config.typeinfo = opr->dyn_typeinfo();
        config.opr_format = OprFormat::NCHW4;
        bool available = true;
        // setup dtypes
        for (size_t i = 0; i < opr->input().size(); ++i) {
            if (i == 2)
                available &= opr->input(i)->dtype().enumv() ==
                             DTypeEnum::QuantizedS32;
            else
                available &= opr->input(i)->dtype().enumv() ==
                             DTypeEnum::QuantizedS8;
            config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv());
            TensorType tensor_type =
                    i == 1 ? TensorType::WEIGHT : TensorType::FEATURE;
            config.input_tensor_types.emplace_back(tensor_type);
        }
        available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
        config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
        // setup tensor formats
        if (conv.param().sparse == Opr::Param::Sparse::DENSE) {
            config.input_tensor_formats = {
                    TensorFormats::NCHWc4, TensorFormats::NCHWc4,
                    TensorFormats::NCHWc4, TensorFormats::NCHWc4};
        } else {
            mgb_assert(conv.param().sparse == Opr::Param::Sparse::GROUP);
            if (is_channel_wise_conv<Opr>(opr)) {
                config.input_tensor_formats = {
                        TensorFormats::NCHWc4, TensorFormats::C11RSc4,
                        TensorFormats::NCHWc4, TensorFormats::NCHWc4};
            } else {
                config.input_tensor_formats = {
                        TensorFormats::NCHWc4, TensorFormats::GKCRSc4,
                        TensorFormats::NCHWc4, TensorFormats::NCHWc4};
            }
        }
        config.output_tensor_formats = {TensorFormats::NCHWc4};
        if (available)
            return config;
        return None;
    }
};
template <typename Opr>
struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::NCHW32> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        const auto& conv = opr->cast_final_safe<Opr>();
        OprTensorFormatsConfiguration config;
        config.typeinfo = opr->dyn_typeinfo();
        config.opr_format = OprFormat::NCHW32;
        bool available = true;
        for (size_t i = 0; i < opr->input().size(); ++i) {
            if (i == 2)
                available &= opr->input(i)->dtype().enumv() ==
                             DTypeEnum::QuantizedS32;
            else
                available &= opr->input(i)->dtype().enumv() ==
                             DTypeEnum::QuantizedS8;
            config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv());
            TensorType tensor_type =
                    i == 1 ? TensorType::WEIGHT : TensorType::FEATURE;
            config.input_tensor_types.emplace_back(tensor_type);
        }
        available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
        config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
        available &= conv.param().sparse == Opr::Param::Sparse::DENSE;
        config.input_tensor_formats = {
                TensorFormats::NCHWc32, TensorFormats::NCHWc32,
                TensorFormats::NCHWc32, TensorFormats::NCHWc32};
        config.output_tensor_formats = {TensorFormats::NCHWc32};
        if (available)
            return config;
        return None;
    }
};

template <typename Opr>
struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::NCHW64> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        const auto& conv = opr->cast_final_safe<Opr>();
        OprTensorFormatsConfiguration config;
        config.typeinfo = opr->dyn_typeinfo();
        config.opr_format = OprFormat::NCHW64;
        bool available = true;
        for (size_t i = 0; i < opr->input().size(); ++i) {
            if (i == 2)
                available &= opr->input(i)->dtype().enumv() ==
                             DTypeEnum::QuantizedS32;
            else
                available &= opr->input(i)->dtype().enumv() ==
                                     DTypeEnum::Quantized4Asymm ||
                             opr->input(i)->dtype().enumv() ==
                                     DTypeEnum::QuantizedS4;
            config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv());
            TensorType tensor_type =
                    i == 1 ? TensorType::WEIGHT : TensorType::FEATURE;
            config.input_tensor_types.emplace_back(tensor_type);
        }
        available &=
                opr->output(0)->dtype().enumv() == DTypeEnum::Quantized4Asymm ||
                opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS4;
        config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
        available &= conv.param().sparse == Opr::Param::Sparse::DENSE;
        config.input_tensor_formats = {
                TensorFormats::NCHWc64, TensorFormats::NCHWc64,
                TensorFormats::NCHWc64, TensorFormats::NCHWc64};
        config.output_tensor_formats = {TensorFormats::NCHWc64};
        if (available)
            return config;
        return None;
    }
};

template <typename Opr>
struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::CHWN4> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        const auto& conv = opr->cast_final_safe<Opr>();
        OprTensorFormatsConfiguration config;
        config.typeinfo = opr->dyn_typeinfo();
        config.opr_format = OprFormat::CHWN4;
        bool available = true;
        for (size_t i = 0; i < opr->input().size(); ++i) {
            if (i == 2)
                available &= opr->input(i)->dtype().enumv() ==
                             DTypeEnum::QuantizedS32;
            else
                available &= opr->input(i)->dtype().enumv() ==
                             DTypeEnum::QuantizedS8;
            config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv());
            TensorType tensor_type =
                    i == 1 ? TensorType::WEIGHT : TensorType::FEATURE;
            config.input_tensor_types.emplace_back(tensor_type);
        }
        available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
        config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
        available &= conv.param().sparse == Opr::Param::Sparse::DENSE;
        config.input_tensor_formats = {
                TensorFormats::CHWNc4, TensorFormats::CHWNc4,
                TensorFormats::CHWNc4, TensorFormats::CHWNc4};
        config.output_tensor_formats = {TensorFormats::CHWNc4};
        if (available)
            return config;
        return None;
    }
};
template <>
struct ConvTensorFormatsDispatcherImpl<opr::ConvolutionBackwardData,
                                       OprFormat::NCHW> {
    using Opr = opr::ConvolutionBackwardData;
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        const auto& conv = opr->cast_final_safe<Opr>();
        OprTensorFormatsConfiguration config;
        config.typeinfo = opr->dyn_typeinfo();
        config.opr_format = OprFormat::NCHW;
        // setup dtypes
        for (size_t i = 0; i < opr->input().size(); ++i) {
            config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv());
            TensorType tensor_type =
                    i == 0 ? TensorType::WEIGHT : TensorType::FEATURE;
            config.input_tensor_types.emplace_back(tensor_type);
        }
        config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
        // setup tensor formats
        if (conv.param().sparse == Opr::Param::Sparse::DENSE) {
            config.input_tensor_formats = {
                    TensorFormats::NCHW, TensorFormats::NCHW,
                    TensorFormats::NCHW, TensorFormats::NCHW};
        } else {
            mgb_assert(conv.param().sparse == Opr::Param::Sparse::GROUP);
            if (is_channel_wise_conv<Opr>(opr)) {
                config.input_tensor_formats = {
                        TensorFormats::C11RS, TensorFormats::NCHW,
                        TensorFormats::NCHW, TensorFormats::NCHW};
            } else {
                config.input_tensor_formats = {
                        TensorFormats::GKCRS, TensorFormats::NCHW,
                        TensorFormats::NCHW, TensorFormats::NCHW};
            }
        }
        config.output_tensor_formats = {TensorFormats::NCHW};
        return config;
    }
};

template <>
struct ConvTensorFormatsDispatcherImpl<opr::ConvolutionBackwardData,
                                       OprFormat::NCHW4> {
    using Opr = opr::ConvolutionBackwardData;
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        const auto& conv = opr->cast_final_safe<Opr>();
        OprTensorFormatsConfiguration config;
        config.typeinfo = opr->dyn_typeinfo();
        config.opr_format = OprFormat::NCHW4;
        bool available = true;
        for (size_t i = 0; i < opr->input().size(); ++i) {
            available &=
                    opr->input(i)->dtype().enumv() == DTypeEnum::QuantizedS8;
            config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv());
            TensorType tensor_type =
                    i == 0 ? TensorType::WEIGHT : TensorType::FEATURE;
            config.input_tensor_types.emplace_back(tensor_type);
        }
        available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
        config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
        available &= conv.param().sparse == opr::ConvBias::Param::Sparse::DENSE;
        config.input_tensor_formats = {
                TensorFormats::NCHWc4, TensorFormats::NCHWc4,
                TensorFormats::NCHWc4, TensorFormats::NCHWc4};
        config.output_tensor_formats = {TensorFormats::NCHWc4};
        if (available)
            return config;
        return None;
    }
};
struct StaticData {
    struct KeyHash {
        size_t operator()(const std::pair<Typeinfo*, OprFormat>& val) const {
            size_t h1 = mgb::hash<Typeinfo*>(val.first);
            size_t h2 =
                    std::hash<uint32_t>()(static_cast<uint32_t>(val.second));
            return mgb::hash_pair_combine(h1, h2);
        }
    };
    using OprTensorFormatsDispatcher =
            OprTensorFormatsConfiguration::OprTensorFormatsDispatcher;
    std::unordered_map<std::pair<Typeinfo*, OprFormat>,
                       OprTensorFormatsDispatcher, KeyHash>
            typefmt2dispatcher;
    StaticData();
};

StaticData::StaticData() {
#define OPR_TENSOR_FORMATS_CONFIG_REG(_Opr, _fmt)                  \
    typefmt2dispatcher[{opr::_Opr::typeinfo(), OprFormat::_fmt}] = \
            [](const OperatorNodeBase* opr) {                      \
                MIDOUT_B(opr::_Opr, midout_iv(OprFormat::_fmt))    \
                return ConvTensorFormatsDispatcherImpl<            \
                        opr::_Opr, OprFormat::_fmt>::dispatch(opr); \
                MIDOUT_E                                           \
            }

#define OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(_Opr, _fmt)    \
    typefmt2dispatcher[{opr::_Opr::typeinfo(), OprFormat::_fmt}] = \
            [](const OperatorNodeBase* opr) {                      \
                MIDOUT_B(opr::_Opr, midout_iv(OprFormat::_fmt))    \
                return OprSingleInOutTensorFormatsDispatcherImpl<  \
                        OprFormat::_fmt>::dispatch(opr);           \
                MIDOUT_E                                           \
            }

    OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW);
    OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NHWC);
    OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW4);
    OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, CHWN4);
    OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW32);
    OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW64);
    OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionForward, NCHW);
    OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionForward, NCHW4);
    OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionBackwardData, NCHW);
    OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionBackwardData, NCHW4);
    OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(WarpPerspectiveForward, NCHW);
    OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(WarpPerspectiveForward, NHWC);
    OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(WarpPerspectiveForward, NCHW4);
    OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(WarpPerspectiveForward, NCHW64);
    OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, NCHW);
    OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, NHWC);
    OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, NCHW4);
    OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, CHWN4);
    OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, NCHW32);
    OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, NCHW64);
#undef OPR_TENSOR_FORMATS_CONFIG_REG
#undef OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG
}

StaticData& static_data() {
    static StaticData inst;
    return inst;
}
}  // namespace

OprTensorFormatsConfiguration::OprTensorFormatsDispatcher*
OprTensorFormatsConfiguration::find_dispatcher_by_type_format(
        Typeinfo* type, OprFormat opr_format) {
    auto&& typefmt2dispatcher = static_data().typefmt2dispatcher;
    auto iter = typefmt2dispatcher.find(std::make_pair(type, opr_format));
    mgb_assert(iter != typefmt2dispatcher.end(),
               "cannot find OprTensorFormatsDispatcher for opr type(%s) and "
               "opr format(%s)",
               type->name, opr_format_to_string(opr_format));
    return &iter->second;
}
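// The registry is keyed on (operator typeinfo, OprFormat) and built once in
// StaticData; the returned pointer refers to a callable dispatcher, as used
// by the profiler. Sketch of a lookup:
#if 0
auto* dispatcher =
        OprTensorFormatsConfiguration::find_dispatcher_by_type_format(
                opr::ConvBias::typeinfo(), OprFormat::NCHW4);
auto maybe_config = (*dispatcher)(opr);  // None if NCHW4 is unavailable
#endif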
// vim: syntax=cpp.doxygen
@@ -0,0 +1,527 @@
/**
 * \file src/gopt/impl/profiler_impl.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#include "./opr_format_modifier.h" | |||
#include "./utils.h" | |||
#include "megbrain/gopt/framework.h" | |||
#include "megbrain/gopt/global_layout_transform.h" | |||
#include "megbrain/graph/event.h" | |||
#include "megbrain/opr/dnn/pooling.h" | |||
#include "megbrain/opr/imgproc.h" | |||
#include "megbrain/opr/io.h" | |||
#include "megbrain/plugin/base.h" | |||
#include "megbrain/serialization/sereg.h" | |||
using namespace mgb; | |||
using namespace cg; | |||
using namespace opr; | |||
using namespace gopt; | |||
using ReformatKey = ReformatManager::ReformatKey; | |||
namespace { | |||
using OprFormat = Problem::OprFormat; | |||
OprFormat tensor_formats_to_opr_format(TensorFormats tensor_format) { | |||
switch (tensor_format) { | |||
case TensorFormats::NCHW: | |||
return OprFormat::NCHW; | |||
case TensorFormats::NCHWc4: | |||
return OprFormat::NCHW4; | |||
case TensorFormats::NCHWc8: | |||
return OprFormat::NCHW8; | |||
case TensorFormats::NCHWc32: | |||
return OprFormat::NCHW32; | |||
case TensorFormats::NCHWc64: | |||
return OprFormat::NCHW64; | |||
case TensorFormats::NHWC: | |||
return OprFormat::NHWC; | |||
case TensorFormats::CHWNc4: | |||
return OprFormat::CHWN4; | |||
default: | |||
mgb_throw(MegBrainError, "tensor format(%u) is not supported", | |||
static_cast<uint32_t>(tensor_format)); | |||
} | |||
} | |||
class GraphPartitionProfiler final : public PluginBase { | |||
using CompNodeEventPtr = std::unique_ptr<CompNode::Event>; | |||
public: | |||
using OprFilter = thin_function<bool(OperatorNodeBase*)>; | |||
struct OprKernEvent { | |||
CompNodeEventPtr start, end; | |||
}; | |||
GraphPartitionProfiler(ComputingGraph* graph, OprFilter opr_filter); | |||
~GraphPartitionProfiler() noexcept; | |||
float duration_in_usec() const; | |||
private: | |||
void record_event(CompNodeEventPtr& dest, CompNode cn) { | |||
if (dest == nullptr) | |||
dest = cn.create_event(CompNode::Event::NEED_TIMER); | |||
dest->record(); | |||
} | |||
ThinHashMap<OperatorNodeBase*, OprKernEvent> m_kern_event; | |||
OprFilter m_opr_filter; | |||
}; | |||
GraphPartitionProfiler::GraphPartitionProfiler(ComputingGraph* graph, | |||
OprFilter opr_filter) | |||
: PluginBase(graph), m_opr_filter(opr_filter) { | |||
using namespace event; | |||
auto on_before_kern = [this](BeforeKernel const& event) { | |||
if (!m_opr_filter(event.opr)) | |||
return; | |||
auto evptr = &m_kern_event[event.opr].start; | |||
record_event(*evptr, event.comp_node); | |||
}; | |||
auto on_after_kern = [this](AfterKernel const& event) { | |||
if (!m_opr_filter(event.opr)) | |||
return; | |||
auto evptr = &m_kern_event[event.opr].end; | |||
record_event(*evptr, event.comp_node); | |||
}; | |||
auto&& ev = graph->event(); | |||
add_event_handler(ev.register_receiver<BeforeKernel>(on_before_kern)); | |||
add_event_handler(ev.register_receiver<AfterKernel>(on_after_kern)); | |||
} | |||
GraphPartitionProfiler::~GraphPartitionProfiler() noexcept { | |||
auto wait = [](const CompNodeEventPtr& ev) { | |||
if (ev) | |||
ev->host_wait(); | |||
}; | |||
for (auto&& i : m_kern_event) { | |||
wait(i.second.start); | |||
wait(i.second.end); | |||
} | |||
} | |||
float GraphPartitionProfiler::duration_in_usec() const { | |||
float device_duration = 0.f; | |||
for (auto&& kern_ev : m_kern_event) { | |||
auto&& event = kern_ev.second; | |||
event.end->host_wait(); | |||
device_duration += 1e6 * event.start->elapsed_time_until(*event.end); | |||
} | |||
return device_duration; | |||
} | |||
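// GraphPartitionProfiler brackets every kernel launch of the filtered
// operators with a pair of CompNode events and sums the start->end elapsed
// times, so the reported duration is device time rather than host wall-clock
// time. Sketch of the pattern the profile_* methods below use (variable
// names are assumptions):
#if 0
auto profiler = std::make_unique<GraphPartitionProfiler>(
        graph.get(), [target](OperatorNodeBase* o) { return o == target; });
for (int i = 0; i < runs; ++i)
    func->execute();  // events are re-recorded on every run
float usec = profiler->duration_in_usec();  // device time of profiled kernels
#endif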
/*!
 * \brief An operator that indicates its input var node is contiguous
 */
// clang-format off
MGB_DEFINE_OPR_CLASS(MarkInputContiguous, SingleCNOperatorNodeBase) //{
    void scn_do_execute() override {}
    void init_output_static_infer_desc() override;
    void add_input_layout_constraint() override {
        input(0)->add_layout_constraint_contiguous();
    }

public:
    MarkInputContiguous(VarNode* input, const OperatorNodeConfig& config);
    static SymbolVar make(SymbolVar input, const OperatorNodeConfig& config = {});
};
// clang-format on

MGB_DYN_TYPE_OBJ_FINAL_IMPL(MarkInputContiguous);

MarkInputContiguous::MarkInputContiguous(VarNode* input,
                                         const OperatorNodeConfig& config)
        : Super(input->owner_graph(), config, "mark_contiguous", {input}) {
    add_input({input});
    add_output(None);
}

SymbolVar MarkInputContiguous::make(SymbolVar input,
                                    const OperatorNodeConfig& config) {
    return input.insert_single_output_opr<MarkInputContiguous>(input.node(),
                                                               config);
}

void MarkInputContiguous::init_output_static_infer_desc() {
    using namespace cg::static_infer;
    auto&& mgr = owner_graph()->static_infer_manager();
    mgr.register_shape_infer(output(0),
                             ShapeInferDesc::make_identity(input(0)));
}
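// MarkInputContiguous exists only so the profiled var has a consumer that
// forces a contiguous layout: any relayout implied by the format under test
// is thereby included in the measured graph. Minimal use, as in the
// profilers below:
#if 0
auto mark = MarkInputContiguous::make(SymbolVar(y));
auto func = graph->compile({{mark, {}}});
#endif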
}  // namespace

/* ================== ProfilerImpl =================*/
class ProfilerImpl final : public ProfilerBase {
public:
    ProfilerImpl(int runs = 10) : m_runs{runs} {}
    ~ProfilerImpl() = default;
    ProfilingResult profile(const Problem& problem) const override;

private:
    static constexpr float PROFILE_TIME_OUT = 1e7;
    /*!
     * \brief profile operators that are agnostic to opr format (e.g.
     * elemwise, elemwise multi type, typecvt)
     *
     * \param opr pointer to the operator node to be profiled
     * \param base_format the original tensor format of the operator node
     * \param available_tensor_formats the available tensor formats
     * \return the operator node record
     */
    OperatorNodeRecord profile_operator(
            const OperatorNodeBase* opr, TensorFormats base_format,
            const SmallVector<TensorFormats>& available_tensor_formats) const;
    float profile_operator(const OperatorNodeBase* opr,
                           TensorFormats base_format,
                           TensorFormats tensor_format) const;
    /*!
     * \brief profile operators that are aware of opr format (e.g. conv,
     * deconv, conv_bias)
     *
     * \param opr pointer to the operator node to be profiled
     * \param base_config the tensor formats configuration of the base opr
     * format
     * \param available_configs all the available configurations
     * \return the operator node record
     */
    OperatorNodeRecord profile_operator(
            const OperatorNodeBase* opr,
            const OprTensorFormatsConfiguration& base_config,
            const SmallVector<OprTensorFormatsConfiguration>& available_configs)
            const;
    float profile_operator(const OperatorNodeBase* opr,
                           const OprTensorFormatsConfiguration& base_config,
                           const OprTensorFormatsConfiguration& config) const;
    /*!
     * \brief profile the layout transform of a var node
     *
     * \param var pointer to the var node to be profiled
     * \param base_format the original tensor format in which the var node is
     * stored
     * \param available_tensor_formats the available tensor formats
     * \param extra_attribute the extra attributes (options) of the problem
     * \return the var node record
     */
    VarNodeRecord profile_var_node(
            const VarNode* var, TensorFormats base_format,
            const SmallVector<TensorFormats>& available_tensor_formats,
            ReformatKey::Attribute extra_attribute =
                    ReformatKey::Attribute::DEFAULT) const;
    float profile_var_node(const VarNode* var, TensorFormats base_format,
                           const ReformatKey& key) const;
    int m_runs;  //!< number of times each candidate graph is executed
};
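// ProfilerImpl measures two kinds of costs for the layout-transform problem:
// per-operator compute time under each candidate opr format, and per-var
// reformat (layout conversion) time between tensor formats. Entry point, as
// wired up by make_profiler() at the end of this file:
#if 0
auto profiler = ProfilerBase::make_profiler();  // returns a ProfilerImpl
auto result = profiler->profile(problem);  // fills opr_record and var_record
#endif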
ProfilerImpl::OperatorNodeRecord ProfilerImpl::profile_operator(
        const OperatorNodeBase* opr, TensorFormats base_format,
        const SmallVector<TensorFormats>& available_tensor_formats) const {
    OperatorNodeRecord record;
    record.opr = opr;
    auto& costs = record.costs;
    for (auto&& f : available_tensor_formats) {
        auto opr_format = tensor_formats_to_opr_format(f);
        costs[opr_format] = profile_operator(opr, base_format, f);
    }
    return record;
}

float ProfilerImpl::profile_operator(const OperatorNodeBase* opr,
                                     TensorFormats base_format,
                                     TensorFormats tensor_format) const {
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    graph->options().var_sanity_check_first_run = false;
    VarNodeArray new_inps(opr->input().size());
    for (size_t i = 0; i < opr->input().size(); ++i) {
        auto&& var = opr->input(i);
        auto&& cn = var->comp_node();
        auto&& dtype = var->dtype();
        auto dval = std::make_shared<DeviceTensorND>(cn, dtype);
        auto aligned_tensor_shape =
                make_aligned_tensor_shape(var, base_format, tensor_format);
        dval->resize(aligned_tensor_shape);
        auto aligned_var = opr::VolatileSharedDeviceTensor::make(*graph, dval);
        new_inps[i] = aligned_var.node();
    }
    auto new_opr = serialization::copy_opr_shallow(
            *opr, new_inps, opr->config(), {graph.get()});
    auto y = new_opr->output(0);
    auto mark = MarkInputContiguous::make(SymbolVar(y));
    auto func = graph->compile({{mark, {}}});
    auto filter = [new_opr](OperatorNodeBase* opr) { return opr == new_opr; };
    auto profiler = std::make_unique<GraphPartitionProfiler>(
            graph.get(), std::move(filter));
    for (int i = 0; i < m_runs; ++i)
        func->execute();
    return profiler->duration_in_usec();
}
ProfilerImpl::OperatorNodeRecord ProfilerImpl::profile_operator(
        const OperatorNodeBase* opr,
        const OprTensorFormatsConfiguration& base_config,
        const SmallVector<OprTensorFormatsConfiguration>& available_configs)
        const {
    OperatorNodeRecord record;
    record.opr = opr;
    auto& costs = record.costs;
    for (auto&& i : available_configs) {
        costs[i.opr_format] = profile_operator(opr, base_config, i);
    }
    return record;
}

float ProfilerImpl::profile_operator(
        const OperatorNodeBase* opr,
        const OprTensorFormatsConfiguration& base_config,
        const OprTensorFormatsConfiguration& config) const {
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    graph->options().var_sanity_check_first_run = false;
    VarNodeArray new_inps(opr->input().size());
    size_t i = 0;
    size_t nr_input_tensor =
            std::min(config.input_tensor_formats.size(), opr->input().size());
    for (; i < nr_input_tensor; ++i) {
        auto&& var = opr->input(i);
        auto&& cn = var->comp_node();
        auto&& dtype = var->dtype();
        auto dval = std::make_shared<DeviceTensorND>(cn, dtype);
        TensorShape aligned_shape;
        if (config.input_tensor_types[i] == TensorType::WEIGHT) {
            mgb_assert(base_config.input_tensor_types[i] == TensorType::WEIGHT);
            aligned_shape = make_aligned_weight_shape(
                    var, base_config.input_tensor_formats[i],
                    config.input_tensor_formats[i],
                    config.output_tensor_formats[0]);
        } else {
            mgb_assert(base_config.input_tensor_types[i] ==
                       config.input_tensor_types[i]);
            mgb_assert(base_config.input_tensor_types[i] ==
                       TensorType::FEATURE);
            aligned_shape = make_aligned_tensor_shape(
                    var, base_config.input_tensor_formats[i],
                    config.input_tensor_formats[i]);
        }
        dval->resize(aligned_shape);
        auto aligned_var = opr::VolatileSharedDeviceTensor::make(*graph, dval);
        new_inps[i] = aligned_var.node();
    }
    for (; i < opr->input().size(); ++i) {
        auto&& var = opr->input(i);
        auto&& cn = var->comp_node();
        auto&& dtype = var->dtype();
        auto hval = std::make_shared<HostTensorND>(cn, dtype);
        hval->resize(var->shape());
        auto cb = [&](DeviceTensorND& d) { hval->copy_from(d).sync(); };
        {
            auto cg = var->owner_graph();
            cg->compile({{var, cb}})->execute();
        }
        auto imm = opr::ImmutableTensor::make(*graph, *hval);
        new_inps[i] = imm.node();
    }
    VarNode* y = mgb::gopt::intl::modify_opr_format(config.opr_format, new_inps,
                                                    opr);
#if 0
    static const ThinHashSet<Typeinfo*> multi_algo_oprs = {
            opr::Convolution::typeinfo(),
            opr::ConvBiasForward::typeinfo(),
            opr::ConvolutionBackwardData::typeinfo(),
            opr::PoolingForward::typeinfo(),
    };
    if (multi_algo_oprs.count(opr->dyn_typeinfo()) &&
        !mgb::gopt::intl::has_available_algo(new_inps, y->owner_opr()))
        return PROFILE_TIME_OUT;
#endif
    auto mark = MarkInputContiguous::make(SymbolVar(y));
    auto func = graph->compile({{mark, {}}});
    auto new_opr = y->owner_opr();
    auto filter = [&new_opr](OperatorNodeBase* opr) { return opr == new_opr; };
    auto profiler = std::make_unique<GraphPartitionProfiler>(
            graph.get(), std::move(filter));
    for (int i = 0; i < m_runs; ++i)
        func->execute();
    return profiler->duration_in_usec();
}
ProfilerImpl::VarNodeRecord ProfilerImpl::profile_var_node(
        const VarNode* var, TensorFormats base_format,
        const SmallVector<TensorFormats>& available_tensor_formats,
        ReformatKey::Attribute attribute) const {
    VarNodeRecord record;
    record.var = var;
    auto& costs = record.costs;
    for (auto&& i : available_tensor_formats) {
        for (auto&& o : available_tensor_formats) {
            if (i == o)
                continue;
            ReformatKey key{i, o, attribute, var->dtype().enumv(),
                            var->dtype().enumv()};
            costs[{i, o}] = profile_var_node(var, base_format, key);
        }
    }
    return record;
}

float ProfilerImpl::profile_var_node(const VarNode* var,
                                     TensorFormats base_format,
                                     const ReformatKey& key) const {
    auto&& cn = var->comp_node();
    auto&& dtype = var->dtype();
    auto dval = std::make_shared<DeviceTensorND>(cn, dtype);
    auto aligned_tensor_shape =
            make_aligned_tensor_shape(var, base_format, key.input_format);
    dval->resize(aligned_tensor_shape);
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    graph->options().var_sanity_check_first_run = false;
    auto aligned_var = opr::VolatileSharedDeviceTensor::make(*graph, dval);
    auto builder = ReformatManager::instance().auto_aligned_reformat_featrue(
            var, base_format, key);
    auto y = builder({aligned_var.node()});
    ThinHashSet<OperatorNodeBase*> set;
    DepOprIter iter([&set](OperatorNodeBase* opr) { set.insert(opr); });
    iter.add(y->owner_opr());
    iter.set_visited(aligned_var.node()->owner_opr());
    auto mark = MarkInputContiguous::make(SymbolVar(y));
    auto func = graph->compile({{mark, {}}});
    auto filter = [&set](OperatorNodeBase* opr) { return set.count(opr) > 0; };
    auto profiler = std::make_unique<GraphPartitionProfiler>(
            graph.get(), std::move(filter));
    for (int i = 0; i < m_runs; ++i)
        func->execute();
    return profiler->duration_in_usec();
}
ProfilerImpl::ProfilingResult ProfilerImpl::profile(
        const Problem& problem) const {
    ConstVarPropogate cvprop{ConstVarType::IMMUTABLE_AND_PARAM};
    {
        auto cb = [&cvprop](OperatorNodeBase* opr) { cvprop.add_opr(opr); };
        DepOprIter iter{cb};
        for (auto&& o : problem.graph_partition().output()) {
            iter.add(o->owner_opr());
        }
    }
    static const ThinHashMap<Typeinfo*, size_t> format_aware_input_tensors = {
#define cb(_Opr, _arity) {_Opr::typeinfo(), _arity}
            cb(Convolution, 2),
            cb(ConvBiasForward, 4),
            cb(ConvolutionBackwardData, 2),
            cb(PoolingForward, 1),
            cb(WarpPerspective, 1),
            cb(Resize, 1),
#undef cb
    };
    ThinHashSet<VarNode*> vars;
    ThinHashSet<OperatorNodeBase*> oprs;
    {
        auto cb = [&cvprop, &vars, &oprs](OperatorNodeBase* opr) {
            if (cvprop.is_const(opr))
                return;
            oprs.insert(opr);
            auto find = format_aware_input_tensors.find(opr->dyn_typeinfo());
            if (find == format_aware_input_tensors.end()) {
                for (auto&& i : opr->input()) {
                    if (!cvprop.is_const(i)) {
                        vars.insert(i);
                    }
                }
            } else {
                size_t nr_input_tensor =
                        std::min(find->second, opr->input().size());
                for (size_t i = 0; i < nr_input_tensor; ++i) {
                    if (!cvprop.is_const(opr->input(i))) {
                        vars.insert(opr->input(i));
                    }
                }
            }
            vars.insert(opr->output(0));
        };
        DepOprIter iter{cb};
        for (auto&& i : problem.graph_partition().input()) {
            iter.set_visited(i->owner_opr());
        }
        for (auto&& o : problem.graph_partition().output()) {
            iter.add(o->owner_opr());
        }
    }
    auto base_format = problem.base_format();
    auto&& available_tensor_formats = problem.available_tensor_formats();
    ProfilingResult profiling_result;
    auto& opr_record = profiling_result.opr_record;
    auto& var_record = profiling_result.var_record;
    for (auto&& var : vars) {
        var_record[var] =
                profile_var_node(var, base_format, available_tensor_formats);
    }
    for (auto&& opr : oprs) {
        auto&& opr_configs = problem.opr_configs();
        auto find = opr_configs.find(opr->dyn_typeinfo());
        if (find == opr_configs.end()) {
            opr_record[opr] = profile_operator(opr, base_format,
                                               available_tensor_formats);
        } else {
            auto&& dispatchers = find->second;
            SmallVector<OprTensorFormatsConfiguration> configs;
            for (const auto& item : dispatchers) {
                auto config = (*item.second)(opr);
                if (config.valid()) {
                    configs.emplace_back(config.val());
                }
            }
            auto base_config = problem.base_config(opr);
            opr_record[opr] = profile_operator(opr, base_config, configs);
        }
    }
    for (auto&& rpair : opr_record) {
        mgb_log_debug("%s", rpair.second.to_string().c_str());
    }
    for (auto&& rpair : var_record) {
        mgb_log_debug("%s", rpair.second.to_string().c_str());
    }
    return profiling_result;
}
/* ================== ProfilerBase =================*/
std::string ProfilerBase::OperatorNodeRecord::to_string() const {
    auto str = ssprintf("\nopr type: %s\nopr name: %s\ninputs:\n",
                        opr->dyn_typeinfo()->name, opr->cname());
    for (auto&& i : opr->input()) {
        str += ssprintf("\tvar: %s\n\tshape: %s\n", i->cname(),
                        i->shape().to_string().c_str());
    }
    str += ssprintf("outputs:\n\tvar: %s\n\tshape: %s\ncosts:\n",
                    opr->output(0)->cname(),
                    opr->output(0)->shape().to_string().c_str());
    for (auto&& cpair : costs) {
        str += ssprintf("\tformat: %s; cost:%f",
                        opr_format_to_string(cpair.first), cpair.second);
    }
    return str;
}

std::string ProfilerBase::VarNodeRecord::to_string() const {
    auto str = ssprintf("\nvar: %s\ncosts:", var->cname());
    for (auto&& cpair : costs) {
        auto&& formats = cpair.first;
        str += ssprintf("\n\tformat: (i:%s;o:%s); cost:%f",
                        tensor_formats_to_named_tensor_shape(formats.first)
                                .to_string()
                                .c_str(),
                        tensor_formats_to_named_tensor_shape(formats.second)
                                .to_string()
                                .c_str(),
                        cpair.second);
    }
    return str;
}

std::unique_ptr<ProfilerBase> ProfilerBase::make_profiler() {
    return std::make_unique<ProfilerImpl>();
}

// vim: syntax=cpp.doxygen
@@ -247,16 +247,36 @@ ReformatEmitter::UnderlyingBuilders ReformatEmitter::analyze() const { | |||
/* ============== PaddingEmitter ================= */ | |||
PaddingEmitter::EmitResult PaddingEmitter::emit() const { | |||
auto&& padshp = m_padshp; | |||
auto&& const_extent = m_const_extent; | |||
auto&& axis = m_axis; | |||
auto builder = [const_extent, axis](const VarNodeArray& vars) { | |||
auto builder = [padshp, const_extent, axis](const VarNodeArray& vars) { | |||
auto i = vars[0]; | |||
auto padding_shp_var = vars[1]; | |||
TensorShape shape; | |||
shape.ndim = i->shape().ndim; | |||
for (size_t ax = 0; ax < shape.ndim; ++ax) | |||
shape[ax] = 1; | |||
shape[axis] = const_extent; | |||
// avoid making a scalar lowbit tensor | |||
if (!i->dtype().is_low_bit() || const_extent != 1) | |||
shape[axis] = const_extent; | |||
else { | |||
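// the padding constant would otherwise be a scalar low-bit tensor, which | |||
// cannot be materialized; search the padding shape for the first | |||
// statically-known, non-unit extent and pad along that axis instead | |||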
size_t const_axis = 0; | |||
size_t new_const_extent = const_extent; | |||
for (size_t i = 0; i < padshp.ndim; ++i) { | |||
const auto& dim = padshp[i]; | |||
if (dim.extent() != Dimension::UNDETERMINED_EXTENT && | |||
dim.extent() != 1) { | |||
new_const_extent = dim.extent(); | |||
const_axis = i; | |||
break; | |||
} | |||
} | |||
mgb_assert(new_const_extent != 1, | |||
"cannot make an scalar lowbit tensor(got:%s)", | |||
i->dtype().name()); | |||
shape[const_axis] = new_const_extent; | |||
} | |||
auto host_val = | |||
std::make_shared<HostTensorND>(i->comp_node(), i->dtype()); | |||
host_val->resize(shape); | |||
@@ -13,6 +13,7 @@ | |||
#include "megbrain/gopt/reformat_manager.h" | |||
#include "megbrain/opr/tensor_manip.h" | |||
#include "megbrain/utils/arith_helper.h" | |||
#include "./utils.h" | |||
using namespace mgb; | |||
using namespace gopt; | |||
@@ -32,68 +33,6 @@ int gcd(const int& p, const int& q) { | |||
} | |||
return x; | |||
} | |||
NamedTensorShape tensor_formats_to_named_tensor_shape(TensorFormats format) { | |||
switch (format) { | |||
case TensorFormats::NCHW: | |||
return {{"N"}, {"C"}, {"H"}, {"W"}}; | |||
case TensorFormats::NHWC: | |||
return {{"N"}, {"H"}, {"W"}, {"C"}}; | |||
case TensorFormats::NCHWc4: | |||
return {{"N"}, {"C//4"}, {"H"}, {"W"}, {"C%4"}}; | |||
case TensorFormats::NCHWc8: | |||
return {{"N"}, {"C//8"}, {"H"}, {"W"}, {"C%8"}}; | |||
case TensorFormats::NCHWc32: | |||
return {{"N"}, {"C//32"}, {"H"}, {"W"}, {"C%32"}}; | |||
case TensorFormats::NCHWc64: | |||
return {{"N"}, {"C//64"}, {"H"}, {"W"}, {"C%64"}}; | |||
case TensorFormats::CHWNc4: | |||
return {{"C//4"}, {"H"}, {"W"}, {"N"}, {"C%4"}}; | |||
case TensorFormats::NHCWc4: | |||
return {{"N"}, {"H"}, {"C//4"}, {"W"}, {"C%4"}}; | |||
case TensorFormats::KRSCk4: | |||
return {{"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}}; | |||
case TensorFormats::GKRSCk4: | |||
return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}}; | |||
case TensorFormats::C1RSc4: | |||
return {{"C//4"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}}; | |||
case TensorFormats::KRSCk4c4: | |||
return {{"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}}; | |||
case TensorFormats::GKRSCk4c4: | |||
return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}}; | |||
case TensorFormats::KCRSk4c4: | |||
return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}}; | |||
case TensorFormats::GKCRSk4c4: | |||
return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}}; | |||
case TensorFormats::KCRSc4k4: | |||
return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}}; | |||
case TensorFormats::GKCRSc4k4: | |||
return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}}; | |||
case TensorFormats::C11RSc4: | |||
return {{"C//4"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}}; | |||
case TensorFormats::KCRSc8k8: | |||
return {{"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}}; | |||
case TensorFormats::GKCRSc8k8: | |||
return {{"G"}, {"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}}; | |||
case TensorFormats::C11RSc8: | |||
return {{"C//8"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%8"}}; | |||
case TensorFormats::KRSCk8: | |||
return {{"K//8"}, {"R"}, {"S"}, {"C"}, {"K%8"}}; | |||
case TensorFormats::KCRSc4: | |||
return {{"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}}; | |||
case TensorFormats::GKCRSc4: | |||
return {{"G"}, {"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}}; | |||
case TensorFormats::KCRS: | |||
return {{"K"}, {"C"}, {"R"}, {"S"}}; | |||
case TensorFormats::GKCRS: | |||
return {{"G"}, {"K"}, {"C"}, {"R"}, {"S"}}; | |||
case TensorFormats::C11RS: | |||
return {{"C"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}}; | |||
default: | |||
mgb_throw(AssertionError, "invalid tensor formats(%u)", | |||
static_cast<uint32_t>(format)); | |||
} | |||
} | |||
}; // namespace | |||
/* =================== ReformatManager::ReformatKey ====================*/ | |||
@@ -393,8 +332,10 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue( | |||
tensor_formats_to_named_tensor_shape(key.input_format); | |||
NamedTensorShape output_shape = | |||
tensor_formats_to_named_tensor_shape(key.output_format); | |||
size_t input_alignment, output_alignment; | |||
size_t input_channel_idx, output_channel_idx; | |||
size_t input_alignment = 0; | |||
size_t output_alignment = 0; | |||
size_t input_channel_idx = input_shape.ndim, | |||
output_channel_idx = input_shape.ndim; | |||
for (size_t i = 0; i < input_shape.ndim; ++i) { | |||
if (input_shape[i].name() == Dimension::Name::C && | |||
input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) { | |||
@@ -411,6 +352,15 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue( | |||
break; | |||
} | |||
} | |||
mgb_assert(input_channel_idx < input_shape.ndim && | |||
output_channel_idx < input_shape.ndim, | |||
"invalid channel idx(in_channel:%zu, out_channel:%zu, shp:%s)", | |||
input_channel_idx, output_channel_idx, | |||
input_shape.to_string().c_str()); | |||
mgb_assert(input_alignment > 0 && output_alignment > 0, | |||
"invalid alignment(in_channel:%zu, out_channel:%zu, shp:%s)", | |||
input_alignment, output_alignment, | |||
input_shape.to_string().c_str()); | |||
NamedTensorShape orig_shape = | |||
tensor_formats_to_named_tensor_shape(orig_format); | |||
size_t orig_channel = 0; | |||
@@ -448,8 +398,9 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue( | |||
auto make_shape = std::get<0>( | |||
MakeShapeEmitter{input_shape, padding_shape}.emit()); | |||
auto padding_shp_var = make_shape({x}); | |||
auto padding = std::get<0>( | |||
PaddingEmitter{const_extent, input_channel_idx}.emit()); | |||
auto padding = std::get<0>(PaddingEmitter{ | |||
padding_shape, const_extent, input_channel_idx} | |||
.emit()); | |||
cur = padding({cur, padding_shp_var}); | |||
} | |||
cur = ReformatManager::instance().get(key)({cur}); | |||
@@ -469,9 +420,10 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight( | |||
const VarNode* orig_var, const ReformatKey& key, | |||
const AlignmentDesc& extra_alignment) const { | |||
size_t in_channels = 0, out_channels = 0; | |||
size_t input_channel_idx, output_channel_idx; | |||
Dimension::Name out_channel_name; | |||
Dimension::Name out_channel_name = Dimension::Name::C; | |||
auto input_shape = tensor_formats_to_named_tensor_shape(key.input_format); | |||
size_t input_channel_idx = input_shape.ndim, | |||
output_channel_idx = input_shape.ndim; | |||
for (size_t i = 0; i < input_shape.ndim; ++i) { | |||
if (input_shape[i].name() == Dimension::Name::C && | |||
input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) { | |||
@@ -491,7 +443,15 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight( | |||
input_shape.to_string().c_str()); | |||
} | |||
} | |||
size_t in_channel_alignment, out_channel_alignment = 1; | |||
mgb_assert(out_channel_name == Dimension::Name::K || | |||
out_channel_name == Dimension::Name::N, | |||
"invalid out channel(shp:%s)", input_shape.to_string().c_str()); | |||
mgb_assert(input_channel_idx < input_shape.ndim && | |||
output_channel_idx < input_shape.ndim, | |||
"invalid channel idx(in_channel:%zu, out_channel:%zu, shp:%s)", | |||
input_channel_idx, output_channel_idx, | |||
input_shape.to_string().c_str()); | |||
size_t in_channel_alignment = 0, out_channel_alignment = 0; | |||
auto output_shape = tensor_formats_to_named_tensor_shape(key.output_format); | |||
for (size_t i = 0; i < output_shape.ndim; ++i) { | |||
if (output_shape[i].name() == Dimension::Name::C && | |||
@@ -502,6 +462,10 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight( | |||
out_channel_alignment = output_shape[i].stride(); | |||
} | |||
} | |||
mgb_assert(in_channel_alignment > 0 && out_channel_alignment > 0, | |||
"invalid alignment(in_channel:%zu, out_channel:%zu, shp:%s)", | |||
in_channel_alignment, out_channel_alignment, | |||
output_shape.to_string().c_str()); | |||
size_t aligned_in_channel = | |||
divup(in_channels, in_channel_alignment) * in_channel_alignment; | |||
if (extra_alignment.name == out_channel_name) { | |||
@@ -526,8 +490,9 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight( | |||
auto make_shape = std::get<0>( | |||
MakeShapeEmitter{input_shape, padding_shape}.emit()); | |||
auto padding_shp_var = make_shape({x}); | |||
auto padding = std::get<0>( | |||
PaddingEmitter{const_extent, input_channel_idx}.emit()); | |||
auto padding = std::get<0>(PaddingEmitter{ | |||
padding_shape, const_extent, input_channel_idx} | |||
.emit()); | |||
cur = padding({cur, padding_shp_var}); | |||
} | |||
if (aligned_out_channel > out_channels) { | |||
@@ -540,8 +505,9 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight( | |||
auto make_shape = std::get<0>( | |||
MakeShapeEmitter{input_shape, padding_shape}.emit()); | |||
auto padding_shp_var = make_shape({cur}); | |||
auto padding = std::get<0>( | |||
PaddingEmitter{const_extent, output_channel_idx}.emit()); | |||
auto padding = std::get<0>(PaddingEmitter{ | |||
padding_shape, const_extent, output_channel_idx} | |||
.emit()); | |||
cur = padding({cur, padding_shp_var}); | |||
} | |||
cur = ReformatManager::instance().get(key)({cur}); | |||
@@ -554,4 +520,81 @@ const ReformatManager& ReformatManager::instance() { | |||
static ReformatManager inst; | |||
return inst; | |||
} | |||
TensorShape mgb::gopt::make_aligned_tensor_shape(const VarNode* var, | |||
TensorFormats orig_formats, | |||
TensorFormats target_formats) { | |||
using Dimension = megdnn::Dimension; | |||
static constexpr uint32_t UNDETERMINED_EXTENT = | |||
Dimension::UNDETERMINED_EXTENT; | |||
auto orig_shape = tensor_formats_to_named_tensor_shape(orig_formats); | |||
auto target_shape = tensor_formats_to_named_tensor_shape(target_formats); | |||
TensorShape oshp = var->shape(); | |||
mgb_assert(oshp.is_scalar() || oshp.ndim == orig_shape.ndim, | |||
"orig shape of var node is not compatible with tensor " | |||
"formats(var:%s;shp:%s;fmt:%s)", | |||
var->cname(), oshp.to_string().c_str(), | |||
orig_shape.to_string().c_str()); | |||
if (oshp.is_scalar()) return oshp; | |||
TensorShape tshp; | |||
ThinHashMap<Dimension::Name, int> name2dominant; | |||
for (size_t i = 0; i < orig_shape.ndim; ++i) { | |||
auto name = orig_shape[i].name(); | |||
if (orig_shape[i].extent() == UNDETERMINED_EXTENT) { | |||
auto insert = name2dominant.insert(std::make_pair(name, i)); | |||
mgb_assert(insert.second); | |||
} | |||
} | |||
tshp.ndim = target_shape.ndim; | |||
for (size_t i = 0; i < target_shape.ndim; ++i) { | |||
auto name = target_shape[i].name(); | |||
if (target_shape[i].extent() == UNDETERMINED_EXTENT) { | |||
int idx = name2dominant.at(name); | |||
bool mul = orig_shape[idx] < target_shape[i]; | |||
size_t factor = mul ? (target_shape[i] / orig_shape[idx]).extent() | |||
: (orig_shape[idx] / target_shape[i]).extent(); | |||
if (mul) | |||
tshp[i] = oshp[idx] * factor; | |||
else | |||
tshp[i] = divup(oshp[idx], factor); | |||
} else { | |||
tshp[i] = target_shape[i].extent(); | |||
} | |||
} | |||
return tshp; | |||
} | |||
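// Worked example (editor's sketch; `x` is a hypothetical contiguous NCHW | |||
// feature var of shape (64, 48, 14, 14)): reformatting to NCHWc64 expands | |||
// x to the target named shape {N, C//64, H, W, C%64}; the undetermined | |||
// C//64 axis takes divup(48, 64) = 1 and C%64 takes 64, so the channels | |||
// are padded up to the next multiple of 64: | |||
// | |||
//     TensorShape aligned = mgb::gopt::make_aligned_tensor_shape( | |||
//             x, TensorFormats::NCHW, TensorFormats::NCHWc64); | |||
//     // aligned == (64, 1, 14, 14, 64) | |||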
TensorShape mgb::gopt::make_aligned_weight_shape(const VarNode* var, | |||
TensorFormats orig_formats, | |||
TensorFormats target_formats, | |||
TensorFormats extra_formats) { | |||
auto tshp = make_aligned_tensor_shape(var, orig_formats, target_formats); | |||
auto extra_shape = tensor_formats_to_named_tensor_shape(extra_formats); | |||
using Dimension = megdnn::Dimension; | |||
static constexpr uint32_t UNDETERMINED_EXTENT = | |||
Dimension::UNDETERMINED_EXTENT; | |||
size_t out_channel_alignment = 1; | |||
for (size_t i = 0; i < extra_shape.ndim; ++i) { | |||
auto name = extra_shape[i].name(); | |||
if (name == Dimension::Name::C && | |||
extra_shape[i].extent() == UNDETERMINED_EXTENT) { | |||
out_channel_alignment = extra_shape[i].stride(); | |||
} | |||
} | |||
auto target_shape = tensor_formats_to_named_tensor_shape(target_formats); | |||
for (size_t i = 0; i < target_shape.ndim; ++i) { | |||
auto name = target_shape[i].name(); | |||
if ((name == Dimension::Name::K || name == Dimension::Name::N) && | |||
target_shape[i].extent() == UNDETERMINED_EXTENT) { | |||
size_t out_channels = tshp[i] * target_shape[i].stride(); | |||
tshp[i] = divup(out_channels, out_channel_alignment) * | |||
out_channel_alignment / target_shape[i].stride(); | |||
} | |||
} | |||
return tshp; | |||
} | |||
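// Worked example (editor's sketch; `w` is a hypothetical KCRS weight var | |||
// of shape (48, 48, 3, 3)): aligning to KCRSc4k4 first yields | |||
// (12, 12, 3, 3, 4, 4); with extra_formats = NCHWc64 the output channel | |||
// alignment becomes 64, so the K//4 axis is rounded up: | |||
// out_channels = 12 * 4 = 48, divup(48, 64) * 64 / 4 = 16: | |||
// | |||
//     TensorShape wshp = mgb::gopt::make_aligned_weight_shape( | |||
//             w, TensorFormats::KCRS, TensorFormats::KCRSc4k4, | |||
//             TensorFormats::NCHWc64); | |||
//     // wshp == (16, 12, 3, 3, 4, 4), i.e. 64 aligned output channels | |||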
// vim: syntax=cpp.doxygen |
@@ -0,0 +1,105 @@ | |||
/** | |||
* \file src/gopt/impl/utils.h | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#pragma once | |||
#include "megbrain/gopt/global_layout_transform.h" | |||
namespace mgb { | |||
namespace gopt { | |||
static inline const char* opr_format_to_string( | |||
OprTensorFormatsConfiguration::OprFormat opr_format) { | |||
using OprFormat = OprTensorFormatsConfiguration::OprFormat; | |||
#define cb(_fmt) \ | |||
case OprFormat::_fmt: \ | |||
return #_fmt | |||
switch (opr_format) { | |||
cb(NCHW); | |||
cb(NHWC); | |||
cb(NCHW4); | |||
cb(NCHW32); | |||
cb(NCHW64); | |||
cb(CHWN4); | |||
default: | |||
mgb_assert(false, "Invalid opr format(got:%u)", | |||
static_cast<uint32_t>(opr_format)); | |||
} | |||
#undef cb | |||
} | |||
static inline megdnn::NamedTensorShape tensor_formats_to_named_tensor_shape( | |||
TensorFormats format) { | |||
switch (format) { | |||
case TensorFormats::NCHW: | |||
return {{"N"}, {"C"}, {"H"}, {"W"}}; | |||
case TensorFormats::NHWC: | |||
return {{"N"}, {"H"}, {"W"}, {"C"}}; | |||
case TensorFormats::NCHWc4: | |||
return {{"N"}, {"C//4"}, {"H"}, {"W"}, {"C%4"}}; | |||
case TensorFormats::NCHWc8: | |||
return {{"N"}, {"C//8"}, {"H"}, {"W"}, {"C%8"}}; | |||
case TensorFormats::NCHWc32: | |||
return {{"N"}, {"C//32"}, {"H"}, {"W"}, {"C%32"}}; | |||
case TensorFormats::NCHWc64: | |||
return {{"N"}, {"C//64"}, {"H"}, {"W"}, {"C%64"}}; | |||
case TensorFormats::CHWNc4: | |||
return {{"C//4"}, {"H"}, {"W"}, {"N"}, {"C%4"}}; | |||
case TensorFormats::NHCWc4: | |||
return {{"N"}, {"H"}, {"C//4"}, {"W"}, {"C%4"}}; | |||
case TensorFormats::KRSCk4: | |||
return {{"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}}; | |||
case TensorFormats::GKRSCk4: | |||
return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}}; | |||
case TensorFormats::C1RSc4: | |||
return {{"C//4"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}}; | |||
case TensorFormats::KRSCk4c4: | |||
return {{"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}}; | |||
case TensorFormats::GKRSCk4c4: | |||
return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}}; | |||
case TensorFormats::KCRSk4c4: | |||
return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}}; | |||
case TensorFormats::GKCRSk4c4: | |||
return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}}; | |||
case TensorFormats::KCRSc4k4: | |||
return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}}; | |||
case TensorFormats::GKCRSc4k4: | |||
return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}}; | |||
case TensorFormats::C11RSc4: | |||
return {{"C//4"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}}; | |||
case TensorFormats::KCRSc8k8: | |||
return {{"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}}; | |||
case TensorFormats::GKCRSc8k8: | |||
return {{"G"}, {"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}}; | |||
case TensorFormats::C11RSc8: | |||
return {{"C//8"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%8"}}; | |||
case TensorFormats::KRSCk8: | |||
return {{"K//8"}, {"R"}, {"S"}, {"C"}, {"K%8"}}; | |||
case TensorFormats::KCRSc4: | |||
return {{"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}}; | |||
case TensorFormats::GKCRSc4: | |||
return {{"G"}, {"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}}; | |||
case TensorFormats::KCRS: | |||
return {{"K"}, {"C"}, {"R"}, {"S"}}; | |||
case TensorFormats::GKCRS: | |||
return {{"G"}, {"K"}, {"C"}, {"R"}, {"S"}}; | |||
case TensorFormats::C11RS: | |||
return {{"C"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}}; | |||
default: | |||
mgb_throw(AssertionError, "invalid tensor formats(%u)", | |||
static_cast<uint32_t>(format)); | |||
} | |||
} | |||
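// Notation note (editor's addition): "C//4" denotes the outer channel axis | |||
// (extent undetermined, stride 4) and "C%4" the inner channel axis (extent | |||
// 4, stride 1); e.g. for a logical channel extent of 48, NCHWc4 resolves | |||
// C//4 to 12 and C%4 to 4. | |||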
} // namespace gopt | |||
} // namespace mgb | |||
// vim: syntax=cpp.doxygen |
@@ -0,0 +1,176 @@ | |||
/** | |||
* \file src/gopt/include/megbrain/gopt/global_layout_transform.h | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#pragma once | |||
#include "megbrain/gopt/reformat_manager.h" | |||
#include "megbrain/gopt/subgraph_extractor.h" | |||
#include "megbrain/opr/dnn/convolution.h" | |||
namespace mgb { | |||
namespace gopt { | |||
/*! | |||
* \brief A structure that describes the data type and tensor format | |||
* configuration of an opr format | |||
*/ | |||
struct OprTensorFormatsConfiguration { | |||
using OprFormat = opr::ConvBias::Param::Format; | |||
using OprTensorFormatsDispatcher = | |||
thin_function<Maybe<OprTensorFormatsConfiguration>( | |||
const cg::OperatorNodeBase*)>; | |||
Typeinfo* typeinfo; | |||
OprFormat opr_format; | |||
SmallVector<DTypeEnum> input_dtypes; | |||
SmallVector<DTypeEnum> output_dtypes; | |||
SmallVector<TensorFormats> input_tensor_formats; | |||
SmallVector<TensorType> input_tensor_types; | |||
SmallVector<TensorFormats> output_tensor_formats; | |||
static OprTensorFormatsDispatcher* find_dispatcher_by_type_format( | |||
Typeinfo* type, OprFormat opr_format); | |||
}; | |||
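/*! | |||
 * A sketch of one possible configuration (editor's illustration; the exact | |||
 * values are hypothetical and would normally be produced by a registered | |||
 * dispatcher): | |||
 * | |||
 *     OprTensorFormatsConfiguration cfg; | |||
 *     cfg.typeinfo = opr::ConvBias::typeinfo(); | |||
 *     cfg.opr_format = OprFormat::NCHW4; | |||
 *     cfg.input_dtypes = {DTypeEnum::QuantizedS8, DTypeEnum::QuantizedS8, | |||
 *                         DTypeEnum::QuantizedS32}; | |||
 *     cfg.input_tensor_formats = {TensorFormats::NCHWc4, | |||
 *                                 TensorFormats::NCHWc4, | |||
 *                                 TensorFormats::NCHWc4}; | |||
 *     cfg.input_tensor_types = {TensorType::FEATURE, TensorType::WEIGHT, | |||
 *                               TensorType::FEATURE}; | |||
 *     cfg.output_dtypes = {DTypeEnum::QuantizedS8}; | |||
 *     cfg.output_tensor_formats = {TensorFormats::NCHWc4}; | |||
 */ | |||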
/*! | |||
* \brief A structure that describes the global layout transform problem | |||
*/ | |||
class Problem { | |||
public: | |||
using OprFormat = OprTensorFormatsConfiguration::OprFormat; | |||
using OprTensorFormatsDispatcher = | |||
OprTensorFormatsConfiguration::OprTensorFormatsDispatcher; | |||
using OprConfigTrait = | |||
ThinHashMap<Typeinfo*, | |||
ThinHashMap<OprFormat, OprTensorFormatsDispatcher*>>; | |||
struct Attribute { | |||
OprFormat base_opr_format; /// the base opr format indicates that the | |||
/// network to be optimized is constructed | |||
/// in the base opr format, i.e. all the | |||
/// format aware operators (conv, conv_bias, | |||
/// deconv, pooling etc.) are built in | |||
/// this format. | |||
TensorFormats | |||
base_tensor_formats; /// the base tensor format indicates that | |||
/// all the format agnostic operators | |||
/// (like elemwise, elemwise multi type, | |||
/// typecvt etc.) are built in the base | |||
/// tensor format. | |||
}; | |||
Problem(const GraphPartition& graph_partition, | |||
const SmallVector<TensorFormats>& available_tensor_formats, | |||
const OprConfigTrait& opr_config, const Attribute& attribute) | |||
: m_graph_partition{graph_partition}, | |||
m_available_tensor_formats{available_tensor_formats}, | |||
m_opr_configs{opr_config}, | |||
m_attribute{attribute} {} | |||
~Problem() noexcept = default; | |||
const GraphPartition& graph_partition() const { return m_graph_partition; } | |||
const OprConfigTrait& opr_configs() const { return m_opr_configs; } | |||
const SmallVector<TensorFormats>& available_tensor_formats() const { | |||
return m_available_tensor_formats; | |||
} | |||
TensorFormats base_format() const { | |||
return m_attribute.base_tensor_formats; | |||
} | |||
OprTensorFormatsConfiguration base_config( | |||
const cg::OperatorNodeBase* opr) const { | |||
auto _ = OprTensorFormatsConfiguration::find_dispatcher_by_type_format( | |||
opr->dyn_typeinfo(), m_attribute.base_opr_format); | |||
auto rst = (*_)(opr); | |||
if (rst.valid()) | |||
return rst.val(); | |||
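// the dispatcher could not produce a valid configuration for this | |||
// opr; synthesize a default one from the base opr format and the | |||
// base tensor formats | |||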
OprTensorFormatsConfiguration config; | |||
config.typeinfo = opr->dyn_typeinfo(); | |||
config.opr_format = m_attribute.base_opr_format; | |||
for (const auto& i : opr->input()) { | |||
config.input_dtypes.emplace_back(i->dtype().enumv()); | |||
config.input_tensor_formats.emplace_back( | |||
m_attribute.base_tensor_formats); | |||
config.input_tensor_types.emplace_back(TensorType::FEATURE); | |||
} | |||
config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv()); | |||
config.output_tensor_formats.emplace_back( | |||
m_attribute.base_tensor_formats); | |||
return config; | |||
} | |||
private: | |||
const GraphPartition& m_graph_partition; /// the graph partition | |||
const SmallVector<TensorFormats>& | |||
m_available_tensor_formats; /// the available tensor formats, used | |||
/// for format agnostic operators (like | |||
/// elemwise, elemwise multi type, | |||
/// typecvt, etc.) | |||
const OprConfigTrait& | |||
m_opr_configs; /// the available opr format configurations, used | |||
/// for format aware operators (like conv, deconv, | |||
/// conv_bias, etc.) | |||
Attribute m_attribute; /// the extra attributes to describe the problem | |||
}; | |||
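// Construction sketch (editor's illustration; `partition`, `formats` and | |||
// `configs` are assumed to be prepared by the caller): | |||
// | |||
//     Problem::Attribute attr{OprFormat::NCHW, TensorFormats::NCHW}; | |||
//     Problem problem(partition, formats, configs, attr); | |||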
/*! | |||
* \brief A profiler that collects all the performance data to describe the | |||
* global layout transform problem. | |||
*/ | |||
class ProfilerBase { | |||
public: | |||
using OprFormat = Problem::OprFormat; | |||
struct OperatorNodeRecord { | |||
const cg::OperatorNodeBase* opr; ///< pointer to operator node | |||
ThinHashMap<OprFormat, float> | |||
costs; ///< costs of operator node, i.e. the elapsed device | |||
///< time of the operator node on different opr format | |||
///< (layout configuration). | |||
std::string to_string() const; | |||
}; | |||
struct VarNodeRecord { | |||
struct KeyHash { | |||
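// std::hash provides no specialization for std::pair, so combine | |||
// the hashes of the two tensor formats manually | |||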
size_t operator()( | |||
const std::pair<TensorFormats, TensorFormats>& val) const { | |||
size_t h1 = | |||
std::hash<uint32_t>()(static_cast<uint32_t>(val.first)); | |||
size_t h2 = std::hash<uint32_t>()( | |||
static_cast<uint32_t>(val.second)); | |||
return mgb::hash_pair_combine(h1, h2); | |||
} | |||
}; | |||
const VarNode* var; ///< pointer to var node | |||
std::unordered_map<std::pair<TensorFormats, TensorFormats>, float, | |||
KeyHash> | |||
costs; ///< costs of var node, i.e. the elapsed | |||
///< device time of the layout transform. | |||
///< Key of the hashmap indicates the | |||
///< source tensor format and the target | |||
///< tensor format. | |||
std::string to_string() const; | |||
}; | |||
/*! | |||
* \note the profiler assumes all the input and output var nodes are stored | |||
* in contiguous layout in memory | |||
*/ | |||
struct ProfilingResult { | |||
/// A hashmap that maps the operator node to the costs (device elapsed | |||
/// time) of different layout configurations | |||
ThinHashMap<cg::OperatorNodeBase*, OperatorNodeRecord> opr_record; | |||
/// A hashmap that maps the var node to the costs of the layout transform | |||
ThinHashMap<VarNode*, VarNodeRecord> var_record; | |||
}; | |||
ProfilerBase() = default; | |||
virtual ~ProfilerBase() = default; | |||
virtual ProfilingResult profile(const Problem& problem) const = 0; | |||
static std::unique_ptr<ProfilerBase> make_profiler(); | |||
}; | |||
} // namespace gopt | |||
} // namespace mgb | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -80,11 +80,13 @@ private: | |||
class PaddingEmitter final : public Emitter { | |||
public: | |||
PaddingEmitter(size_t const_extent, size_t axis) | |||
: m_const_extent{const_extent}, m_axis{axis} {} | |||
PaddingEmitter(const megdnn::NamedTensorShape& padshp, size_t const_extent, | |||
size_t axis) | |||
: m_padshp{padshp}, m_const_extent{const_extent}, m_axis{axis} {} | |||
EmitResult emit() const override; | |||
private: | |||
megdnn::NamedTensorShape m_padshp; | |||
size_t m_const_extent, m_axis; | |||
}; | |||
@@ -17,6 +17,11 @@ | |||
namespace mgb { | |||
namespace gopt { | |||
enum class TensorType : uint32_t { | |||
FEATURE = 0, | |||
WEIGHT = 1, | |||
}; | |||
enum class TensorFormats : uint32_t { | |||
// input tensor formats | |||
NCHW = 0, ///< [N, C, H, W] | |||
@@ -116,6 +121,15 @@ public: | |||
private: | |||
ReformatCache m_cache; | |||
}; | |||
TensorShape make_aligned_tensor_shape(const VarNode* var, | |||
TensorFormats orig_formats, | |||
TensorFormats target_formats); | |||
TensorShape make_aligned_weight_shape(const VarNode* var, | |||
TensorFormats orig_formats, | |||
TensorFormats target_formats, | |||
TensorFormats extra_formats); | |||
} // namespace gopt | |||
} // namespace mgb | |||
@@ -20,6 +20,7 @@ class GraphPartition { | |||
public: | |||
using VarNodeSet = ThinHashSet<VarNode*>; | |||
using OperatorNodeSet = ThinHashSet<cg::OperatorNodeBase*>; | |||
class InputPlaceholder; | |||
GraphPartition() = default; | |||
@@ -45,13 +46,13 @@ private: | |||
class SubGraphExtractor { | |||
public: | |||
using OprList = ThinHashSet<Typeinfo*>; | |||
SubGraphExtractor(OprList opr_list) : m_opr_list{opr_list} {}; | |||
SubGraphExtractor(const OprList& opr_list) : m_opr_list{opr_list} {}; | |||
std::vector<GraphPartition> extract( | |||
const SymbolVarArray& endpoint_vars) const; | |||
private: | |||
class Impl; | |||
OprList m_opr_list; | |||
const OprList& m_opr_list; | |||
}; | |||
} // namespace gopt | |||
@@ -0,0 +1,429 @@ | |||
/** | |||
* \file src/gopt/test/profiler.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#include "./helper.h" | |||
#include "megbrain/gopt/global_layout_transform.h" | |||
#include "megbrain/gopt/inference.h" | |||
#include "megbrain/opr/dnn/pooling.h" | |||
#include "megbrain/opr/imgproc.h" | |||
#include "megbrain/opr/nn_int.h" | |||
#include "megbrain/serialization/serializer.h" | |||
using namespace mgb; | |||
using namespace gopt; | |||
using namespace serialization; | |||
namespace { | |||
class LayoutTransformContext : public NonCopyableObj { | |||
public: | |||
using OprList = SubGraphExtractor::OprList; | |||
using OprFormat = Problem::OprFormat; | |||
using OprConfigTrait = Problem::OprConfigTrait; | |||
LayoutTransformContext() = delete; | |||
LayoutTransformContext(OprList opr_list, | |||
SmallVector<TensorFormats> available_tensor_formats, | |||
OprConfigTrait opr_configs) | |||
: m_opr_list{std::move(opr_list)}, | |||
m_available_tensor_formats{std::move(available_tensor_formats)}, | |||
m_opr_configs{std::move(opr_configs)} {} | |||
const OprList& opr_list() const { return m_opr_list; } | |||
const SmallVector<TensorFormats>& available_tensor_formats() const { | |||
return m_available_tensor_formats; | |||
} | |||
const OprConfigTrait& opr_configs() const { return m_opr_configs; } | |||
static std::unique_ptr<LayoutTransformContext> make() { | |||
OprList opr_list = { | |||
opr::ConvBiasForward::typeinfo(), | |||
opr::ConvolutionForward::typeinfo(), | |||
opr::ConvolutionBackwardData::typeinfo(), | |||
opr::ElemwiseMultiType::typeinfo(), | |||
opr::Elemwise::typeinfo(), | |||
opr::TypeCvt::typeinfo(), | |||
opr::PoolingForward::typeinfo(), | |||
opr::WarpPerspectiveForward::typeinfo(), | |||
}; | |||
OprConfigTrait opr_configs; | |||
{ | |||
auto& dispatchers = opr_configs[opr::ConvBias::typeinfo()]; | |||
#define cb(_fmt) \ | |||
dispatchers[OprFormat::_fmt] = \ | |||
OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \ | |||
opr::ConvBias::typeinfo(), OprFormat::_fmt); | |||
cb(NCHW4); | |||
cb(NCHW32); | |||
cb(NHWC); | |||
cb(NCHW64); | |||
cb(CHWN4); | |||
#undef cb | |||
} | |||
{ | |||
auto& dispatchers = | |||
opr_configs[opr::ConvolutionBackwardData::typeinfo()]; | |||
#define cb(_fmt) \ | |||
dispatchers[OprFormat::_fmt] = \ | |||
OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \ | |||
opr::ConvolutionBackwardData::typeinfo(), \ | |||
OprFormat::_fmt); | |||
cb(NCHW4); | |||
#undef cb | |||
} | |||
{ | |||
auto& dispatchers = | |||
opr_configs[opr::ConvolutionForward::typeinfo()]; | |||
#define cb(_fmt) \ | |||
dispatchers[OprFormat::_fmt] = \ | |||
OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \ | |||
opr::ConvolutionForward::typeinfo(), OprFormat::_fmt); | |||
cb(NCHW4); | |||
#undef cb | |||
} | |||
{ | |||
auto& dispatchers = opr_configs[opr::PoolingForward::typeinfo()]; | |||
#define cb(_fmt) \ | |||
dispatchers[OprFormat::_fmt] = \ | |||
OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \ | |||
opr::PoolingForward::typeinfo(), OprFormat::_fmt); | |||
cb(NCHW4); | |||
cb(NCHW32); | |||
cb(NHWC); | |||
cb(NCHW64); | |||
cb(CHWN4); | |||
#undef cb | |||
} | |||
{ | |||
auto& dispatchers = | |||
opr_configs[opr::WarpPerspectiveForward::typeinfo()]; | |||
#define cb(_fmt) \ | |||
dispatchers[OprFormat::_fmt] = \ | |||
OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \ | |||
opr::WarpPerspectiveForward::typeinfo(), OprFormat::_fmt); | |||
cb(NHWC); | |||
cb(NCHW4); | |||
cb(NCHW64); | |||
#undef cb | |||
} | |||
SmallVector<TensorFormats> available_tensor_formats = { | |||
TensorFormats::NHWC, TensorFormats::NCHWc4, | |||
TensorFormats::NCHWc32, TensorFormats::NCHWc64}; | |||
return std::make_unique<LayoutTransformContext>( | |||
std::move(opr_list), std::move(available_tensor_formats), | |||
std::move(opr_configs)); | |||
} | |||
private: | |||
OprList m_opr_list; | |||
SmallVector<TensorFormats> m_available_tensor_formats; | |||
OprConfigTrait m_opr_configs; | |||
}; | |||
}; // namespace | |||
#if MGB_CUDA | |||
#if CUDA_VERSION >= 10020 | |||
TEST(TestProfiler, Conv) { | |||
REQUIRE_GPU(1); | |||
auto cn = CompNode::load("gpu0"); | |||
cn.activate(); | |||
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); | |||
auto ctx = LayoutTransformContext::make(); | |||
HostTensorGenerator<dtype::Int8> gen; | |||
auto graph = ComputingGraph::make(); | |||
graph->options().graph_opt_level = 0; | |||
auto mkvar = [&](const char* name, const TensorShape& shp, | |||
const DType& dtype) { | |||
return opr::TypeCvt::make( | |||
opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), | |||
dtype); | |||
}; | |||
auto mkcvar = [&](const char* name, const TensorShape& shp, | |||
const DType& dtype) { | |||
return opr::TypeCvt::make( | |||
opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)) | |||
.rename(name), | |||
dtype); | |||
}; | |||
auto x = mkvar("x", {64, 48, 14, 14}, | |||
dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4))); | |||
auto w1 = mkcvar("w1", {48, 48, 3, 3}, dtype::QuantizedS4(2.5f)); | |||
auto b1 = mkcvar("b1", {1, 48, 1, 1}, dtype::QuantizedS32(6.25f)); | |||
opr::ConvBias::Param param; | |||
param.format = opr::ConvBias::Param::Format::NCHW; | |||
param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY; | |||
param.stride_h = param.stride_w = 1; | |||
param.pad_h = param.pad_w = 1; | |||
auto c1 = opr::ConvBias::make(x, w1, b1, param, {}, | |||
OperatorNodeConfig(dtype::Quantized4Asymm( | |||
12.345f, static_cast<uint8_t>(5)))); | |||
x = opr::TypeCvt::make(c1, dtype::QuantizedS8(12.345f)); | |||
auto w2 = mkcvar("w2", {48, 48, 3, 3}, dtype::QuantizedS8(2.5f)); | |||
auto b2 = mkcvar("b2", {1, 48, 1, 1}, dtype::QuantizedS32(12.345f * 2.5f)); | |||
auto c2 = opr::ConvBias::make(x, w2, b2, param, {}, | |||
OperatorNodeConfig(dtype::QuantizedS8(2.5f))); | |||
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; | |||
S strategy = S::PROFILE; | |||
gopt::modify_opr_algo_strategy_inplace({c2}, strategy); | |||
using OprFormat = OprTensorFormatsConfiguration::OprFormat; | |||
SubGraphExtractor extractor(ctx->opr_list()); | |||
auto partitions = extractor.extract({c2}); | |||
ASSERT_EQ(partitions.size(), 1u); | |||
using Attribute = Problem::Attribute; | |||
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW}; | |||
Problem problem(partitions[0], ctx->available_tensor_formats(), | |||
ctx->opr_configs(), attribute); | |||
auto profiler = ProfilerBase::make_profiler(); | |||
auto rst = profiler->profile(problem); | |||
const auto& opr_rst = rst.opr_record; | |||
const auto& var_rst = rst.var_record; | |||
EXPECT_TRUE(opr_rst.count(c1.node()->owner_opr()) > 0); | |||
EXPECT_TRUE(opr_rst.count(c2.node()->owner_opr()) > 0); | |||
EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0); | |||
EXPECT_TRUE(var_rst.count(w1.node()) == 0); | |||
EXPECT_TRUE(var_rst.count(b1.node()) == 0); | |||
EXPECT_TRUE(var_rst.count(w2.node()) == 0); | |||
EXPECT_TRUE(var_rst.count(b2.node()) == 0); | |||
} | |||
#endif | |||
TEST(TestProfiler, Deconv) { | |||
REQUIRE_GPU(1); | |||
auto cn = CompNode::load("gpu0"); | |||
cn.activate(); | |||
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); | |||
auto ctx = LayoutTransformContext::make(); | |||
HostTensorGenerator<dtype::Int8> gen; | |||
auto graph = ComputingGraph::make(); | |||
graph->options().graph_opt_level = 0; | |||
auto mkvar = [&](const char* name, const TensorShape& shp, | |||
const DType& dtype) { | |||
return opr::TypeCvt::make( | |||
opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), | |||
dtype); | |||
}; | |||
auto mkcvar = [&](const char* name, const TensorShape& shp, | |||
const DType& dtype) { | |||
return opr::TypeCvt::make( | |||
opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)) | |||
.rename(name), | |||
dtype); | |||
}; | |||
auto x = mkvar("x", {64, 10, 7, 7}, dtype::QuantizedS8(2.5f)); | |||
auto w1 = mkcvar("w1", {10, 10, 2, 2}, dtype::QuantizedS8(2.5f)); | |||
using Param = opr::ConvolutionBackwardData::Param; | |||
Param param; | |||
param.format = opr::ConvolutionBackwardData::Param::Format::NCHW; | |||
param.stride_h = param.stride_w = 2; | |||
param.pad_h = param.pad_w = 0; | |||
auto c1 = opr::ConvolutionBackwardData::make( | |||
w1, x, param, {}, OperatorNodeConfig(dtype::QuantizedS8(2.5f))); | |||
auto w2 = mkcvar("w2", {10, 10, 2, 2}, dtype::QuantizedS8(2.5f)); | |||
auto c2 = opr::ConvolutionBackwardData::make( | |||
w2, c1, param, {}, OperatorNodeConfig(dtype::QuantizedS8(2.5f))); | |||
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; | |||
S strategy = S::PROFILE; | |||
gopt::modify_opr_algo_strategy_inplace({c2}, strategy); | |||
using OprFormat = OprTensorFormatsConfiguration::OprFormat; | |||
SubGraphExtractor extractor(ctx->opr_list()); | |||
auto partitions = extractor.extract({c2}); | |||
ASSERT_EQ(partitions.size(), 1u); | |||
using Attribute = Problem::Attribute; | |||
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW}; | |||
Problem problem(partitions[0], ctx->available_tensor_formats(), | |||
ctx->opr_configs(), attribute); | |||
auto profiler = ProfilerBase::make_profiler(); | |||
auto rst = profiler->profile(problem); | |||
const auto& opr_rst = rst.opr_record; | |||
const auto& var_rst = rst.var_record; | |||
EXPECT_TRUE(opr_rst.count(c1.node()->owner_opr()) > 0); | |||
EXPECT_TRUE(opr_rst.count(c2.node()->owner_opr()) > 0); | |||
EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0); | |||
EXPECT_TRUE(var_rst.count(w1.node()) == 0); | |||
EXPECT_TRUE(var_rst.count(w2.node()) == 0); | |||
} | |||
TEST(TestProfiler, Warp) { | |||
REQUIRE_GPU(1); | |||
auto cn = CompNode::load("gpu0"); | |||
cn.activate(); | |||
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); | |||
auto ctx = LayoutTransformContext::make(); | |||
constexpr size_t INP_H = 10, INP_W = 10, N = 16; | |||
HostTensorGenerator<dtype::Int8> gen; | |||
auto graph = ComputingGraph::make(); | |||
graph->options().graph_opt_level = 0; | |||
auto mkvar = [&](const char* name, const TensorShape& shp, | |||
const DType& dtype) { | |||
return opr::TypeCvt::make( | |||
opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), | |||
dtype); | |||
}; | |||
auto x = mkvar("x", {N, 48, INP_H, INP_W}, | |||
dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4))); | |||
float value1 = M_PI, value2 = 0.6; | |||
auto gen_mat = [&](HostTensorND& mat) { | |||
auto ptr = mat.ptr<float>(); | |||
for (size_t i = 0; i < N; ++i) { | |||
auto rot = value1, scale = value2, sheer = value1, dy = value2, | |||
dx = value2, ky = value2, kx = value2, kb = value2; | |||
ptr[0] = ptr[4] = cos(rot) * scale; | |||
ptr[1] = -(ptr[3] = sin(rot) * scale); | |||
ptr[3] *= sheer; | |||
ptr[4] *= sheer; | |||
ptr[2] = dx; | |||
ptr[5] = dy; | |||
ptr[6] = kx; | |||
ptr[7] = ky; | |||
ptr[8] = kb; | |||
ptr += 9; | |||
} | |||
mgb_assert(ptr == mat.ptr<float>() + mat.shape().total_nr_elems()); | |||
}; | |||
auto mat_host = std::make_shared<HostTensorND>( | |||
x.node()->comp_node(), TensorShape{N, 3, 3}, dtype::Float32()); | |||
gen_mat(*mat_host); | |||
auto mat = opr::Host2DeviceCopy::make(*graph, mat_host).rename("mat"); | |||
TensorShape out_shp{20, 20}; | |||
auto w1 = opr::WarpPerspectiveForward::make(x, mat, out_shp); | |||
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; | |||
S strategy = S::PROFILE; | |||
gopt::modify_opr_algo_strategy_inplace({w1}, strategy); | |||
using OprFormat = OprTensorFormatsConfiguration::OprFormat; | |||
SubGraphExtractor extractor(ctx->opr_list()); | |||
auto partitions = extractor.extract({w1}); | |||
ASSERT_EQ(partitions.size(), 1u); | |||
using Attribute = Problem::Attribute; | |||
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW}; | |||
Problem problem(partitions[0], ctx->available_tensor_formats(), | |||
ctx->opr_configs(), attribute); | |||
auto profiler = ProfilerBase::make_profiler(); | |||
auto rst = profiler->profile(problem); | |||
const auto& opr_rst = rst.opr_record; | |||
const auto& var_rst = rst.var_record; | |||
EXPECT_TRUE(opr_rst.count(w1.node()->owner_opr()) > 0); | |||
EXPECT_TRUE(var_rst.count(mat.node()) == 0); | |||
EXPECT_TRUE(var_rst.count(w1.node()->owner_opr()->input(2)) == 0); | |||
EXPECT_TRUE(var_rst.count(w1.node()->owner_opr()->input(0)) > 0); | |||
} | |||
TEST(TestProfiler, Pooling) { | |||
REQUIRE_GPU(1); | |||
auto cn = CompNode::load("gpu0"); | |||
cn.activate(); | |||
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); | |||
auto ctx = LayoutTransformContext::make(); | |||
HostTensorGenerator<dtype::Int8> gen; | |||
auto graph = ComputingGraph::make(); | |||
graph->options().graph_opt_level = 0; | |||
auto mkvar = [&](const char* name, const TensorShape& shp, | |||
const DType& dtype) { | |||
return opr::TypeCvt::make( | |||
opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), | |||
dtype); | |||
}; | |||
auto x = mkvar("x", {64, 64, 55, 55}, | |||
dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4))); | |||
using Param = opr::Pooling::Param; | |||
Param param; | |||
param.format = Param::Format::NCHW; | |||
auto p1 = opr::Pooling::make(x, param); | |||
x = opr::TypeCvt::make(p1, dtype::QuantizedS8(12.345f)); | |||
auto p2 = opr::Pooling::make(x, param); | |||
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; | |||
S strategy = S::PROFILE; | |||
gopt::modify_opr_algo_strategy_inplace({p2}, strategy); | |||
using OprFormat = OprTensorFormatsConfiguration::OprFormat; | |||
SubGraphExtractor extractor(ctx->opr_list()); | |||
auto partitions = extractor.extract({p2}); | |||
ASSERT_EQ(partitions.size(), 1u); | |||
using Attribute = Problem::Attribute; | |||
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW}; | |||
Problem problem(partitions[0], ctx->available_tensor_formats(), | |||
ctx->opr_configs(), attribute); | |||
auto profiler = ProfilerBase::make_profiler(); | |||
auto rst = profiler->profile(problem); | |||
const auto& opr_rst = rst.opr_record; | |||
EXPECT_TRUE(opr_rst.count(p1.node()->owner_opr()) > 0); | |||
EXPECT_TRUE(opr_rst.count(p2.node()->owner_opr()) > 0); | |||
EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0); | |||
} | |||
TEST(TestProfiler, Elemwise) { | |||
REQUIRE_GPU(1); | |||
auto cn = CompNode::load("gpu0"); | |||
cn.activate(); | |||
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); | |||
auto ctx = LayoutTransformContext::make(); | |||
HostTensorGenerator<dtype::Int8> gen; | |||
auto graph = ComputingGraph::make(); | |||
graph->options().graph_opt_level = 0; | |||
auto mkvar = [&](const char* name, const TensorShape& shp, | |||
const DType& dtype) { | |||
return opr::TypeCvt::make( | |||
opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), | |||
dtype); | |||
}; | |||
auto a = mkvar("a", {64, 48, 14, 14}, dtype::Float32()); | |||
auto b = mkvar("b", {1, 48, 1, 1}, dtype::Float32()); | |||
auto c = opr::Elemwise::make({a, b}, | |||
{opr::Elemwise::Param::Mode::FUSE_ADD_RELU}); | |||
auto q4c = opr::TypeCvt::make( | |||
c, dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4))); | |||
auto q8a = mkvar("q8a", {64, 48, 14, 14}, dtype::QuantizedS8(2.5f)); | |||
auto q8b = mkvar("q8b", {64, 48, 14, 14}, dtype::QuantizedS8(1.2f)); | |||
auto q8d = opr::ElemwiseMultiType::make( | |||
{q8a, q8b}, {opr::ElemwiseMultiType::Param::Mode::QFUSE_ADD_RELU}, | |||
OperatorNodeConfig(dtype::QuantizedS8(12.f))); | |||
auto q4d = opr::TypeCvt::make( | |||
q8d, dtype::Quantized4Asymm(1.2f, static_cast<uint8_t>(3))); | |||
auto q4e = opr::ElemwiseMultiType::make( | |||
{q4c, q4d}, {opr::ElemwiseMultiType::Param::Mode::QADD}, | |||
OperatorNodeConfig( | |||
dtype::Quantized4Asymm(13.f, static_cast<uint8_t>(4)))); | |||
using OprFormat = OprTensorFormatsConfiguration::OprFormat; | |||
SubGraphExtractor extractor(ctx->opr_list()); | |||
auto partitions = extractor.extract({q4e}); | |||
ASSERT_EQ(partitions.size(), 1u); | |||
using Attribute = Problem::Attribute; | |||
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW}; | |||
Problem problem(partitions[0], ctx->available_tensor_formats(), | |||
ctx->opr_configs(), attribute); | |||
auto profiler = ProfilerBase::make_profiler(); | |||
auto rst = profiler->profile(problem); | |||
const auto& opr_rst = rst.opr_record; | |||
const auto& var_rst = rst.var_record; | |||
EXPECT_TRUE(opr_rst.count(c.node()->owner_opr()) > 0); | |||
EXPECT_TRUE(opr_rst.count(q8d.node()->owner_opr()) > 0); | |||
EXPECT_TRUE(opr_rst.count(q4e.node()->owner_opr()) > 0); | |||
EXPECT_TRUE(var_rst.count(a.node()) > 0); | |||
EXPECT_TRUE(var_rst.count(b.node()) > 0); | |||
EXPECT_TRUE(var_rst.count(q8a.node()) > 0); | |||
EXPECT_TRUE(var_rst.count(q8b.node()) > 0); | |||
} | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -447,8 +447,6 @@ TEST(TestReformatManager, AutoAlignedFeatureProfiling) { | |||
for (size_t i = 0; i < RUNS; ++i) | |||
func->execute(); | |||
double time_profiler = profiler->duration() * 1e6; | |||
printf("%f, %f\n", time_profiler, time_cuda_evt); | |||
ASSERT_EQ(time_cuda_evt, time_profiler); | |||
MGB_CUDA_CHECK(cudaEventDestroy(evt0)); | |||
MGB_CUDA_CHECK(cudaEventDestroy(evt1)); | |||
} | |||