GitOrigin-RevId: f9669e1ba0
release-1.6
@@ -28,7 +28,10 @@ public: | |||||
private: | private: | ||||
using TensorFormatsBitSet = uint32_t; | using TensorFormatsBitSet = uint32_t; | ||||
using State = SmallVector<TensorFormatsBitSet>; | using State = SmallVector<TensorFormatsBitSet>; | ||||
static constexpr uint32_t MAX_TENSOR_FORMATS = sizeof(TensorFormatsBitSet); | |||||
/// each bit of the bit set represents one kind of tensor format
static constexpr uint32_t BITS_PER_BYTE = 8; | |||||
static constexpr uint32_t MAX_TENSOR_FORMATS = | |||||
sizeof(TensorFormatsBitSet) * BITS_PER_BYTE; | |||||
TensorFormatsBitSet add(TensorFormatsBitSet& set, TensorFormats fmt) { | TensorFormatsBitSet add(TensorFormatsBitSet& set, TensorFormats fmt) { | ||||
mgb_assert(static_cast<uint32_t>(fmt) < MAX_TENSOR_FORMATS); | mgb_assert(static_cast<uint32_t>(fmt) < MAX_TENSOR_FORMATS); | ||||
set |= (1 << static_cast<uint32_t>(fmt)); | set |= (1 << static_cast<uint32_t>(fmt)); | ||||
@@ -111,8 +111,6 @@ void LayoutTransformPass::apply(OptState& opt) const { | |||||
} | } | ||||
new_var = reformat({new_var}); | new_var = reformat({new_var}); | ||||
} | } | ||||
if (from != to && !new_var->shape().is_scalar()) | |||||
new_var = reformat({new_var}); | |||||
new_inp[i] = new_var; | new_inp[i] = new_var; | ||||
} | } | ||||
VarNode* new_out; | VarNode* new_out; | ||||
@@ -164,7 +162,9 @@ void LayoutTransformPass::apply(OptState& opt) const { | |||||
} | } | ||||
} else { | } else { | ||||
auto new_opr = rewriter.auto_replace_outputs(opr); | auto new_opr = rewriter.auto_replace_outputs(opr); | ||||
var2fmts[new_opr->output(0)] = base_fmt; | |||||
for (auto&& ov : new_opr->usable_output()) { | |||||
var2fmts[ov] = base_fmt; | |||||
} | |||||
} | } | ||||
}; | }; | ||||
opt.graph().iter(on_opr); | opt.graph().iter(on_opr); | ||||
@@ -245,19 +245,26 @@ struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::NHWC> { | |||||
if (i == 2) | if (i == 2) | ||||
available &= opr->input(i)->dtype().enumv() == | available &= opr->input(i)->dtype().enumv() == | ||||
DTypeEnum::QuantizedS32; | DTypeEnum::QuantizedS32; | ||||
else | |||||
available &= opr->input(i)->dtype().enumv() == | |||||
DTypeEnum::Quantized4Asymm || | |||||
opr->input(i)->dtype().enumv() == | |||||
DTypeEnum::QuantizedS4; | |||||
else { | |||||
bool i4_config = opr->input(i)->dtype().enumv() == | |||||
DTypeEnum::Quantized4Asymm || | |||||
opr->input(i)->dtype().enumv() == | |||||
DTypeEnum::QuantizedS4; | |||||
bool i8_config = opr->input(i)->dtype().enumv() == | |||||
DTypeEnum::QuantizedS8; | |||||
available &= (i4_config || i8_config); | |||||
} | |||||
config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv()); | config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv()); | ||||
TensorType tensor_type = | TensorType tensor_type = | ||||
i == 1 ? TensorType::WEIGHT : TensorType::FEATURE; | i == 1 ? TensorType::WEIGHT : TensorType::FEATURE; | ||||
config.input_tensor_types.emplace_back(tensor_type); | config.input_tensor_types.emplace_back(tensor_type); | ||||
} | } | ||||
available &= | |||||
bool i4_config = | |||||
opr->output(0)->dtype().enumv() == DTypeEnum::Quantized4Asymm || | opr->output(0)->dtype().enumv() == DTypeEnum::Quantized4Asymm || | ||||
opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS4; | opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS4; | ||||
bool i8_config = | |||||
opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8; | |||||
available &= (i4_config || i8_config); | |||||
config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv()); | config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv()); | ||||
available &= conv.param().sparse == Opr::Param::Sparse::DENSE; | available &= conv.param().sparse == Opr::Param::Sparse::DENSE; | ||||
config.input_tensor_formats = {TensorFormats::NHWC, TensorFormats::NHWC, | config.input_tensor_formats = {TensorFormats::NHWC, TensorFormats::NHWC, | ||||
@@ -496,6 +503,38 @@ struct ConvTensorFormatsDispatcherImpl<opr::ConvolutionBackwardData, | |||||
} | } | ||||
}; | }; | ||||
template <> | |||||
struct ConvTensorFormatsDispatcherImpl<opr::ConvolutionBackwardData, | |||||
OprFormat::NHWC> { | |||||
using Opr = opr::ConvolutionBackwardData; | |||||
static Maybe<OprTensorFormatsConfiguration> dispatch( | |||||
const OperatorNodeBase* opr) { | |||||
const auto& conv = opr->cast_final_safe<Opr>(); | |||||
OprTensorFormatsConfiguration config; | |||||
config.typeinfo = opr->dyn_typeinfo(); | |||||
config.opr_format = OprFormat::NCHW4; | |||||
bool available = true; | |||||
for (size_t i = 0; i < opr->input().size(); ++i) { | |||||
available &= | |||||
opr->input(i)->dtype().enumv() == DTypeEnum::QuantizedS8; | |||||
config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv()); | |||||
TensorType tensor_type = | |||||
i == 0 ? TensorType::WEIGHT : TensorType::FEATURE; | |||||
config.input_tensor_types.emplace_back(tensor_type); | |||||
} | |||||
available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8; | |||||
config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv()); | |||||
available &= conv.param().sparse == opr::ConvBias::Param::Sparse::DENSE; | |||||
config.input_tensor_formats = {TensorFormats::NHWC, TensorFormats::NHWC, | |||||
TensorFormats::NHWC, | |||||
TensorFormats::NHWC}; | |||||
config.output_tensor_formats = {TensorFormats::NHWC}; | |||||
if (available) | |||||
return config; | |||||
return None; | |||||
} | |||||
}; | |||||
struct StaticData { | struct StaticData { | ||||
struct KeyHash { | struct KeyHash { | ||||
size_t operator()(const std::pair<Typeinfo*, OprFormat>& val) const { | size_t operator()(const std::pair<Typeinfo*, OprFormat>& val) const { | ||||
@@ -543,6 +582,7 @@ StaticData::StaticData() { | |||||
OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionForward, NCHW4); | OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionForward, NCHW4); | ||||
OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionBackwardData, NCHW); | OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionBackwardData, NCHW); | ||||
OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionBackwardData, NHWC); | |||||
OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionBackwardData, NCHW4); | OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionBackwardData, NCHW4); | ||||
OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(WarpPerspectiveForward, NCHW); | OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(WarpPerspectiveForward, NCHW); | ||||
@@ -17,7 +17,6 @@ | |||||
#include "megbrain/graph/event.h" | #include "megbrain/graph/event.h" | ||||
#include "megbrain/opr/dnn/pooling.h" | #include "megbrain/opr/dnn/pooling.h" | ||||
#include "megbrain/opr/imgproc.h" | #include "megbrain/opr/imgproc.h" | ||||
#include "megbrain/opr/nn_int.h" | |||||
#include "megbrain/opr/io.h" | #include "megbrain/opr/io.h" | ||||
#include "megbrain/opr/nn_int.h" | #include "megbrain/opr/nn_int.h" | ||||
#include "megbrain/plugin/base.h" | #include "megbrain/plugin/base.h" | ||||
@@ -167,11 +166,12 @@ private: | |||||
static constexpr float PROFILE_TIME_OUT = 1e7; | static constexpr float PROFILE_TIME_OUT = 1e7; | ||||
using ReformatAttribute = ReformatKey::Attribute; | using ReformatAttribute = ReformatKey::Attribute; | ||||
/*! | /*! | ||||
* \brief profile opr format agnostic operators (like elemwise, elemwise multi type, typecvt etc.) | |||||
* \brief profile opr format agnostic operators (like elemwise, elemwise | |||||
* multi type, typecvt etc.) | |||||
* | * | ||||
* \param opr pointer to the operator node to be profiled | * \param opr pointer to the operator node to be profiled | ||||
* \param base_format the original tensor format of the operator node. | * \param base_format the original tensor format of the operator node. | ||||
* \param available_tensor_formats the available tensor formats | |||||
* \param available_tensor_formats the available tensor formats | |||||
* \return the operator node record | * \return the operator node record | ||||
*/ | */ | ||||
OperatorNodeRecord profile_operator( | OperatorNodeRecord profile_operator( | ||||
@@ -220,7 +220,7 @@ private: | |||||
ReformatAttribute::DEFAULT) const; | ReformatAttribute::DEFAULT) const; | ||||
float profile_var_node(const VarNode* var, TensorFormats base_format, | float profile_var_node(const VarNode* var, TensorFormats base_format, | ||||
const ReformatKey& key) const; | const ReformatKey& key) const; | ||||
int m_runs; /// sample times of the profiler | |||||
int m_runs; /// sample times of the profiler | |||||
}; | }; | ||||
ProfilerImpl::OperatorNodeRecord ProfilerImpl::profile_operator( | ProfilerImpl::OperatorNodeRecord ProfilerImpl::profile_operator( | ||||
@@ -281,10 +281,6 @@ ProfilerImpl::OperatorNodeRecord ProfilerImpl::profile_operator( | |||||
record.opr = opr; | record.opr = opr; | ||||
auto& costs = record.costs; | auto& costs = record.costs; | ||||
for (auto&& i : available_configs) { | for (auto&& i : available_configs) { | ||||
/// XXXX remove later | |||||
if (i.opr_format == OprFormat::NCHW && | |||||
opr->input(0)->dtype().enumv() != DTypeEnum::Float32) | |||||
continue; | |||||
costs[i.opr_format] = | costs[i.opr_format] = | ||||
profile_operator(opr, base_config, i, extra_attribute); | profile_operator(opr, base_config, i, extra_attribute); | ||||
} | } | ||||
@@ -403,8 +399,8 @@ float ProfilerImpl::profile_var_node(const VarNode* var, | |||||
auto builder = ReformatManager::instance().auto_aligned_reformat_featrue( | auto builder = ReformatManager::instance().auto_aligned_reformat_featrue( | ||||
var, base_format, key); | var, base_format, key); | ||||
auto y = builder({aligned_var.node()}); | auto y = builder({aligned_var.node()}); | ||||
if (!m_var_node_filter(var, aligned_tensor_shape, y->shape(), | |||||
TensorFormat{})) | |||||
if (!m_var_node_filter(var, aligned_tensor_shape, y->shape(), key)) | |||||
return PROFILE_TIME_OUT; | return PROFILE_TIME_OUT; | ||||
ThinHashSet<OperatorNodeBase*> set; | ThinHashSet<OperatorNodeBase*> set; | ||||
DepOprIter iter([&set](OperatorNodeBase* opr) { set.insert(opr); }); | DepOprIter iter([&set](OperatorNodeBase* opr) { set.insert(opr); }); | ||||
@@ -533,6 +529,17 @@ ProfilerBase::ProfilerBase(float opr_threshold, float var_node_threshold) | |||||
m_var_node_threshold{var_node_threshold} { | m_var_node_threshold{var_node_threshold} { | ||||
m_opr_filter = [this](const OperatorNodeBase* opr, | m_opr_filter = [this](const OperatorNodeBase* opr, | ||||
OperatorNodeBase* new_opr) { | OperatorNodeBase* new_opr) { | ||||
/// \note: for the considerations of performance, we skip nchw(naive) | |||||
/// kernels for conv bias on CUDA platform. to remove this later | |||||
if (auto conv = try_cast_as_op<opr::ConvBiasForward>(new_opr)) { | |||||
if (conv->output(0)->comp_node().device_type() == | |||||
CompNode::DeviceType::CUDA && | |||||
conv->input(0)->dtype().category() == | |||||
DTypeCategory::QUANTIZED && | |||||
conv->param().format == OprFormat::NCHW) { | |||||
return false; | |||||
} | |||||
} | |||||
float comp1 = m_opr_footprint.get_computation( | float comp1 = m_opr_footprint.get_computation( | ||||
const_cast<OperatorNodeBase*>(opr)); | const_cast<OperatorNodeBase*>(opr)); | ||||
float comp2 = m_opr_footprint.get_computation(new_opr); | float comp2 = m_opr_footprint.get_computation(new_opr); | ||||
@@ -541,18 +548,27 @@ ProfilerBase::ProfilerBase(float opr_threshold, float var_node_threshold) | |||||
return true; | return true; | ||||
}; | }; | ||||
m_var_node_filter = [this](const VarNode* var, TensorShape from, | m_var_node_filter = [this](const VarNode* var, TensorShape from, | ||||
TensorShape to, TensorFormat format) { | |||||
TensorFormat default_; | |||||
TensorLayout orig_ly, from_ly, to_ly; | |||||
if (format == default_) { | |||||
orig_ly = {var->shape(), var->dtype()}; | |||||
from_ly = {from, var->dtype()}; | |||||
to_ly = {to, var->dtype()}; | |||||
} else { | |||||
orig_ly = {var->shape(), var->dtype(), format}; | |||||
from_ly = {from, var->dtype(), format}; | |||||
to_ly = {to, var->dtype(), format}; | |||||
TensorShape to, ReformatKey key) { | |||||
/// \note: due to the alignment requirement of low-bit tensor, we skip | |||||
/// some layout transform for low-bit tensors. The skipped layout | |||||
/// transforms do not have corresponding dnn kernel and cannot be | |||||
/// implemented by tensor manip operators (like reshape, dimshuffle, | |||||
/// subtensor, etc.). | |||||
if (var->dtype().enumv() == DTypeEnum::QuantizedS4 || | |||||
var->dtype().enumv() == DTypeEnum::Quantized4Asymm) { | |||||
if (key.input_format == TensorFormats::NCHW && | |||||
key.output_format != TensorFormats::NHWC && | |||||
key.output_format != TensorFormats::NCHWc64) { | |||||
return false; | |||||
} | |||||
if (key.output_format == TensorFormats::NCHW && | |||||
key.input_format != TensorFormats::NHWC && | |||||
key.input_format != TensorFormats::NCHWc64) { | |||||
return false; | |||||
} | |||||
} | } | ||||
TensorLayout orig_ly = {var->shape(), var->dtype()}, | |||||
from_ly = {from, var->dtype()}, to_ly = {to, var->dtype()}; | |||||
float orig_memory = orig_ly.span().dist_byte() * 2.f; | float orig_memory = orig_ly.span().dist_byte() * 2.f; | ||||
float reformat_memory = | float reformat_memory = | ||||
from_ly.span().dist_byte() + to_ly.span().dist_byte(); | from_ly.span().dist_byte() + to_ly.span().dist_byte(); | ||||
@@ -329,10 +329,21 @@ ReformatManager::ReformatImpl ReformatManager::get( | |||||
const ReformatKey& key) const { | const ReformatKey& key) const { | ||||
using Attribute = ReformatKey::Attribute; | using Attribute = ReformatKey::Attribute; | ||||
MGB_TRY { | MGB_TRY { | ||||
auto find = m_cache.find(key); | |||||
if (find != m_cache.end()) { | |||||
auto rst = find->second; | |||||
return rst; | |||||
{ | |||||
auto find = m_cache.find(key); | |||||
if (find != m_cache.end()) { | |||||
auto rst = find->second; | |||||
return rst; | |||||
} | |||||
} | |||||
if (key.attribute == Attribute::AUTO_PADDING_NHWC) { | |||||
auto key_ = key; | |||||
key_.attribute = Attribute::DEFAULT; | |||||
auto find = m_cache.find(key_); | |||||
if (find != m_cache.end()) { | |||||
auto rst = find->second; | |||||
return rst; | |||||
} | |||||
} | } | ||||
mgb_assert(!(key.attribute & Attribute::IMAGE2D) && | mgb_assert(!(key.attribute & Attribute::IMAGE2D) && | ||||
!(key.attribute & Attribute::IC_SMALL)); | !(key.attribute & Attribute::IC_SMALL)); | ||||
@@ -222,8 +222,9 @@ public: | |||||
}; | }; | ||||
using OprFilter = thin_function<bool(const cg::OperatorNodeBase*, | using OprFilter = thin_function<bool(const cg::OperatorNodeBase*, | ||||
cg::OperatorNodeBase*)>; | cg::OperatorNodeBase*)>; | ||||
using VarNodeFilter = thin_function<bool(const VarNode*, TensorShape, | |||||
TensorShape, TensorFormat)>; | |||||
using VarNodeFilter = | |||||
thin_function<bool(const VarNode*, TensorShape, TensorShape, | |||||
ReformatManager::ReformatKey)>; | |||||
ProfilerBase(float opr_threshold = 2.f, float var_node_threshold = 2.f); | ProfilerBase(float opr_threshold = 2.f, float var_node_threshold = 2.f); | ||||
ProfilerBase(OprFilter opr_filter, VarNodeFilter var_node_filter = {}) | ProfilerBase(OprFilter opr_filter, VarNodeFilter var_node_filter = {}) | ||||
@@ -146,18 +146,6 @@ private: | |||||
}; | }; | ||||
MGB_DEF_ENUM_CLASS_BIT_OPR(ReformatManager::ReformatKey::Attribute); | MGB_DEF_ENUM_CLASS_BIT_OPR(ReformatManager::ReformatKey::Attribute); | ||||
// | |||||
//TensorShape make_aligned_tensor_shape( | |||||
// const VarNode* var, TensorFormats orig_formats, | |||||
// TensorFormats target_formats, | |||||
// ReformatManager::ReformatKey::Attribute extra_attribute = | |||||
// ReformatManager::ReformatKey::Attribute::DEFAULT); | |||||
// | |||||
//TensorShape make_aligned_weight_shape( | |||||
// const VarNode* var, TensorFormats orig_formats, | |||||
// TensorFormats target_formats, TensorFormats extra_formats, | |||||
// ReformatManager::ReformatKey::Attribute extra_attribute = | |||||
// ReformatManager::ReformatKey::Attribute::DEFAULT); | |||||
} // namespace gopt | } // namespace gopt | ||||
} // namespace mgb | } // namespace mgb | ||||
@@ -4104,6 +4104,79 @@ TEST(TestGoptInference, PreProcessCaseAutopadNCHW64) { | |||||
opr::RelayoutFormat::Param::Mode::NCHW_NCHW4); | opr::RelayoutFormat::Param::Mode::NCHW_NCHW4); | ||||
} | } | ||||
/// Builds an NHWC QuantizedS8 ConvBias whose input is a 3-channel tensor
/// padded with one zero channel and dimshuffled to NHWC, runs
/// optimize_for_inference with fuse-preprocess enabled, and compares the
/// optimized graph against the original one.
TEST(TestGoptInference, PreProcessCaseAutopadNHWC) {
    REQUIRE_GPU(1);
    HostTensorGenerator<dtype::Uint8, RandomDistribution::UNIFORM> rng(0, 255);
    auto comp_node = CompNode::load("gpu0");

    // the case is skipped when the device reports a capability below 75
    auto&& dev_prop =
            CompNodeEnv::from_comp_node(comp_node).cuda_env().device_prop;
    auto compute_cap = dev_prop.major * 10 + dev_prop.minor;
    if (compute_cap < 75) {
        printf("This testcast ignored due to insufficient cuda cap(got: %d, "
               "expected: %d)\n",
               compute_cap, 75);
        return;
    }

    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;

    // helper: random shared device tensor, renamed and converted to `dtype`
    auto make_const_var = [&](const char* name, const TensorShape& shp,
                              const DType& dtype) {
        auto shared = opr::SharedDeviceTensor::make(*graph, *rng(shp, comp_node))
                              .rename(name);
        return opr::TypeCvt::make(shared, dtype);
    };

    size_t batch = 2;
    size_t channels = 3;
    size_t height = 32;
    size_t width = 32;

    // uint8 input -> float32 -> shift by 128 -> QuantizedS8
    auto host_input = rng({batch, channels, height, width}, comp_node);
    auto dev_input = opr::Host2DeviceCopy::make(*graph, host_input);
    auto input_fp32 = opr::TypeCvt::make(dev_input, dtype::Float32(), comp_node);
    auto shifted_fp32 = input_fp32 - 128;
    auto input_s8 = opr::TypeCvt::make(shifted_fp32, dtype::QuantizedS8(2.5f),
                                       comp_node);

    // zero-filled {1, 1, 1, 1} tensor, broadcast into one padding channel
    auto zero_host = std::make_shared<HostTensorND>(comp_node,
                                                    dtype::QuantizedS8(2.5f));
    TensorShape unit_shape{1, 1, 1, 1};
    zero_host->resize(unit_shape);
    auto zero_ptr = zero_host->raw_ptr();
    size_t zero_bytes = TensorLayout{unit_shape, dtype::QuantizedS8(2.5f)}
                                .span()
                                .dist_byte();
    std::memset(zero_ptr, 0, zero_bytes);
    auto pad_channel = opr::ImmutableTensor::make(*graph, *zero_host);
    pad_channel = opr::Broadcast::make(pad_channel, {batch, 1, height, width});

    // concat along axis 1, then permute with {0, 2, 3, 1}
    auto padded_input = opr::Concat::make({input_s8, pad_channel}, 1);
    auto nhwc_input = opr::Dimshuffle::make(padded_input, {0, 2, 3, 1});

    auto weight = make_const_var("weight", {16, 3, 3, 4},
                                 dtype::QuantizedS8(2.5f));
    auto bias = make_const_var("bias", {1, 1, 1, 16},
                               dtype::QuantizedS32(6.25f));

    opr::ConvBias::Param conv_param;
    conv_param.format = opr::ConvBias::Param::Format::NHWC;
    conv_param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    conv_param.stride_h = conv_param.stride_w = 2;
    conv_param.pad_h = conv_param.pad_w = 1;
    auto conv_out = opr::ConvBias::make(
            nhwc_input, weight, bias, conv_param, {},
            OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    auto y = opr::TypeCvt::make(conv_out, dtype::Float32());

    // optimize with preprocess fusion enabled
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_fuse_preprocess();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);

    // dump the optimized graph as json
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.PreProcessCaseAutopadNHWC.json"));

    // run both graphs and compare the outputs
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);

    // the RelayoutFormat found from y_opt must use the NCHW_NCHW4 mode
    ASSERT_TRUE(find_opr<opr::RelayoutFormat>(y_opt).param().mode ==
                opr::RelayoutFormat::Param::Mode::NCHW_NCHW4);
}
TEST(TestGoptInference, WarpAndPreProcessCase1) { | TEST(TestGoptInference, WarpAndPreProcessCase1) { | ||||
REQUIRE_GPU(1); | REQUIRE_GPU(1); | ||||
HostTensorGenerator<dtype::Uint8, RandomDistribution::UNIFORM> gen(0, 255); | HostTensorGenerator<dtype::Uint8, RandomDistribution::UNIFORM> gen(0, 255); | ||||
@@ -10,7 +10,8 @@ | |||||
* implied. | * implied. | ||||
*/ | */ | ||||
#include "./helper.h" | |||||
#include "./network.h" | |||||
#include "megbrain/comp_node_env.h" | |||||
#include "megbrain/gopt/global_layout_transform.h" | #include "megbrain/gopt/global_layout_transform.h" | ||||
#include "megbrain/gopt/inference.h" | #include "megbrain/gopt/inference.h" | ||||
#include "megbrain/opr/dnn/pooling.h" | #include "megbrain/opr/dnn/pooling.h" | ||||
@@ -24,23 +25,145 @@ using namespace gopt; | |||||
using namespace serialization; | using namespace serialization; | ||||
#if MGB_CUDA | #if MGB_CUDA | ||||
TEST(TestLayoutTransform, Feature) { | |||||
auto inp_file = InputFile::make_fs("./feat.mdl"); | |||||
namespace { | |||||
//! find first the operator of specific type; raise exception if not found | |||||
template <typename T> | |||||
T& find_opr(SymbolVar endpoint) { | |||||
T* found = nullptr; | |||||
auto cb = [&found](cg::OperatorNodeBase* opr) { | |||||
if (!found && opr->same_type<T>()) { | |||||
found = &opr->cast_final_safe<T>(); | |||||
} | |||||
}; | |||||
cg::DepOprIter{cb}.add(endpoint.node()->owner_opr()); | |||||
mgb_assert(found, "not found opr from %s", endpoint.node()->name().c_str()); | |||||
return *found; | |||||
} | |||||
auto format = GraphLoader::identify_graph_dump_format(*inp_file); | |||||
ASSERT_TRUE(format.valid()); | |||||
auto loader = GraphLoader::make(std::move(inp_file), format.val()); | |||||
template <typename T> | |||||
size_t find_opr_num(SymbolVar endpoint) { | |||||
size_t opr_num = 0; | |||||
auto cb = [&opr_num](cg::OperatorNodeBase* opr) { | |||||
if (opr->same_type<T>()) { | |||||
printf("%s, %s\n", opr->cname(), opr->dyn_typeinfo()->name); | |||||
opr_num++; | |||||
} | |||||
}; | |||||
cg::DepOprIter{cb}.add(endpoint.node()->owner_opr()); | |||||
return opr_num; | |||||
} | |||||
} // namespace | |||||
TEST(TestLayoutTransform, Resnet18_QS8) { | |||||
REQUIRE_GPU(1); | |||||
auto cn = CompNode::load("gpu0"); | |||||
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; | |||||
auto sm_ver = prop.major * 10 + prop.minor; | |||||
if (sm_ver < 75) { | |||||
printf("This testcast ignored due to insufficient cuda cap(got: %d, " | |||||
"expected: %d)\n", | |||||
sm_ver, 75); | |||||
return; | |||||
} | |||||
Network network(cn); | |||||
/// batch size = 1 reduce test time | |||||
auto output = make_resnet18(network, 16, dtype::QuantizedS8{1.f}); | |||||
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; | |||||
S strategy = S::PROFILE; | |||||
gopt::modify_opr_algo_strategy_inplace({{output}}, strategy); | |||||
GraphLoader::LoadConfig load_config; | |||||
load_config.comp_graph = ComputingGraph::make(); | |||||
auto&& graph_opt = load_config.comp_graph->options(); | |||||
graph_opt.graph_opt.enable_fuse_conv_bias_nonlinearity(); | |||||
graph_opt.graph_opt.enable_fuse_conv_bias_with_z(); | |||||
auto ret = loader->load(load_config, false); | |||||
HostTensorND t1; | |||||
auto func1 = network.graph->compile({make_callback_copy(output, t1)}); | |||||
func1->execute(); | |||||
using OprFormat = LayoutTransformContext::OprFormat; | |||||
using OprList = LayoutTransformContext::OprList; | |||||
using ReformatAttribute = LayoutTransformContext::ReformatAttribute; | |||||
using Attribute = LayoutTransformContext::Attribute; | |||||
OprList opr_list = { | |||||
opr::ConvBiasForward::typeinfo(), | |||||
opr::ElemwiseMultiType::typeinfo(), | |||||
opr::Elemwise::typeinfo(), | |||||
opr::TypeCvt::typeinfo(), | |||||
opr::PoolingForward::typeinfo(), | |||||
opr::WarpPerspectiveForward::typeinfo(), | |||||
}; | |||||
SmallVector<TensorFormats> available_tensor_formats = { | |||||
TensorFormats::NCHW, TensorFormats::NHWC, TensorFormats::NCHWc4, | |||||
TensorFormats::NCHWc32, TensorFormats::CHWNc4}; | |||||
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW, | |||||
ReformatAttribute::AUTO_PADDING_NHWC}; | |||||
auto ctx = std::make_unique<LayoutTransformContext>( | |||||
std::move(opr_list), std::move(available_tensor_formats), | |||||
attribute); | |||||
ctx->add_opr_config(opr::ConvBiasForward::typeinfo(), | |||||
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4, | |||||
OprFormat::NHWC}) | |||||
.add_opr_config(opr::PoolingForward::typeinfo(), | |||||
{OprFormat::NCHW4, OprFormat::NCHW32, | |||||
OprFormat::NHWC, OprFormat::CHWN4}); | |||||
auto profiler = ProfilerBase::make_profiler(); | |||||
std::unique_ptr<SolverBase> solver{ | |||||
new DynamicProgrammingSolver(std::move(profiler))}; | |||||
auto new_output = gopt::GraphOptimizer{} | |||||
.add_pass<FuseConvBiasNonlinPass>() | |||||
.add_pass<FuseConvBiasZPass>() | |||||
.add_pass<LayoutTransformPass>(std::move(ctx), | |||||
std::move(solver)) | |||||
.add_pass<ShuffleShuffleRemovePass>() | |||||
.add_pass(FuseNCHW4Int8Preprocess::make()) | |||||
.add_pass<FoldingConvBiasDimshufflePass>() | |||||
.add_pass<ParamFusePass>() | |||||
.add_pass<ParamMergePass>() | |||||
.apply({{output}}) | |||||
.endpoint_vars(); | |||||
auto new_out_var = new_output[0]; | |||||
/// check global layout transform pass | |||||
auto nr_dimshuffle = find_opr_num<opr::Dimshuffle>(new_out_var); | |||||
ASSERT_EQ(nr_dimshuffle, 3u); | |||||
/// check pass fuse conv bias with z | |||||
auto nr_elemwise_mult_type = | |||||
find_opr_num<opr::ElemwiseMultiType>(new_out_var); | |||||
ASSERT_EQ(nr_elemwise_mult_type, 4u); | |||||
/// 21 convolutions, 21 weights and 21 bias, total 42 parameters | |||||
const auto& param_merge = | |||||
find_opr<opr::MultipleDeviceTensorHolder>(new_out_var); | |||||
ASSERT_EQ(param_merge.output().size(), 42u); | |||||
/// check first conv format | |||||
const auto& first_conv = find_opr<opr::ConvBiasForward>(new_out_var); | |||||
const auto& cast = first_conv.cast_final_safe<opr::ConvBiasForward>(); | |||||
ASSERT_EQ(cast.param().format, opr::ConvBias::Param::Format::NCHW4); | |||||
GraphProfiler gprof{network.graph.get()}; | |||||
HostTensorND t2; | |||||
auto func2 = network.graph->compile({make_callback_copy(new_out_var, t2)}); | |||||
func2->execute(); | |||||
gprof.to_json_full(func2.get()) | |||||
->writeto_fpath(output_file("resnet18_qs8.json")); | |||||
/// check correct | |||||
MGB_ASSERT_TENSOR_EQ(t1, t2); | |||||
} | |||||
TEST(TestLayoutTransform, Resnet18_QS4) { | |||||
REQUIRE_GPU(1); | |||||
auto cn = CompNode::load("gpu0"); | |||||
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; | |||||
auto sm_ver = prop.major * 10 + prop.minor; | |||||
if (sm_ver < 75) { | |||||
printf("This testcast ignored due to insufficient cuda cap(got: %d, " | |||||
"expected: %d)\n", | |||||
sm_ver, 75); | |||||
return; | |||||
} | |||||
Network network(cn); | |||||
auto output = make_resnet18(network, 16, dtype::QuantizedS4{1.f}); | |||||
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; | using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; | ||||
S strategy = S::PROFILE; | S strategy = S::PROFILE; | ||||
gopt::modify_opr_algo_strategy_inplace({ret.output_var_list}, strategy); | |||||
gopt::modify_opr_algo_strategy_inplace({{output}}, strategy); | |||||
HostTensorND t1; | |||||
auto func1 = network.graph->compile({make_callback_copy(output, t1)}); | |||||
func1->execute(); | |||||
using OprFormat = LayoutTransformContext::OprFormat; | using OprFormat = LayoutTransformContext::OprFormat; | ||||
using OprList = LayoutTransformContext::OprList; | using OprList = LayoutTransformContext::OprList; | ||||
@@ -55,74 +178,113 @@ TEST(TestLayoutTransform, Feature) { | |||||
opr::WarpPerspectiveForward::typeinfo(), | opr::WarpPerspectiveForward::typeinfo(), | ||||
}; | }; | ||||
SmallVector<TensorFormats> available_tensor_formats = { | SmallVector<TensorFormats> available_tensor_formats = { | ||||
TensorFormats::NCHWc4, TensorFormats::NCHWc32, | |||||
TensorFormats::CHWNc4}; | |||||
Attribute attribute = {OprFormat::NCHW4, TensorFormats::NCHWc4, | |||||
ReformatAttribute::DEFAULT}; | |||||
TensorFormats::NCHW, TensorFormats::NHWC, | |||||
TensorFormats::NCHWc4, TensorFormats::NCHWc32, | |||||
TensorFormats::NCHWc64, TensorFormats::CHWNc4}; | |||||
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW, | |||||
ReformatAttribute::AUTO_PADDING_NHWC}; | |||||
auto ctx = std::make_unique<LayoutTransformContext>( | auto ctx = std::make_unique<LayoutTransformContext>( | ||||
std::move(opr_list), std::move(available_tensor_formats), | std::move(opr_list), std::move(available_tensor_formats), | ||||
attribute); | attribute); | ||||
ctx->add_opr_config(opr::ConvBiasForward::typeinfo(), | ctx->add_opr_config(opr::ConvBiasForward::typeinfo(), | ||||
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4}) | |||||
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4, | |||||
OprFormat::NHWC, OprFormat::NCHW64}) | |||||
.add_opr_config( | .add_opr_config( | ||||
opr::PoolingForward::typeinfo(), | opr::PoolingForward::typeinfo(), | ||||
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4}) | |||||
.add_opr_config(opr::WarpPerspectiveForward::typeinfo(), | |||||
OprFormat::NCHW4); | |||||
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NCHW64, | |||||
OprFormat::NHWC, OprFormat::CHWN4}); | |||||
auto profiler = ProfilerBase::make_profiler(); | auto profiler = ProfilerBase::make_profiler(); | ||||
auto filter = [](const GraphPartition& partition) { | |||||
auto has_nchw4_conv = false; | |||||
for (auto&& opr : partition.all_oprs()) { | |||||
if (opr->dyn_typeinfo() == opr::ConvBiasForward::typeinfo()) { | |||||
auto& conv = opr->cast_final_safe<opr::ConvBiasForward>(); | |||||
if (conv.param().format == | |||||
LayoutTransformContext::OprFormat::NCHW4) { | |||||
has_nchw4_conv = true; | |||||
break; | |||||
} | |||||
} | |||||
} | |||||
return has_nchw4_conv; | |||||
}; | |||||
std::unique_ptr<SolverBase> solver{new DynamicProgrammingSolver( | |||||
std::move(profiler), std::move(filter))}; | |||||
auto new_out_vars = gopt::GraphOptimizer{} | |||||
.add_pass<FuseConvBiasNonlinPass>() | |||||
.add_pass<FuseConvBiasZPass>() | |||||
.add_pass<LayoutTransformPass>( | |||||
std::move(ctx), std::move(solver)) | |||||
.add_pass<ShuffleShuffleRemovePass>() | |||||
.add_pass(FuseNCHW4Int8Preprocess::make()) | |||||
.add_pass<FoldingConvBiasDimshufflePass>() | |||||
.add_pass<ParamFusePass>() | |||||
.add_pass<ParamMergePass>() | |||||
.apply(ret.output_var_list) | |||||
.endpoint_vars(); | |||||
auto dumper = GraphDumper::make(OutputFile::make_fs("model_opt.mgb")); | |||||
dumper->dump({new_out_vars}); | |||||
std::unique_ptr<SolverBase> solver{ | |||||
new DynamicProgrammingSolver(std::move(profiler))}; | |||||
auto new_output = gopt::GraphOptimizer{} | |||||
.add_pass<FuseConvBiasNonlinPass>() | |||||
.add_pass<FuseConvBiasZPass>() | |||||
.add_pass<LayoutTransformPass>(std::move(ctx), | |||||
std::move(solver)) | |||||
.add_pass<ShuffleShuffleRemovePass>() | |||||
.add_pass(FuseNCHW4Int8Preprocess::make()) | |||||
.add_pass<FoldingConvBiasDimshufflePass>() | |||||
.add_pass<ParamFusePass>() | |||||
.add_pass<ParamMergePass>() | |||||
.apply({{output}}) | |||||
.endpoint_vars(); | |||||
auto new_out_var = new_output[0]; | |||||
/// check global layout transform pass | |||||
auto nr_dimshuffle = find_opr_num<opr::Dimshuffle>(new_out_var); | |||||
ASSERT_EQ(nr_dimshuffle, 3u); | |||||
/// check pass fuse conv bias with z | |||||
auto nr_elemwise_mult_type = | |||||
find_opr_num<opr::ElemwiseMultiType>(new_out_var); | |||||
ASSERT_EQ(nr_elemwise_mult_type, 4u); | |||||
/// 21 convolutions, 21 weights and 21 bias, total 42 parameters | |||||
const auto& param_merge = | |||||
find_opr<opr::MultipleDeviceTensorHolder>(new_out_var); | |||||
ASSERT_EQ(param_merge.output().size(), 42u); | |||||
/// check first conv format | |||||
const auto& first_conv = find_opr<opr::ConvBiasForward>(new_out_var); | |||||
const auto& cast = first_conv.cast_final_safe<opr::ConvBiasForward>(); | |||||
ASSERT_EQ(cast.param().format, opr::ConvBias::Param::Format::NHWC); | |||||
GraphProfiler gprof{network.graph.get()}; | |||||
HostTensorND t2; | |||||
auto func2 = network.graph->compile({make_callback_copy(new_out_var, t2)}); | |||||
func2->execute(); | |||||
gprof.to_json_full(func2.get()) | |||||
->writeto_fpath(output_file("resnet18_qs4.json")); | |||||
MGB_ASSERT_TENSOR_EQ(t1, t2); | |||||
} | } | ||||
TEST(TestLayoutTransform, Detection) { | |||||
auto inp_file = InputFile::make_fs("./det.mdl"); | |||||
static const char* magic = "mgbteset0"; | |||||
size_t skip_size = sizeof(magic) + sizeof(uint32_t); | |||||
char skip[skip_size]; | |||||
inp_file->read(skip, skip_size); | |||||
TEST(TestLayoutTransform, Resnet18_NCHW64) { | |||||
REQUIRE_GPU(1); | |||||
auto cn = CompNode::load("gpu0"); | |||||
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; | |||||
auto sm_ver = prop.major * 10 + prop.minor; | |||||
if (sm_ver < 75) { | |||||
printf("This testcast ignored due to insufficient cuda cap(got: %d, " | |||||
"expected: %d)\n", | |||||
sm_ver, 75); | |||||
return; | |||||
} | |||||
Network network(cn); | |||||
auto output = make_resnet18(network, 64, dtype::QuantizedS4{1.f}); | |||||
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; | |||||
S strategy = S::PROFILE; | |||||
gopt::modify_opr_algo_strategy_inplace({{output}}, strategy); | |||||
auto format = GraphLoader::identify_graph_dump_format(*inp_file); | |||||
ASSERT_TRUE(format.valid()); | |||||
auto loader = GraphLoader::make(std::move(inp_file), format.val()); | |||||
HostTensorND t1; | |||||
auto func1 = network.graph->compile({make_callback_copy(output, t1)}); | |||||
func1->execute(); | |||||
GraphLoader::LoadConfig load_config; | |||||
load_config.comp_graph = ComputingGraph::make(); | |||||
auto&& graph_opt = load_config.comp_graph->options(); | |||||
graph_opt.graph_opt.enable_fuse_conv_bias_nonlinearity(); | |||||
graph_opt.graph_opt.enable_fuse_conv_bias_with_z(); | |||||
auto ret = loader->load(load_config, false); | |||||
SymbolVar new_out_var; | |||||
auto options = gopt::OptimizeForInferenceOptions{}; | |||||
options.enable_nchw64(); | |||||
unpack_vector(gopt::optimize_for_inference({output}, options), new_out_var); | |||||
GraphProfiler gprof{network.graph.get()}; | |||||
HostTensorND t2; | |||||
auto func2 = network.graph->compile({make_callback_copy(new_out_var, t2)}); | |||||
func2->execute(); | |||||
gprof.to_json_full(func2.get()) | |||||
->writeto_fpath(output_file("resnet18_nchw64.json")); | |||||
MGB_ASSERT_TENSOR_EQ(t1, t2); | |||||
} | |||||
TEST(TestLayoutTransform, Detection_QS8) { | |||||
REQUIRE_GPU(1); | |||||
auto cn = CompNode::load("gpu0"); | |||||
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; | |||||
auto sm_ver = prop.major * 10 + prop.minor; | |||||
if (sm_ver < 75) { | |||||
printf("This testcast ignored due to insufficient cuda cap(got: %d, " | |||||
"expected: %d)\n", | |||||
sm_ver, 75); | |||||
return; | |||||
} | |||||
Network network(cn); | |||||
auto outputs = make_det(network, 16, dtype::QuantizedS8{1.f}); | |||||
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; | using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; | ||||
S strategy = S::PROFILE; | S strategy = S::PROFILE; | ||||
gopt::modify_opr_algo_strategy_inplace({ret.output_var_list}, strategy); | |||||
gopt::modify_opr_algo_strategy_inplace({outputs}, strategy); | |||||
using OprFormat = LayoutTransformContext::OprFormat; | using OprFormat = LayoutTransformContext::OprFormat; | ||||
using OprList = LayoutTransformContext::OprList; | using OprList = LayoutTransformContext::OprList; | ||||
@@ -130,8 +292,6 @@ TEST(TestLayoutTransform, Detection) { | |||||
using Attribute = LayoutTransformContext::Attribute; | using Attribute = LayoutTransformContext::Attribute; | ||||
OprList opr_list = { | OprList opr_list = { | ||||
opr::ConvBiasForward::typeinfo(), | opr::ConvBiasForward::typeinfo(), | ||||
opr::ConvolutionForward::typeinfo(), | |||||
opr::ConvolutionBackwardData::typeinfo(), | |||||
opr::ElemwiseMultiType::typeinfo(), | opr::ElemwiseMultiType::typeinfo(), | ||||
opr::Elemwise::typeinfo(), | opr::Elemwise::typeinfo(), | ||||
opr::TypeCvt::typeinfo(), | opr::TypeCvt::typeinfo(), | ||||
@@ -143,51 +303,228 @@ TEST(TestLayoutTransform, Detection) { | |||||
TensorFormats::NCHWc4, TensorFormats::NCHWc32, | TensorFormats::NCHWc4, TensorFormats::NCHWc32, | ||||
TensorFormats::NCHWc64, TensorFormats::CHWNc4}; | TensorFormats::NCHWc64, TensorFormats::CHWNc4}; | ||||
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW, | Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW, | ||||
ReformatAttribute::DEFAULT}; | |||||
ReformatAttribute::AUTO_PADDING_NHWC}; | |||||
auto ctx = std::make_unique<LayoutTransformContext>( | auto ctx = std::make_unique<LayoutTransformContext>( | ||||
std::move(opr_list), std::move(available_tensor_formats), | std::move(opr_list), std::move(available_tensor_formats), | ||||
attribute); | attribute); | ||||
ctx->add_opr_config( | |||||
opr::ConvBiasForward::typeinfo(), | |||||
{OprFormat::NCHW, OprFormat::NHWC, OprFormat::NCHW4, | |||||
OprFormat::NCHW32, OprFormat::NCHW64, OprFormat::CHWN4}) | |||||
.add_opr_config(opr::ConvolutionForward::typeinfo(), | |||||
{OprFormat::NCHW, OprFormat::NCHW4}) | |||||
.add_opr_config(opr::ConvolutionBackwardData::typeinfo(), | |||||
{OprFormat::NCHW, OprFormat::NCHW4}) | |||||
ctx->add_opr_config(opr::ConvBiasForward::typeinfo(), | |||||
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4, | |||||
OprFormat::NHWC, OprFormat::NCHW64}) | |||||
.add_opr_config( | .add_opr_config( | ||||
opr::PoolingForward::typeinfo(), | opr::PoolingForward::typeinfo(), | ||||
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NHWC, | |||||
OprFormat::NCHW64, OprFormat::CHWN4}) | |||||
.add_opr_config( | |||||
opr::WarpPerspectiveForward::typeinfo(), | |||||
{OprFormat::NHWC, OprFormat::NCHW4, OprFormat::NCHW64}); | |||||
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NCHW64, | |||||
OprFormat::NHWC, OprFormat::CHWN4}); | |||||
auto profiler = ProfilerBase::make_profiler(); | |||||
std::unique_ptr<SolverBase> solver{ | |||||
new DynamicProgrammingSolver(std::move(profiler))}; | |||||
auto new_outputs = gopt::GraphOptimizer{} | |||||
.add_pass<FuseConvBiasNonlinPass>() | |||||
.add_pass<FuseConvBiasZPass>() | |||||
.add_pass<LayoutTransformPass>(std::move(ctx), | |||||
std::move(solver)) | |||||
.add_pass<ShuffleShuffleRemovePass>() | |||||
.add_pass(FuseNCHW4Int8Preprocess::make()) | |||||
.add_pass<FoldingConvBiasDimshufflePass>() | |||||
.add_pass<ParamFusePass>() | |||||
.add_pass<ParamMergePass>() | |||||
.apply({{outputs}}) | |||||
.endpoint_vars(); | |||||
GraphProfiler gprof{network.graph.get()}; | |||||
using OutputSpecItem = cg::ComputingGraph::OutputSpecItem; | |||||
std::vector<OutputSpecItem> output_spec; | |||||
for (const auto& i : new_outputs) { | |||||
output_spec.emplace_back(OutputSpecItem{i, {}}); | |||||
} | |||||
auto func = network.graph->compile(output_spec); | |||||
func->execute(); | |||||
gprof.to_json_full(func.get())->writeto_fpath(output_file("det_qs8.json")); | |||||
} | |||||
TEST(TestLayoutTransform, Detection_QS4) { | |||||
REQUIRE_GPU(1); | |||||
auto cn = CompNode::load("gpu0"); | |||||
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; | |||||
auto sm_ver = prop.major * 10 + prop.minor; | |||||
if (sm_ver < 75) { | |||||
printf("This testcast ignored due to insufficient cuda cap(got: %d, " | |||||
"expected: %d)\n", | |||||
sm_ver, 75); | |||||
return; | |||||
} | |||||
Network network(cn); | |||||
auto outputs = make_det(network, 16, dtype::QuantizedS4{1.f}); | |||||
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; | |||||
S strategy = S::PROFILE; | |||||
gopt::modify_opr_algo_strategy_inplace({outputs}, strategy); | |||||
using OprFormat = LayoutTransformContext::OprFormat; | |||||
using OprList = LayoutTransformContext::OprList; | |||||
using ReformatAttribute = LayoutTransformContext::ReformatAttribute; | |||||
using Attribute = LayoutTransformContext::Attribute; | |||||
OprList opr_list = { | |||||
opr::ConvBiasForward::typeinfo(), | |||||
opr::ElemwiseMultiType::typeinfo(), | |||||
opr::Elemwise::typeinfo(), | |||||
opr::TypeCvt::typeinfo(), | |||||
opr::PoolingForward::typeinfo(), | |||||
opr::WarpPerspectiveForward::typeinfo(), | |||||
}; | |||||
SmallVector<TensorFormats> available_tensor_formats = { | |||||
TensorFormats::NCHW, TensorFormats::NHWC, | |||||
TensorFormats::NCHWc4, TensorFormats::NCHWc32, | |||||
TensorFormats::NCHWc64, TensorFormats::CHWNc4}; | |||||
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW, | |||||
ReformatAttribute::AUTO_PADDING_NHWC}; | |||||
auto ctx = std::make_unique<LayoutTransformContext>( | |||||
std::move(opr_list), std::move(available_tensor_formats), | |||||
attribute); | |||||
ctx->add_opr_config(opr::ConvBiasForward::typeinfo(), | |||||
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4, | |||||
OprFormat::NHWC, OprFormat::NCHW64}) | |||||
.add_opr_config( | |||||
opr::PoolingForward::typeinfo(), | |||||
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NCHW64, | |||||
OprFormat::NHWC, OprFormat::CHWN4}); | |||||
auto profiler = ProfilerBase::make_profiler(); | auto profiler = ProfilerBase::make_profiler(); | ||||
std::unique_ptr<SolverBase> solver{ | std::unique_ptr<SolverBase> solver{ | ||||
new DynamicProgrammingSolver(std::move(profiler))}; | new DynamicProgrammingSolver(std::move(profiler))}; | ||||
auto new_out_vars = gopt::GraphOptimizer{} | |||||
.add_pass<LayoutTransformPass>( | |||||
std::move(ctx), std::move(solver)) | |||||
.add_pass<ShuffleShuffleRemovePass>() | |||||
.add_pass(FuseNCHW4Int8Preprocess::make()) | |||||
.add_pass<FoldingConvBiasDimshufflePass>() | |||||
.add_pass<ParamFusePass>() | |||||
.add_pass<ParamMergePass>() | |||||
.apply(ret.output_var_list) | |||||
.endpoint_vars(); | |||||
auto new_outputs = gopt::GraphOptimizer{} | |||||
.add_pass<FuseConvBiasNonlinPass>() | |||||
.add_pass<FuseConvBiasZPass>() | |||||
.add_pass<LayoutTransformPass>(std::move(ctx), | |||||
std::move(solver)) | |||||
.add_pass<ShuffleShuffleRemovePass>() | |||||
.add_pass(FuseNCHW4Int8Preprocess::make()) | |||||
.add_pass<FoldingConvBiasDimshufflePass>() | |||||
.add_pass<ParamFusePass>() | |||||
.add_pass<ParamMergePass>() | |||||
.apply({{outputs}}) | |||||
.endpoint_vars(); | |||||
GraphProfiler gprof{network.graph.get()}; | |||||
using OutputSpecItem = cg::ComputingGraph::OutputSpecItem; | using OutputSpecItem = cg::ComputingGraph::OutputSpecItem; | ||||
std::vector<OutputSpecItem> outs(new_out_vars.size()); | |||||
for (size_t i = 0; i < new_out_vars.size(); ++i) { | |||||
auto cb = [](DeviceTensorND& /* d */) {}; | |||||
outs[i] = std::make_pair(new_out_vars[i], cb); | |||||
std::vector<OutputSpecItem> output_spec; | |||||
for (const auto& i : new_outputs) { | |||||
output_spec.emplace_back(OutputSpecItem{i, {}}); | |||||
} | } | ||||
GraphProfiler gprof{load_config.comp_graph.get()}; | |||||
auto func = load_config.comp_graph->compile(outs); | |||||
for (size_t i = 0; i < 10; ++i) | |||||
func->execute(); | |||||
func->wait(); | |||||
gprof.to_json_full(func.get())->writeto_fpath(output_file("det.json")); | |||||
auto func = network.graph->compile(output_spec); | |||||
func->execute(); | |||||
gprof.to_json_full(func.get())->writeto_fpath(output_file("det_qs4.json")); | |||||
} | |||||
/*! | |||||
* test the performance of the solver when network is wide. | |||||
*/ | |||||
TEST(TestLayoutTransform, Wide) { | |||||
REQUIRE_GPU(1); | |||||
auto cn = CompNode::load("gpu0"); | |||||
Network network(cn); | |||||
auto data = network.add_var("data", {16, 3, 64, 64}); | |||||
auto f = network.add_conv(data, 16, {3, 3}, dtype::Float32(), true, {2, 2}, | |||||
{1, 1}); | |||||
f = network.add_conv(f, 16, {3, 3}, dtype::Float32(), true, {2, 2}, {1, 1}); | |||||
f = network.add_conv(f, 16, {3, 3}, dtype::Float32(), true, {2, 2}, {1, 1}); | |||||
SymbolVarArray stages; | |||||
for (size_t i = 0; i < 8; ++i) { | |||||
f = f * f + f; | |||||
stages.push_back(f); | |||||
} | |||||
auto y = stages[0]; | |||||
for (size_t i = 1; i < stages.size(); ++i) { | |||||
y = y + stages[i]; | |||||
} | |||||
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; | |||||
S strategy = S::PROFILE; | |||||
gopt::modify_opr_algo_strategy_inplace({y}, strategy); | |||||
using OprFormat = LayoutTransformContext::OprFormat; | |||||
using OprList = LayoutTransformContext::OprList; | |||||
using ReformatAttribute = LayoutTransformContext::ReformatAttribute; | |||||
using Attribute = LayoutTransformContext::Attribute; | |||||
OprList opr_list = { | |||||
opr::ConvBiasForward::typeinfo(), | |||||
opr::Elemwise::typeinfo(), | |||||
}; | |||||
SmallVector<TensorFormats> available_tensor_formats = {TensorFormats::NCHW, | |||||
TensorFormats::NHWC}; | |||||
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW, | |||||
ReformatAttribute::DEFAULT}; | |||||
auto ctx = std::make_unique<LayoutTransformContext>( | |||||
std::move(opr_list), std::move(available_tensor_formats), | |||||
attribute); | |||||
ctx->add_opr_config(opr::ConvBiasForward::typeinfo(), | |||||
{OprFormat::NCHW, OprFormat::NHWC}); | |||||
auto profiler = ProfilerBase::make_profiler(); | |||||
std::unique_ptr<SolverBase> solver{ | |||||
new DynamicProgrammingSolver(std::move(profiler))}; | |||||
auto v = gopt::GraphOptimizer{} | |||||
.add_pass<FuseConvBiasNonlinPass>() | |||||
.add_pass<FuseConvBiasZPass>() | |||||
.add_pass<LayoutTransformPass>(std::move(ctx), | |||||
std::move(solver)) | |||||
.add_pass<ShuffleShuffleRemovePass>() | |||||
.add_pass<ParamFusePass>() | |||||
.add_pass<ParamMergePass>() | |||||
.apply({{y}}) | |||||
.endpoint_vars(); | |||||
const auto& sym_o = v[0]; | |||||
GraphProfiler gprof{network.graph.get()}; | |||||
auto func = network.graph->compile({{sym_o, {}}}); | |||||
func->execute(); | |||||
gprof.to_json_full(func.get())->writeto_fpath(output_file("wide.json")); | |||||
/// check global layout transform pass, no dimshuffle | |||||
auto nr_dimshuffle = find_opr_num<opr::Dimshuffle>(sym_o); | |||||
ASSERT_EQ(nr_dimshuffle, 0u); | |||||
auto nr_param_merge = find_opr_num<opr::MultipleDeviceTensorHolder>(sym_o); | |||||
ASSERT_EQ(nr_param_merge, 1u); | |||||
/// check first conv format | |||||
const auto& first_conv = find_opr<opr::ConvBiasForward>(sym_o); | |||||
const auto& cast = first_conv.cast_final_safe<opr::ConvBiasForward>(); | |||||
ASSERT_EQ(cast.param().format, opr::ConvBias::Param::Format::NCHW); | |||||
} | |||||
/*!
 * Compare a quantized FUSE_ADD_RELU evaluated directly on the input layout
 * against the same operation evaluated between NCHW_NCHW64 / NCHW64_NCHW
 * relayouts.
 */
TEST(TestLayoutTransform, ElemwiseMultiType) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    Network network(cn);
    using RelayoutMode = megdnn::param::RelayoutFormat::Mode;
    constexpr auto kAddRelu = opr::Elemwise::Mode::FUSE_ADD_RELU;

    auto x = network.add_var("x", {64, 64, 1, 2});
    auto y = network.add_var("y", {64, 64, 1, 2});
    x = network.add_type_cvt(x, dtype::QuantizedS4{1.f});
    y = network.add_type_cvt(y, dtype::QuantizedS4{1.f});

    // Float path: dequantize, add+relu in float32, then round-trip through
    // QuantizedS4.
    auto x_ = network.add_type_cvt(x, dtype::Float32());
    auto y_ = network.add_type_cvt(y, dtype::Float32());
    auto z = network.add_elemwise({x_, y_}, dtype::Float32(), kAddRelu);
    z = network.add_type_cvt(z, dtype::QuantizedS4{1.f});
    z = network.add_type_cvt(z, dtype::Float32());

    // Quantized path on the original layout.
    auto z2 = network.add_elemwise({x, y}, dtype::QuantizedS4{1.f}, kAddRelu);
    z2 = network.add_type_cvt(z2, dtype::Float32());

    HostTensorND t1;
    auto func1 = network.graph->compile({make_callback_copy(z, t1)});
    func1->execute();

    HostTensorND t3;
    auto func3 = network.graph->compile({make_callback_copy(z2, t3)});
    func3->execute();

    // Quantized path wrapped in NCHW -> NCHW64 -> NCHW relayouts.
    auto alter_x = opr::RelayoutFormat::make(x, RelayoutMode::NCHW_NCHW64);
    auto alter_y = opr::RelayoutFormat::make(y, RelayoutMode::NCHW_NCHW64);
    auto alter_z = network.add_elemwise({alter_x, alter_y},
                                        dtype::QuantizedS4{1.f}, kAddRelu);
    alter_z = opr::RelayoutFormat::make(alter_z, RelayoutMode::NCHW64_NCHW);
    alter_z = network.add_type_cvt(alter_z, dtype::Float32());

    HostTensorND t2;
    auto func2 = network.graph->compile({make_callback_copy(alter_z, t2)});
    func2->execute();

    // The float path (t1) is still computed, but its comparison is disabled.
    // MGB_ASSERT_TENSOR_EQ(t1, t3);
    MGB_ASSERT_TENSOR_EQ(t2, t3);
}
TEST(TestLayoutTransform, DetectionHead) { | TEST(TestLayoutTransform, DetectionHead) { | ||||
@@ -196,7 +533,7 @@ TEST(TestLayoutTransform, DetectionHead) { | |||||
cn.activate(); | cn.activate(); | ||||
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); | REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); | ||||
constexpr size_t N = 16, C = 3, H = 768, W = 1280; | |||||
constexpr size_t N = 16, C = 3, H = 736, W = 1280; | |||||
HostTensorGenerator<dtype::Uint8> gen; | HostTensorGenerator<dtype::Uint8> gen; | ||||
auto graph = ComputingGraph::make(); | auto graph = ComputingGraph::make(); | ||||
@@ -284,20 +621,71 @@ TEST(TestLayoutTransform, DetectionHead) { | |||||
.add_pass<ParamMergePass>() | .add_pass<ParamMergePass>() | ||||
.apply(SymbolVarArray{y}) | .apply(SymbolVarArray{y}) | ||||
.endpoint_vars(); | .endpoint_vars(); | ||||
const auto& v = new_out_vars[0]; | |||||
using OutputSpecItem = cg::ComputingGraph::OutputSpecItem; | using OutputSpecItem = cg::ComputingGraph::OutputSpecItem; | ||||
std::vector<OutputSpecItem> outs(new_out_vars.size()); | |||||
for (size_t i = 0; i < new_out_vars.size(); ++i) { | |||||
auto cb = [](DeviceTensorND& /* d */) {}; | |||||
outs[i] = std::make_pair(new_out_vars[i], cb); | |||||
std::vector<OutputSpecItem> outs; | |||||
for (const auto& i : new_out_vars) { | |||||
outs.emplace_back(OutputSpecItem{i, {}}); | |||||
} | } | ||||
GraphProfiler gprof{graph.get()}; | GraphProfiler gprof{graph.get()}; | ||||
auto func = graph->compile(outs); | auto func = graph->compile(outs); | ||||
for (size_t i = 0; i < 10; ++i) | |||||
func->execute(); | |||||
func->wait(); | |||||
func->execute(); | |||||
gprof.to_json_full(func.get())->writeto_fpath(output_file("det_head.json")); | gprof.to_json_full(func.get())->writeto_fpath(output_file("det_head.json")); | ||||
/// check reformat | |||||
auto nr_reformat = find_opr_num<opr::RelayoutFormat>(v); | |||||
ASSERT_EQ(nr_reformat, 2u); | |||||
/// check dimshuffle | |||||
auto nr_dimshuffle = find_opr_num<opr::Dimshuffle>(v); | |||||
ASSERT_EQ(nr_dimshuffle, 0u); | |||||
/// check conv_bias | |||||
auto nr_conv = find_opr_num<opr::ConvBiasForward>(v); | |||||
ASSERT_EQ(nr_conv, 2u); | |||||
/// check first conv format | |||||
const auto& first_conv = find_opr<opr::ConvBiasForward>(v); | |||||
const auto& cast = first_conv.cast_final_safe<opr::ConvBiasForward>(); | |||||
ASSERT_EQ(cast.param().format, opr::ConvBias::Param::Format::NCHW4_NHWC); | |||||
} | } | ||||
#endif | #endif | ||||
/*!
 * Build an NCHW4 -> NHWC reformat followed by a Reshape, run
 * ShuffleShuffleRemovePass on it, and check both the resulting graph
 * structure and that the computed values are unchanged.
 */
TEST(TestLayoutTransform, CanonicalizeLayoutTransform) {
    constexpr size_t N = 64, C = 64, H = 1, W = 1;
    auto cn = CompNode::load("xpu0");
    Network network(cn);

    auto x = network.add_var("x", {N, C / 4, H, W, 4});
    x = network.add_type_cvt(x, dtype::QuantizedS4{1.f});

    // Emit the reformat sub-graph from named tensor shapes.
    using NamedTensorShape = megdnn::NamedTensorShape;
    auto src = NamedTensorShape::make_named_tensor_shape(
            NamedTensorShape::Format::NCHW4);
    auto dst = NamedTensorShape::make_named_tensor_shape(
            NamedTensorShape::Format::NHWC);
    auto [builder, _] = gopt::ReformatEmitter(src, dst).emit();
    MGB_MARK_USED_VAR(_);
    x = SymbolVar(builder({x.node()}));
    x = opr::Reshape::make(x, {N, H, W, C});
    x = network.add_type_cvt(x, dtype::Float32());

    SymbolVar another_x;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ShuffleShuffleRemovePass>()
                          .apply({{x}})
                          .endpoint_vars(),
                  another_x);

    // In the original graph the TypeCvt returned by find_opr is fed by the
    // host-to-device copy.
    const auto& astype = find_opr<opr::TypeCvt>(x);
    const auto* astype_producer = astype.input(0)->owner_opr();
    EXPECT_TRUE(astype_producer->dyn_typeinfo() ==
                opr::Host2DeviceCopy::typeinfo());

    // After the pass, the TypeCvt returned by find_opr is fed by a Reshape.
    const auto& another_astype = find_opr<opr::TypeCvt>(another_x);
    const auto* another_producer = another_astype.input(0)->owner_opr();
    EXPECT_TRUE(another_producer->dyn_typeinfo() == opr::Reshape::typeinfo());

    HostTensorND t1;
    auto func1 = network.graph->compile({make_callback_copy(x, t1)});
    func1->execute();

    HostTensorND t2;
    auto func2 = network.graph->compile({make_callback_copy(another_x, t2)});
    func2->execute();

    MGB_ASSERT_TENSOR_EQ(t1, t2);
}
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,237 @@ | |||||
/** | |||||
* \file src/gopt/test/network.cpp | |||||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
* | |||||
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
* | |||||
* Unless required by applicable law or agreed to in writing, | |||||
* software distributed under the License is distributed on an | |||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
* implied. | |||||
*/ | |||||
#include "./network.h" | |||||
using namespace mgb; | |||||
/*!
 * \brief append a ConvBias operator with freshly generated weight and bias
 *
 * \param f input variable; dimension 1 of its shape is used as the number of
 *      input channels
 * \param output_channels number of output channels
 * \param kern_size kernel size as {kh, kw}
 * \param out_dtype output dtype; when it is quantized the weight is converted
 *      to \p out_dtype and the bias to QuantizedS32{1.f}
 * \param has_relu selects RELU when true, IDENTITY otherwise
 * \param stride stride as {sh, sw}
 * \param padding padding as {ph, pw}
 * \return output variable of the ConvBias operator
 */
SymbolVar Network::add_conv(SymbolVar f, size_t output_channels,
                            KernSize kern_size, DType out_dtype, bool has_relu,
                            Stride stride, Padding padding) {
    // Function-local counters used to name the generated "w<N>" / "b<N>"
    // variables; being static they are shared by every call.
    static int weight_idx = 0;
    static int bias_idx = 0;
    using NonlineMode = opr::ConvBias::Param::NonlineMode;

    const size_t in_channels = f.node()->shape()[1];
    auto filter = add_cvar(
            ssprintf("w%d", weight_idx).c_str(),
            {output_channels, in_channels, kern_size[0], kern_size[1]});
    auto offset = add_cvar(ssprintf("b%d", bias_idx).c_str(),
                           {1, output_channels, 1, 1});

    const bool is_quantized =
            out_dtype.category() == DTypeCategory::QUANTIZED;
    if (is_quantized) {
        filter = add_type_cvt(filter, out_dtype);
        offset = add_type_cvt(offset, dtype::QuantizedS32{1.f});
    }

    opr::ConvBias::Param conv_param;
    conv_param.stride_h = stride[0];
    conv_param.stride_w = stride[1];
    conv_param.pad_h = padding[0];
    conv_param.pad_w = padding[1];
    conv_param.nonlineMode =
            has_relu ? NonlineMode::RELU : NonlineMode::IDENTITY;

    auto result = opr::ConvBias::make(f, filter, offset, conv_param, {},
                                      OperatorNodeConfig{out_dtype});
    ++weight_idx;
    ++bias_idx;
    return result;
}
/*!
 * \brief append a ConvolutionBackwardData (deconvolution) operator
 *
 * The stride equals \p ratio in both directions, the square kernel has side
 * 2 * ratio - ratio % 2 and the padding is ratio / 2.
 *
 * \param f input variable; dimension 1 of its shape is used as the number of
 *      input channels
 * \param ratio stride of the deconvolution
 * \param output_channels number of output channels
 * \param out_dtype output dtype; a quantized dtype also converts the weight
 * \return output variable of the deconvolution
 */
SymbolVar Network::add_deconv(SymbolVar f, size_t ratio, size_t output_channels,
                              DType out_dtype) {
    // Separate counter from the one in add_conv, used to name "w<N>".
    static int weight_idx = 0;

    const size_t kern = ratio * 2 - ratio % 2;
    const size_t padding = ratio / 2;
    const size_t in_channels = f.node()->shape()[1];

    auto filter = add_cvar(ssprintf("w%d", weight_idx).c_str(),
                           {in_channels, output_channels, kern, kern});
    if (out_dtype.category() == DTypeCategory::QUANTIZED) {
        filter = add_type_cvt(filter, out_dtype);
    }

    opr::ConvolutionBackwardData::Param deconv_param;
    deconv_param.stride_h = deconv_param.stride_w = ratio;
    deconv_param.pad_h = deconv_param.pad_w = padding;

    auto result = opr::ConvolutionBackwardData::make(
            filter, f, deconv_param, {}, OperatorNodeConfig{out_dtype});
    ++weight_idx;
    return result;
}
/*!
 * \brief append an element-wise operator
 *
 * For a non-quantized \p out_dtype a plain Elemwise with \p mode is created.
 * For a quantized \p out_dtype the mode is translated to its
 * ElemwiseMultiType counterpart; only ADD and FUSE_ADD_RELU have an entry in
 * the translation table, and any other mode is looked up with at().
 *
 * \param inps input variables
 * \param out_dtype output dtype
 * \param mode element-wise mode
 * \return output variable of the created operator
 */
SymbolVar Network::add_elemwise(const SymbolVarArray inps, DType out_dtype,
                                opr::Elemwise::Param::Mode mode) {
    using ElemMode = opr::Elemwise::Param::Mode;
    using MultiMode = opr::ElemwiseMultiType::Param::Mode;

    if (out_dtype.category() != DTypeCategory::QUANTIZED) {
        return opr::Elemwise::make(inps, mode);
    }

    static const ThinHashMap<ElemMode, MultiMode> quantized_mode_of = {
            {ElemMode::ADD, MultiMode::QADD},
            {ElemMode::FUSE_ADD_RELU, MultiMode::QFUSE_ADD_RELU}};
    MultiMode multi_mode = quantized_mode_of.at(mode);
    return opr::ElemwiseMultiType::make(inps, {multi_mode},
                                        OperatorNodeConfig{out_dtype});
}
/*!
 * \brief append a Pooling operator
 *
 * \param f input variable
 * \param window pooling window as {wh, ww}
 * \param stride stride as {sh, sw}
 * \param padding padding as {ph, pw}
 * \param mode pooling mode
 * \return output variable of the pooling operator
 */
SymbolVar Network::add_pooling(SymbolVar f, Window window, Stride stride,
                               Padding padding,
                               opr::Pooling::Param::Mode mode) {
    opr::Pooling::Param pool_param;
    pool_param.window_h = window[0];
    pool_param.window_w = window[1];
    pool_param.stride_h = stride[0];
    pool_param.stride_w = stride[1];
    pool_param.pad_h = padding[0];
    pool_param.pad_w = padding[1];
    pool_param.mode = mode;
    return opr::Pooling::make(f, pool_param);
}
/*!
 * \brief append a TypeCvt operator converting \p f to \p out_dtype
 */
SymbolVar Network::add_type_cvt(SymbolVar f, DType out_dtype) {
    auto converted = opr::TypeCvt::make(f, out_dtype);
    return converted;
}
/*!
 * \brief build one residual block: two 3x3 convolutions plus a shortcut,
 *      merged with FUSE_ADD_RELU
 *
 * \param network network the operators are added to
 * \param f_in block input
 * \param stride stride of the first 3x3 convolution and of the projection
 * \param num_outputs1 number of output channels of every convolution
 * \param has_proj when true the shortcut is a 1x1 convolution without ReLU,
 *      otherwise it is \p f_in itself
 * \param out_dtype dtype of every operator in the block
 * \return block output
 */
SymbolVar mgb::create_block(Network& network, SymbolVar f_in, size_t stride,
                            size_t num_outputs1, bool has_proj,
                            DType out_dtype) {
    // The shortcut is created first so operators are emitted in the same
    // order as before.
    SymbolVar shortcut = f_in;
    if (has_proj) {
        shortcut = network.add_conv(f_in, num_outputs1, {1, 1}, out_dtype,
                                    false, {stride, stride});
    }

    SymbolVar body = network.add_conv(f_in, num_outputs1, {3, 3}, out_dtype,
                                      true, {stride, stride}, {1, 1});
    body = network.add_conv(body, num_outputs1, {3, 3}, out_dtype, true,
                            {1, 1}, {1, 1});

    return network.add_elemwise({body, shortcut}, out_dtype,
                                opr::Elemwise::Mode::FUSE_ADD_RELU);
}
/*!
 * \brief build a ResNet18-style graph on a {batch, 4, 224, 224} input
 *
 * With a quantized \p out_dtype the input and the first convolution use
 * QuantizedS8{1.f}; for the 4-bit dtypes the first convolution's output is
 * then converted to \p out_dtype before the first pooling.
 *
 * \return float32 output after the final average pooling
 */
SymbolVar mgb::make_resnet18(Network& network, size_t batch, DType out_dtype) {
    auto data = network.add_var("data", {batch, 4, 224, 224});

    // dtype used by the first convolution
    auto first = out_dtype;
    if (out_dtype.category() == DTypeCategory::QUANTIZED) {
        data = network.add_type_cvt(data, dtype::QuantizedS8{1.f});
        first = dtype::QuantizedS8{1.f};
    }

    auto f = network.add_conv(data, 64, {7, 7}, first, true, {2, 2}, {3, 3});
    const bool is_4bit = out_dtype.enumv() == DTypeEnum::QuantizedS4 ||
                         out_dtype.enumv() == DTypeEnum::Quantized4Asymm;
    if (is_4bit) {
        f = network.add_type_cvt(f, out_dtype);
    }
    f = network.add_pooling(f, {3, 3}, {2, 2}, {1, 1});

    using Vector = SmallVector<size_t, 4>;
    Vector stages = {2, 2, 2, 2};
    Vector mid_outputs = {64, 128, 256, 512};
    Vector enable_stride = {0, 1, 1, 1};
    for (size_t stage = 0; stage < 4; ++stage) {
        const size_t nr_blocks = stages[stage];
        const size_t channels = mid_outputs[stage];
        const bool downsample = enable_stride[stage] != 0;
        for (size_t block = 0; block < nr_blocks; ++block) {
            // Only the first block of a stage may downsample and project.
            const bool is_first = block == 0;
            const size_t stride = (downsample && is_first) ? 2 : 1;
            f = create_block(network, f, stride, channels, is_first,
                             out_dtype);
        }
    }

    f = network.add_pooling(f, {7, 7}, {7, 7}, {0, 0},
                            opr::Pooling::Param::Mode::AVERAGE);
    return network.add_type_cvt(f, dtype::Float32());
}
namespace { | |||||
/*!
 * \brief build the backbone of the detection graph on a
 *      {batch, 3, 256, 256} input and return one feature map per stage
 *
 * With a quantized \p out_dtype the input and the three stem convolutions
 * use QuantizedS8{1.f}; for the 4-bit dtypes the stem output is converted to
 * \p out_dtype before the residual stages. Every returned feature map is
 * converted to the stem dtype.
 */
SymbolVarArray make_pyramids(Network& network, size_t batch, DType out_dtype) {
    SymbolVarArray pyramids;

    auto data = network.add_var("data", {batch, 3, 256, 256});
    data = data + (-128.f);

    // dtype used by the stem convolutions and by the returned feature maps
    auto first = out_dtype;
    if (out_dtype.category() == DTypeCategory::QUANTIZED) {
        data = network.add_type_cvt(data, dtype::QuantizedS8{1.f});
        first = dtype::QuantizedS8{1.f};
    }

    auto f = network.add_conv(data, 16, {3, 3}, first, true, {2, 2}, {1, 1});
    f = network.add_conv(f, 16, {3, 3}, first, true, {1, 1}, {1, 1});
    f = network.add_conv(f, 32, {3, 3}, first, true, {2, 2}, {1, 1});

    const bool is_4bit = out_dtype.enumv() == DTypeEnum::QuantizedS4 ||
                         out_dtype.enumv() == DTypeEnum::Quantized4Asymm;
    if (is_4bit) {
        f = network.add_type_cvt(f, out_dtype);
    }

    using Vector = SmallVector<size_t, 4>;
    Vector stages = {3, 6, 6, 3};
    Vector mid_outputs = {32, 64, 128, 256};
    Vector enable_stride = {0, 1, 1, 1};
    for (size_t stage = 0; stage < 4; ++stage) {
        const size_t nr_blocks = stages[stage];
        const size_t channels = mid_outputs[stage];
        const bool downsample = enable_stride[stage] != 0;
        for (size_t block = 0; block < nr_blocks; ++block) {
            // Only the first block of a stage may downsample and project.
            const bool is_first = block == 0;
            const size_t stride = (downsample && is_first) ? 2 : 1;
            f = create_block(network, f, stride, channels, is_first,
                             out_dtype);
        }
        pyramids.push_back(f);
    }

    for (auto& level : pyramids) {
        level = network.add_type_cvt(level, first);
    }
    return pyramids;
}
/*!
 * \brief build a top-down feature fusion over pyramids[3], pyramids[2] and
 *      pyramids[1] (in that order)
 *
 * Each level goes through a 1x1 QuantizedS8 convolution. The first (deepest)
 * level is used as is; every later level is the sum of its 1x1 convolution
 * and the previous fused level upsampled by a stride-2 deconvolution.
 *
 * Previously the "first level" flag was never flipped, so the upsample-and-add
 * branch was unreachable and every output was just the 1x1 convolution. The
 * deconvolution also produced a hard-coded 16 channels instead of
 * \p fpn_conv_channels, which only matched for a width of 16.
 *
 * \param network network the operators are added to
 * \param pyramids backbone feature maps; indices 1..3 are read
 * \param fpn_conv_channels channel count of every fused feature map
 * \return the three fused feature maps, deepest level first
 */
SymbolVarArray fusion_pyramids_feature(Network& network,
                                       SymbolVarArray pyramids,
                                       size_t fpn_conv_channels) {
    SymbolVar x;
    SymbolVarArray fpn;
    for (int i = 5; i >= 3; --i) {
        auto lateral = network.add_conv(pyramids[i - 2], fpn_conv_channels,
                                        {1, 1}, dtype::QuantizedS8{1.f}, false,
                                        {1, 1}, {0, 0});
        if (fpn.empty()) {
            // deepest level: nothing to merge with yet
            x = lateral;
        } else {
            // upsample the previous fused level and add the lateral branch
            x = network.add_deconv(x, 2, fpn_conv_channels,
                                   dtype::QuantizedS8{1.f});
            x = network.add_elemwise({x, lateral}, dtype::QuantizedS8{1.f},
                                     opr::Elemwise::Mode::ADD);
        }
        fpn.push_back(x);
    }

    // NOTE(review): these two stride-2 convolutions are built but their
    // result is never added to fpn, so they are not reachable from the
    // returned outputs. They are kept because they still advance the weight
    // naming counters inside add_conv; confirm whether they were meant to be
    // returned as extra levels.
    x = fpn[0];
    for (int i = 6; i < 8; ++i) {
        x = network.add_conv(x, fpn_conv_channels, {3, 3},
                             dtype::QuantizedS8{1.f}, true, {2, 2}, {1, 1});
    }
    return fpn;
}
} // namespace | |||||
/*!
 * \brief build the detection test graph: one shared backbone followed by two
 *      independent 16-channel feature-fusion heads
 *
 * \return outputs of the first head followed by outputs of the second head
 */
SymbolVarArray mgb::make_det(Network& network, size_t batch, DType out_dtype) {
    auto pyramids = make_pyramids(network, batch, out_dtype);

    SymbolVarArray outputs;
    // two heads ("hv" and "plate" in the original naming), built in order
    for (size_t head = 0; head < 2; ++head) {
        auto fpn = fusion_pyramids_feature(network, pyramids, 16);
        outputs.insert(outputs.end(), fpn.begin(), fpn.end());
    }
    return outputs;
}
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,77 @@ | |||||
/** | |||||
* \file src/gopt/test/network.h | |||||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
* | |||||
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
* | |||||
* Unless required by applicable law or agreed to in writing, | |||||
* software distributed under the License is distributed on an | |||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
* implied. | |||||
*/ | |||||
#pragma once | |||||
#include "megbrain/test/helper.h" | |||||
#include "megbrain/gopt/framework.h" | |||||
#include "megbrain/opr/basic_arith_wrapper.h" | |||||
#include "megbrain/opr/blas.h" | |||||
#include "megbrain/opr/dnn/convolution.h" | |||||
#include "megbrain/opr/dnn/pooling.h" | |||||
#include "megbrain/opr/imgproc.h" | |||||
#include "megbrain/opr/nn_int.h" | |||||
#include "megbrain/opr/tensor_gen.h" | |||||
#include "megbrain/opr/tensor_manip.h" | |||||
#include "megbrain/opr/utility.h" | |||||
namespace mgb { | |||||
class Network { | |||||
private: | |||||
HostTensorGenerator<> gen; | |||||
CompNode cn; | |||||
public: | |||||
std::shared_ptr<ComputingGraph> graph = ComputingGraph::make(); | |||||
Network(CompNode cn_) : cn{cn_} {} | |||||
~Network() noexcept = default; | |||||
using KernSize = SmallVector<size_t, 2>; | |||||
using Stride = SmallVector<size_t, 2>; | |||||
using Padding = SmallVector<size_t, 2>; | |||||
SymbolVar add_var(const char* name, const TensorShape& shp = {1}) { | |||||
return opr::Host2DeviceCopy::make(*graph, gen(shp), cn).rename(name); | |||||
} | |||||
SymbolVar add_cvar(const char* name, const TensorShape& shp = {1}) { | |||||
return opr::SharedDeviceTensor::make(*graph, *gen(shp), cn) | |||||
.rename(name); | |||||
} | |||||
SymbolVar add_conv(SymbolVar f, size_t output_channels, KernSize kern_size, | |||||
DType out_dtype = dtype::Float32(), bool has_relu = true, | |||||
Stride stride = {1, 1}, Padding padding = {0, 0}); | |||||
SymbolVar add_deconv(SymbolVar f, size_t ratio, size_t output_channels, | |||||
DType out_dtype); | |||||
SymbolVar add_elemwise( | |||||
const SymbolVarArray inps, DType out_dtype = dtype::Float32(), | |||||
opr::Elemwise::Param::Mode mode = opr::Elemwise::Param::Mode::ADD); | |||||
using Window = SmallVector<size_t, 2>; | |||||
SymbolVar add_pooling( | |||||
SymbolVar f, Window window, Stride stride = {1, 1}, | |||||
Padding padding = {0, 0}, | |||||
opr::Pooling::Param::Mode mode = opr::Pooling::Param::Mode::MAX); | |||||
SymbolVar add_type_cvt(SymbolVar f, DType out_dtype = dtype::Float32()); | |||||
}; | |||||
SymbolVar create_block(Network& network, SymbolVar f, size_t stride, | |||||
size_t num_outputs1, bool has_proj = false, | |||||
DType out_dtype = dtype::Float32()); | |||||
SymbolVar make_resnet18(Network& network, size_t batch = 16, | |||||
DType out_dtype = dtype::Float32()); | |||||
SymbolVarArray make_det(Network& network, size_t batch = 16, | |||||
DType out_dtype = dtype::Float32()); | |||||
} // namespace mgb | |||||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |