GitOrigin-RevId: fd0814fdb3
release-1.6
@@ -13,6 +13,7 @@
#include "megbrain/gopt/reformat_emitter.h"
#include <numeric>
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/opr/io.h"

using namespace mgb;
using namespace gopt;
@@ -243,4 +244,63 @@ ReformatEmitter::UnderlyingBuilders ReformatEmitter::analyze() const {
    }
    return builders;
}

/* ============== PaddingEmitter ================= */
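// Emits a builder that zero-pads vars[0] along m_axis: a zero-filled
// ImmutableTensor is broadcast to the padding shape supplied in vars[1] and
// concatenated after the input.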
PaddingEmitter::EmitResult PaddingEmitter::emit() const {
    auto&& const_extent = m_const_extent;
    auto&& axis = m_axis;
    auto builder = [const_extent, axis](const VarNodeArray& vars) {
        auto i = vars[0];
        auto padding_shp_var = vars[1];
        TensorShape shape;
        shape.ndim = i->shape().ndim;
        for (size_t ax = 0; ax < shape.ndim; ++ax)
            shape[ax] = 1;
        shape[axis] = const_extent;
        auto host_val =
                std::make_shared<HostTensorND>(i->comp_node(), i->dtype());
        host_val->resize(shape);
        auto ptr = host_val->raw_ptr();
        size_t size_bytes = TensorLayout{shape, i->dtype()}.span().dist_byte();
        std::memset(ptr, 0, size_bytes);
        auto padding = opr::ImmutableTensor::make(*i->owner_graph(), *host_val);
        padding = opr::Broadcast::make(padding, padding_shp_var);
        auto o = opr::Concat::make({i, padding}, axis);
        return o.node();
    };
    auto checker = [axis](const VarNodeArray& vars) {
        mgb_assert(vars.size() == 2);
        return vars[0]->shape().ndim > axis;
    };
    return std::make_tuple(builder, checker);
}

/* ============== SubtensorEmitter ================= */
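// Emits a builder that slices vars[0] along m_axis, keeping the leading
// m_const_extent elements (interval [0, const_extent)) and the full range on
// every other axis; this undoes the padding added by PaddingEmitter.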
SubtensorEmitter::EmitResult SubtensorEmitter::emit() const {
    auto&& const_extent = m_const_extent;
    auto&& axis = m_axis;
    auto builder = [const_extent, axis](const VarNodeArray& vars) {
        auto i = vars[0];
        auto x = SymbolVar(i);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        using AIdx = opr::Subtensor::AxisIndexer;
        std::vector<AIdx> index(i->shape().ndim);
        for (size_t ax = 0; ax < index.size(); ++ax) {
            if (ax == axis)
                index[ax] =
                        AIdx::make_interval(ax, None, cv(const_extent), None);
            else
                index[ax] = AIdx::make_interval(ax, None, None, cv(1));
        }
        auto o = opr::Subtensor::make(x, index);
        return o.node();
    };
    auto checker = [axis](const VarNodeArray& vars) {
        mgb_assert(vars.size() == 2);
        return vars[0]->shape().ndim > axis;
    };
    return std::make_tuple(builder, checker);
}

// vim: syntax=cpp.doxygen
@@ -12,12 +12,27 @@
#include "megbrain/gopt/reformat_manager.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/utils/arith_helper.h"

using namespace mgb;
using namespace gopt;
using NamedTensorShape = megdnn::NamedTensorShape;
using Dimension = megdnn::Dimension;

namespace {
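// Euclidean gcd; assumes both operands are positive (a zero p would divide by
// zero below). E.g. gcd(4, 6) == 2. Under C++17, std::gcd from <numeric>
// would be a drop-in replacement.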
int gcd(const int& p, const int& q) {
    int x = p, y = q;
    while (y != 0) {
        if (x < y) {
            y = (y % x);
        } else {
            x = (x % y);
            std::swap(x, y);
        }
    }
    return x;
}

NamedTensorShape tensor_formats_to_named_tensor_shape(TensorFormats format) {
    switch (format) {
        case TensorFormats::NCHW:
@@ -371,6 +386,170 @@ ReformatManager::ReformatImpl ReformatManager::get(
    })
}

ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue(
        const VarNode* orig_var, TensorFormats orig_format,
        const ReformatKey& key) const {
    NamedTensorShape input_shape =
            tensor_formats_to_named_tensor_shape(key.input_format);
    NamedTensorShape output_shape =
            tensor_formats_to_named_tensor_shape(key.output_format);
    size_t input_alignment = 0, output_alignment = 0;
    size_t input_channel_idx = 0, output_channel_idx = 0;
    for (size_t i = 0; i < input_shape.ndim; ++i) {
        if (input_shape[i].name() == Dimension::Name::C &&
            input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
            input_channel_idx = i;
            input_alignment = input_shape[i].stride();
            break;
        }
    }
    for (size_t i = 0; i < output_shape.ndim; ++i) {
        if (output_shape[i].name() == Dimension::Name::C &&
            output_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
            output_channel_idx = i;
            output_alignment = output_shape[i].stride();
            break;
        }
    }
    mgb_assert(input_alignment > 0 && output_alignment > 0,
               "invalid alignment(input:%zu, output:%zu)", input_alignment,
               output_alignment);
    NamedTensorShape orig_shape =
            tensor_formats_to_named_tensor_shape(orig_format);
    size_t orig_channel = 0;
    for (size_t i = 0; i < orig_shape.ndim; ++i) {
        if (orig_shape[i].name() == Dimension::Name::C &&
            orig_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
            orig_channel = orig_var->shape()[i] * orig_shape[i].stride();
            break;
        }
    }
    mgb_assert(orig_channel > 0,
               "incompatible NamedTensorShape for feature(got:%s)",
               orig_shape.to_string().c_str());
    size_t aligned_in_channel =
            divup(orig_channel, input_alignment) * input_alignment;
    size_t aligned_out_channel =
            divup(orig_channel, output_alignment) * output_alignment;
    size_t common_alignment = input_alignment * output_alignment /
                              gcd(input_alignment, output_alignment);
    size_t aligned_channel =
            divup(orig_channel, common_alignment) * common_alignment;
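    // Worked example (cf. the AutoAlignedFeature test): NCHWc4 -> NCHWc32
    // with orig_channel = 22 gives input_alignment = 4 and output_alignment
    // = 32, hence aligned_in_channel = 24, aligned_out_channel = 32,
    // common_alignment = lcm(4, 32) = 32 and aligned_channel = 32; the
    // builder below then pads the NCHWc4 input by (32 - 24) / 4 = 2 outer
    // channel blocks before reformatting.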
    auto builder = [key, aligned_channel, aligned_in_channel,
                    aligned_out_channel, input_shape, input_channel_idx,
                    output_shape,
                    output_channel_idx](const VarNodeArray& vars) {
        VarNode *x, *cur;
        x = cur = vars[0];
        if (aligned_channel > aligned_in_channel) {
            auto padding_shape = input_shape;
            auto&& dim = padding_shape[input_channel_idx];
            size_t const_extent =
                    (aligned_channel - aligned_in_channel) / dim.stride();
            padding_shape[input_channel_idx] =
                    Dimension(dim.name(), dim.stride(), const_extent);
            auto make_shape = std::get<0>(
                    MakeShapeEmitter{input_shape, padding_shape}.emit());
            auto padding_shp_var = make_shape({x});
            auto padding = std::get<0>(
                    PaddingEmitter{const_extent, input_channel_idx}.emit());
            cur = padding({cur, padding_shp_var});
        }
        cur = ReformatManager::instance().get(key)({cur});
        if (aligned_channel > aligned_out_channel) {
            auto&& dim = output_shape[output_channel_idx];
            size_t const_extent = aligned_out_channel / dim.stride();
            auto sub = std::get<0>(
                    SubtensorEmitter{const_extent, output_channel_idx}.emit());
            cur = sub({cur});
        }
        return cur;
    };
    return builder;
}

ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight(
        const VarNode* orig_var, const ReformatKey& key,
        const AlignmentDesc& extra_alignment) const {
    size_t in_channels = 0, out_channels = 0;
    size_t input_channel_idx = 0, output_channel_idx = 0;
    Dimension::Name out_channel_name = Dimension::Name::N;
    auto input_shape = tensor_formats_to_named_tensor_shape(key.input_format);
    for (size_t i = 0; i < input_shape.ndim; ++i) {
        if (input_shape[i].name() == Dimension::Name::C &&
            input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
            in_channels = orig_var->shape()[i];
            input_channel_idx = i;
            mgb_assert(input_shape[i].stride() == 1,
                       "unsupported weight format(got:%s)",
                       input_shape.to_string().c_str());
        } else if ((input_shape[i].name() == Dimension::Name::K ||
                    input_shape[i].name() == Dimension::Name::N) &&
                   input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
            out_channels = orig_var->shape()[i];
            out_channel_name = input_shape[i].name();
            output_channel_idx = i;
            mgb_assert(input_shape[i].stride() == 1,
                       "unsupported weight format(got:%s)",
                       input_shape.to_string().c_str());
        }
    }
    mgb_assert(in_channels > 0 && out_channels > 0,
               "incompatible weight format(got:%s)",
               input_shape.to_string().c_str());
    size_t in_channel_alignment = 1, out_channel_alignment = 1;
    auto output_shape = tensor_formats_to_named_tensor_shape(key.output_format);
    for (size_t i = 0; i < output_shape.ndim; ++i) {
        if (output_shape[i].name() == Dimension::Name::C &&
            output_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
            in_channel_alignment = output_shape[i].stride();
        } else if (output_shape[i].name() == out_channel_name &&
                   output_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
            out_channel_alignment = output_shape[i].stride();
        }
    }
    size_t aligned_in_channel =
            divup(in_channels, in_channel_alignment) * in_channel_alignment;
    if (extra_alignment.name == out_channel_name) {
        out_channel_alignment =
                extra_alignment.alignment * out_channel_alignment /
                gcd(extra_alignment.alignment, out_channel_alignment);
    }
    size_t aligned_out_channel =
            divup(out_channels, out_channel_alignment) * out_channel_alignment;
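    // Worked example (cf. the AutoAlignedWeight test): K = C = 32 with
    // NCHW -> NCHWc64 and an extra N:64 alignment gives in_channel_alignment
    // = 64 and out_channel_alignment = lcm(1, 64) = 64, so both input and
    // output channels are zero-padded from 32 up to 64 before the reformat.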
    auto builder = [key, input_shape, in_channels, input_channel_idx,
                    aligned_in_channel, out_channels, output_channel_idx,
                    aligned_out_channel](const VarNodeArray& vars) {
        VarNode *x, *cur;
        x = cur = vars[0];
        if (aligned_in_channel > in_channels) {
            auto padding_shape = input_shape;
            auto&& dim = padding_shape[input_channel_idx];
            size_t const_extent =
                    (aligned_in_channel - in_channels) / dim.stride();
            padding_shape[input_channel_idx] =
                    Dimension(dim.name(), dim.stride(), const_extent);
            auto make_shape = std::get<0>(
                    MakeShapeEmitter{input_shape, padding_shape}.emit());
            auto padding_shp_var = make_shape({x});
            auto padding = std::get<0>(
                    PaddingEmitter{const_extent, input_channel_idx}.emit());
            cur = padding({cur, padding_shp_var});
        }
        if (aligned_out_channel > out_channels) {
            auto padding_shape = input_shape;
            auto&& dim = padding_shape[output_channel_idx];
            size_t const_extent =
                    (aligned_out_channel - out_channels) / dim.stride();
            padding_shape[output_channel_idx] =
                    Dimension(dim.name(), dim.stride(), const_extent);
            auto make_shape = std::get<0>(
                    MakeShapeEmitter{input_shape, padding_shape}.emit());
            auto padding_shp_var = make_shape({cur});
            auto padding = std::get<0>(
                    PaddingEmitter{const_extent, output_channel_idx}.emit());
            cur = padding({cur, padding_shp_var});
        }
        cur = ReformatManager::instance().get(key)({cur});
        return cur;
    };
    return builder;
}

const ReformatManager& ReformatManager::instance() {
    static ReformatManager inst;
    return inst;
@@ -77,6 +77,26 @@ private:
    };
    UnderlyingBuilders analyze() const;
};

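//! emits an opr chain that zero-pads the input along m_axis by m_const_extent
//! elements (zeros broadcast to the shape var passed as the second input)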
class PaddingEmitter final : public Emitter {
public:
    PaddingEmitter(size_t const_extent, size_t axis)
            : m_const_extent{const_extent}, m_axis{axis} {}
    EmitResult emit() const override;

private:
    size_t m_const_extent, m_axis;
};

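//! emits an opr chain that keeps only the leading m_const_extent elements of
//! the input along m_axis; the inverse of PaddingEmitter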
class SubtensorEmitter final : public Emitter {
public:
    SubtensorEmitter(size_t const_extent, size_t axis)
            : m_const_extent{const_extent}, m_axis{axis} {}
    EmitResult emit() const override;

private:
    size_t m_const_extent, m_axis;
};
}  // namespace gopt
}  // namespace mgb
@@ -101,12 +101,21 @@ public:
                               ReformatKey::Equal>;
    ReformatImpl get(const ReformatKey& key) const;
    ReformatImpl get(ReformatKey&& key) const { return get(key); }
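    //! reformat a feature map, padding and/or slicing the channel dimension
    //! as needed to satisfy the alignments of both formats in \p key;
    //! \p orig_var only supplies the original (unpadded) channel count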
    ReformatImpl auto_aligned_reformat_featrue(const VarNode* orig_var,
                                               TensorFormats orig_format,
                                               const ReformatKey& key) const;
    struct AlignmentDesc {
        megdnn::Dimension::Name name;
        size_t alignment;
    };
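    //! reformat a weight, zero-padding its input and output channels to the
    //! alignments required by key.output_format, plus an optional extra
    //! alignment on the output channel dimension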
    ReformatImpl auto_aligned_reformat_weight(
            const VarNode* orig_var, const ReformatKey& key,
            const AlignmentDesc& extra_alignment = {}) const;
    static const ReformatManager& instance();

private:
    ReformatCache m_cache;
};
}  // namespace gopt
}  // namespace mgb
@@ -13,7 +13,10 @@
#include "./helper.h"
#include "megbrain/gopt/reformat_manager.h"
#include "megbrain/graph/event.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/plugin/base.h"
#include "megbrain/plugin/profiler.h"

using namespace mgb;
using namespace gopt;
@@ -168,4 +171,287 @@ TEST(TestReformatManager, InputChannelSmall) {
    MGB_ASSERT_TENSOR_EQ(t1, t2);
}

TEST(TestReformatManager, AutoAlignedFeature) {
    constexpr size_t N = 16, C = 22, H = 55, W = 55;
    HostTensorGenerator<> gen;
    using ReformatKey = ReformatManager::ReformatKey;
    auto src_format = TensorFormats::NCHWc4,
         dst_format = TensorFormats::NCHWc32;
    ReformatKey key{src_format, dst_format};
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    std::shared_ptr<HostTensorND> host_orig_x = gen({N, C, H, W});
    std::shared_ptr<HostTensorND> host_x = gen({N, (C + 3) / 4, H, W, 4});
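    // host_orig_x carries the original NCHW shape (C = 22) from which the
    // channel count is deduced; host_x holds the actual NCHWc4 input that
    // gets reformatted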
    auto mkvar = [&](const char* name,
                     const std::shared_ptr<HostTensorND>& host_val) {
        return opr::Host2DeviceCopy::make(*graph, host_val).rename(name);
    };
    auto orig_x = mkvar("orig_x", host_orig_x);
    auto x = mkvar("x", host_x);
    auto builder = ReformatManager::instance().auto_aligned_reformat_featrue(
            orig_x.node(), TensorFormats::NCHW, key);
    auto y = builder({x.node()});
    HostTensorND t;
    auto func = graph->compile({make_callback_copy(y, t)});
    func->execute();
    *host_x = *gen({(N + 5), (C + 3) / 4, H, W, 4});
    func->execute();
    *host_x = *gen({(N - 5), (C + 3) / 4, H, W, 4});
    func->execute();
    auto shp = TensorShape{(N - 5), (C + 31) / 32, H, W, 32};
    ASSERT_TRUE(shp.eq_shape(t.shape()));
}

TEST(TestReformatManager, AutoAlignedFeatureB4) {
    constexpr size_t N = 16, C = 94, H = 55, W = 55;
    HostTensorGenerator<> gen;
    using ReformatKey = ReformatManager::ReformatKey;
    auto src_format = TensorFormats::NCHWc4,
         dst_format = TensorFormats::NCHWc64;
    ReformatKey key{src_format, dst_format};
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    std::shared_ptr<HostTensorND> host_orig_x = gen({N, C, H, W});
    std::shared_ptr<HostTensorND> host_x = gen({N, (C + 3) / 4, H, W, 4});
    auto mkvar = [&](const char* name,
                     const std::shared_ptr<HostTensorND>& host_val,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, host_val).rename(name),
                dtype);
    };
    auto orig_x = mkvar("orig_x", host_orig_x,
                        dtype::Quantized4Asymm(20.f, static_cast<uint8_t>(8)));
    auto x = mkvar("x", host_x,
                   dtype::Quantized4Asymm(25.f, static_cast<uint8_t>(4)));
    auto builder = ReformatManager::instance().auto_aligned_reformat_featrue(
            orig_x.node(), TensorFormats::NCHW, key);
    auto y = builder({x.node()});
    HostTensorND t;
    auto func = graph->compile({make_callback_copy(y, t)});
    func->execute();
}

TEST(TestReformatManager, AutoAlignedWeight) {
    constexpr size_t K = 32, C = 32, R = 3, S = 3;
    HostTensorGenerator<> gen;
    using ReformatKey = ReformatManager::ReformatKey;
    auto src_format = TensorFormats::NCHW, dst_format = TensorFormats::NCHWc64;
    ReformatKey key{src_format, dst_format};
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto w = mkvar("w", {K, C, R, S});
    auto builder = ReformatManager::instance().auto_aligned_reformat_weight(
            w.node(), key,
            ReformatManager::AlignmentDesc{megdnn::Dimension::Name::N, 64});
    auto y = builder({w.node()});
    HostTensorND t;
    auto func = graph->compile({make_callback_copy(y, t)});
    func->execute();
}

#if MGB_CUDA
#include "megbrain/comp_node_env.h"
namespace {
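// measures the device-side duration of the emitted reformat chain by
// recording CompNode events right before the first and right after the last
// operator of the chain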
class ReformatProfiler : public PluginBase {
    using CompNodeEventPtr = std::unique_ptr<CompNode::Event>;

public:
    class MarkInputContiguous;
    ReformatProfiler(cg::ComputingGraph* graph, cg::OperatorNodeBase* opr_start,
                     cg::OperatorNodeBase* opr_end);
    ~ReformatProfiler() noexcept;
    double duration() const;

private:
    CompNodeEventPtr m_start, m_end;
    cg::OperatorNodeBase *m_opr_start, *m_opr_end;
};

ReformatProfiler::ReformatProfiler(cg::ComputingGraph* graph,
                                   cg::OperatorNodeBase* opr_start,
                                   cg::OperatorNodeBase* opr_end)
        : PluginBase(graph), m_opr_start(opr_start), m_opr_end(opr_end) {
    using namespace cg::event;
    auto on_reformat_start = [this](BeforeKernel const& event) {
        auto opr = event.opr;
        if (opr != m_opr_start)
            return;
        if (m_start == nullptr) {
            m_start = event.comp_node.create_event(CompNode::Event::NEED_TIMER);
        }
        m_start->record();
    };
    auto on_reformat_end = [this](AfterKernel const& event) {
        auto opr = event.opr;
        if (opr != m_opr_end)
            return;
        if (m_end == nullptr) {
            m_end = event.comp_node.create_event(CompNode::Event::NEED_TIMER);
        }
        m_end->record();
    };
    auto&& ev = graph->event();
    add_event_handler(ev.register_receiver<BeforeKernel>(on_reformat_start));
    add_event_handler(ev.register_receiver<AfterKernel>(on_reformat_end));
}
ReformatProfiler::~ReformatProfiler() noexcept {
    if (m_start)
        m_start->host_wait();
    if (m_end)
        m_end->host_wait();
}
double ReformatProfiler::duration() const {
    mgb_assert(m_start && m_end);
    m_end->host_wait();
    return m_start->elapsed_time_until(*m_end);
}
MGB_DEFINE_OPR_CLASS(ReformatProfiler::MarkInputContiguous,
                     cg::SingleCNOperatorNodeBase) // {
    void scn_do_execute() override {}
    void init_output_static_infer_desc() override;
    void add_input_layout_constraint() override;

public:
    MarkInputContiguous(VarNode* node, const OperatorNodeConfig& config);
    static SymbolVar make(SymbolVar node,
                          const OperatorNodeConfig& config = {});
};
MGB_DYN_TYPE_OBJ_FINAL_IMPL(ReformatProfiler::MarkInputContiguous);

ReformatProfiler::MarkInputContiguous::MarkInputContiguous(
        VarNode* node, const OperatorNodeConfig& config)
        : Super(node->owner_graph(), config, "mark_contiguous", {node}) {
    add_input({node});
    add_output(None);
}

SymbolVar ReformatProfiler::MarkInputContiguous::make(
        SymbolVar node, const OperatorNodeConfig& config) {
    return node.insert_single_output_opr<MarkInputContiguous>(node.node(),
                                                              config);
}

void ReformatProfiler::MarkInputContiguous::init_output_static_infer_desc() {
    using namespace cg::static_infer;
    auto&& mgr = owner_graph()->static_infer_manager();
    mgr.register_shape_infer(output(0),
                             ShapeInferDesc::make_identity(input(0)));
}

void ReformatProfiler::MarkInputContiguous::add_input_layout_constraint() {
    input(0)->add_layout_constraint_contiguous();
}
class CUTimer {
public:
    CUTimer(cudaStream_t& stream, cudaEvent_t& evt0, cudaEvent_t& evt1)
            : m_stream{stream}, m_evt0{evt0}, m_evt1{evt1} {
        reset();
    }
    void reset() {
        m_started = false;
        m_stopped = false;
    }
    void start() {
        mgb_assert(!m_started);
        mgb_assert(!m_stopped);
        m_started = true;
        cudaEventRecord(m_evt0, m_stream);
    }
    void stop() {
        mgb_assert(m_started);
        mgb_assert(!m_stopped);
        m_stopped = true;
        cudaEventRecord(m_evt1, m_stream);
    }
    size_t get_time_in_us() const {
        cudaStreamSynchronize(m_stream);
        float t = -1;
        cudaEventElapsedTime(&t, m_evt0, m_evt1);
        return static_cast<size_t>(t * 1e3);
    }

private:
    bool m_started, m_stopped;
    cudaStream_t& m_stream;
    cudaEvent_t &m_evt0, &m_evt1;
};
}  // namespace

TEST(TestReformatManager, AutoAlignedFeatureProfiling) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpux");
    using ReformatKey = ReformatManager::ReformatKey;
    auto dtype = dtype::Quantized4Asymm(20.f, static_cast<uint8_t>(4));
    HostTensorND hval(cn, dtype);
    constexpr size_t N = 16, C = 18, H = 55, W = 55;
    hval.resize({N, (C + 63) / 64, H, W, 64});
    std::shared_ptr<DeviceTensorND> dval =
            std::make_shared<DeviceTensorND>(cn, dtype);
    dval->copy_from(hval).sync();
    std::shared_ptr<DeviceTensorND> dprime =
            std::make_shared<DeviceTensorND>(cn, dtype);
    dprime->resize({N, C, H, W});
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    graph->options().var_sanity_check_first_run = false;
    auto x = opr::VolatileSharedDeviceTensor::make(*graph, dval);
    auto xprime = opr::VolatileSharedDeviceTensor::make(*graph, dprime);
    ReformatKey key{TensorFormats::NCHWc64, TensorFormats::NCHW};
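    // NCHWc64 -> NCHW with C = 18: the builder reformats to NCHW and then
    // slices the 64 padded channels back down to 18 with a Subtensor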
    auto builder = ReformatManager::instance().auto_aligned_reformat_featrue(
            xprime.node(), TensorFormats::NCHW, key);
    auto y = builder({x.node()});
    auto mark = ReformatProfiler::MarkInputContiguous::make(SymbolVar(y));
    auto cb = [](DeviceTensorND& d) { MGB_MARK_USED_VAR(d); };
    auto output_spec = std::make_pair(mark, cb);
    auto func = graph->compile({output_spec});
    static constexpr size_t RUNS = 100;
    cn.activate();
    auto stream = CompNodeEnv::from_comp_node(cn).cuda_env().stream;
    cudaEvent_t evt0;
    cudaEvent_t evt1;
    MGB_CUDA_CHECK(cudaEventCreate(&evt0));
    MGB_CUDA_CHECK(cudaEventCreate(&evt1));
    CUTimer timer(stream, evt0, evt1);
    timer.start();
    for (size_t i = 0; i < RUNS; ++i)
        func->execute();
    timer.stop();
    double time_cuda_evt = timer.get_time_in_us() / static_cast<double>(RUNS);
    OperatorNodeBase* start = x.node()->owner_opr();
    OperatorNodeBase* end = y->owner_opr();
    std::unique_ptr<ReformatProfiler> profiler =
            std::make_unique<ReformatProfiler>(graph.get(), start, end);
    ASSERT_TRUE(y->shape().eq_shape(TensorShape{N, C, H, W}));
    for (size_t i = 0; i < RUNS; ++i)
        func->execute();
    double time_profiler = profiler->duration() * 1e6;
    printf("%f, %f\n", time_profiler, time_cuda_evt);
    // two independent event pairs never agree exactly; compare with a
    // relative tolerance instead of exact equality
    ASSERT_NEAR(time_cuda_evt, time_profiler, time_cuda_evt * 0.05);
    MGB_CUDA_CHECK(cudaEventDestroy(evt0));
    MGB_CUDA_CHECK(cudaEventDestroy(evt1));
}
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}