GitOrigin-RevId: fd0814fdb3
release-1.6
@@ -13,6 +13,7 @@
#include "megbrain/gopt/reformat_emitter.h"
#include <cstring>  // std::memset used by PaddingEmitter
#include <numeric>
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/opr/io.h"
using namespace mgb;
using namespace gopt;
@@ -243,4 +244,63 @@ ReformatEmitter::UnderlyingBuilders ReformatEmitter::analyze() const {
    }
    return builders;
}
/* ============== PaddingEmitter ================= */
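// Emits a builder that appends zero padding along `m_axis`: vars[0] is the
// tensor to be padded, vars[1] is a symbolic shape describing the padding
// block; a zero-filled constant is broadcast to that shape and concatenated
// after the input.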
PaddingEmitter::EmitResult PaddingEmitter::emit() const {
    auto&& const_extent = m_const_extent;
    auto&& axis = m_axis;
    auto builder = [const_extent, axis](const VarNodeArray& vars) {
        auto i = vars[0];
        auto padding_shp_var = vars[1];
        TensorShape shape;
        shape.ndim = i->shape().ndim;
        for (size_t ax = 0; ax < shape.ndim; ++ax)
            shape[ax] = 1;
        shape[axis] = const_extent;
        auto host_val =
                std::make_shared<HostTensorND>(i->comp_node(), i->dtype());
        host_val->resize(shape);
        auto ptr = host_val->raw_ptr();
        size_t size_bytes = TensorLayout{shape, i->dtype()}.span().dist_byte();
        std::memset(ptr, 0, size_bytes);
        auto padding =
                opr::ImmutableTensor::make(*i->owner_graph(), *host_val);
        padding = opr::Broadcast::make(padding, padding_shp_var);
        auto o = opr::Concat::make({i, padding}, axis);
        return o.node();
    };
    auto checker = [axis](const VarNodeArray& vars) {
        mgb_assert(vars.size() == 2);
        return vars[0]->shape().ndim > axis;
    };
    return std::make_tuple(builder, checker);
}
/* ============== SubtensorEmitter ================= */
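// Emits a builder that keeps the leading `m_const_extent` elements along
// `m_axis` via Subtensor, i.e. the inverse of PaddingEmitter: it strips the
// padded channels off again.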
SubtensorEmitter::EmitResult SubtensorEmitter::emit() const {
    auto&& const_extent = m_const_extent;
    auto&& axis = m_axis;
    auto builder = [const_extent, axis](const VarNodeArray& vars) {
        auto i = vars[0];
        auto x = SymbolVar(i);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        using AIdx = opr::Subtensor::AxisIndexer;
        std::vector<AIdx> index(i->shape().ndim);
        for (size_t ax = 0; ax < index.size(); ++ax) {
            if (ax == axis)
                index[ax] =
                        AIdx::make_interval(ax, None, cv(const_extent), None);
            else
                index[ax] = AIdx::make_interval(ax, None, None, cv(1));
        }
        auto o = opr::Subtensor::make(x, index);
        return o.node();
    };
    auto checker = [axis](const VarNodeArray& vars) {
        mgb_assert(vars.size() == 2);
        return vars[0]->shape().ndim > axis;
    };
    return std::make_tuple(builder, checker);
}
// vim: syntax=cpp.doxygen
@@ -12,12 +12,27 @@
#include "megbrain/gopt/reformat_manager.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/utils/arith_helper.h"
using namespace mgb;
using namespace gopt;
using NamedTensorShape = megdnn::NamedTensorShape;
using Dimension = megdnn::Dimension;
namespace {
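// Euclidean algorithm; callers combine it with multiplication to compute the
// least common multiple of two channel alignments.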
int gcd(const int& p, const int& q) {
    int x = p, y = q;
    while (y != 0) {
        x %= y;
        std::swap(x, y);
    }
    return x;
}
NamedTensorShape tensor_formats_to_named_tensor_shape(TensorFormats format) {
    switch (format) {
        case TensorFormats::NCHW:
@@ -371,6 +386,170 @@ ReformatManager::ReformatImpl ReformatManager::get(
    })
}
ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_feature(
        const VarNode* orig_var, TensorFormats orig_format,
        const ReformatKey& key) const {
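    // Strategy: find the channel alignment required by the input and output
    // formats, pad the channel dimension up to their least common multiple,
    // apply the underlying reformat, then slice the surplus channels off.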
    NamedTensorShape input_shape =
            tensor_formats_to_named_tensor_shape(key.input_format);
    NamedTensorShape output_shape =
            tensor_formats_to_named_tensor_shape(key.output_format);
    size_t input_alignment = 0, output_alignment = 0;
    size_t input_channel_idx = 0, output_channel_idx = 0;
    for (size_t i = 0; i < input_shape.ndim; ++i) {
        if (input_shape[i].name() == Dimension::Name::C &&
            input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
            input_channel_idx = i;
            input_alignment = input_shape[i].stride();
            break;
        }
    }
    for (size_t i = 0; i < output_shape.ndim; ++i) {
        if (output_shape[i].name() == Dimension::Name::C &&
            output_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
            output_channel_idx = i;
            output_alignment = output_shape[i].stride();
            break;
        }
    }
    mgb_assert(input_alignment > 0 && output_alignment > 0,
               "invalid channel alignment(input:%zu, output:%zu)",
               input_alignment, output_alignment);
    NamedTensorShape orig_shape =
            tensor_formats_to_named_tensor_shape(orig_format);
    size_t orig_channel = 0;
    for (size_t i = 0; i < orig_shape.ndim; ++i) {
        if (orig_shape[i].name() == Dimension::Name::C &&
            orig_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
            orig_channel = orig_var->shape()[i] * orig_shape[i].stride();
            break;
        }
    }
    mgb_assert(orig_channel > 0,
               "incompatible NamedTensorShape for feature(got:%s)",
               orig_shape.to_string().c_str());
    size_t aligned_in_channel =
            divup(orig_channel, input_alignment) * input_alignment;
    size_t aligned_out_channel =
            divup(orig_channel, output_alignment) * output_alignment;
    size_t common_alignment = input_alignment * output_alignment /
                              gcd(input_alignment, output_alignment);
    size_t aligned_channel =
            divup(orig_channel, common_alignment) * common_alignment;
    auto builder = [key, aligned_channel, aligned_in_channel,
                    aligned_out_channel, input_shape, input_channel_idx,
                    output_shape,
                    output_channel_idx](const VarNodeArray& vars) {
        VarNode *x, *cur;
        x = cur = vars[0];
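        // pad the input channels with zeros so that the reformat sees a
        // channel count aligned to the common alignment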
        if (aligned_channel > aligned_in_channel) {
            auto padding_shape = input_shape;
            auto&& dim = padding_shape[input_channel_idx];
            size_t const_extent =
                    (aligned_channel - aligned_in_channel) / dim.stride();
            padding_shape[input_channel_idx] =
                    Dimension(dim.name(), dim.stride(), const_extent);
            auto make_shape = std::get<0>(
                    MakeShapeEmitter{input_shape, padding_shape}.emit());
            auto padding_shp_var = make_shape({x});
            auto padding = std::get<0>(
                    PaddingEmitter{const_extent, input_channel_idx}.emit());
            cur = padding({cur, padding_shp_var});
        }
        cur = ReformatManager::instance().get(key)({cur});
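        // the reformatted tensor may carry more channels than the output
        // format needs; slice back to the output-aligned channel count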
        if (aligned_channel > aligned_out_channel) {
            auto&& dim = output_shape[output_channel_idx];
            size_t const_extent = aligned_out_channel / dim.stride();
            auto sub = std::get<0>(
                    SubtensorEmitter{const_extent, output_channel_idx}.emit());
            cur = sub({cur});
        }
        return cur;
    };
    return builder;
}
ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight(
        const VarNode* orig_var, const ReformatKey& key,
        const AlignmentDesc& extra_alignment) const {
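    // Weights are zero-padded on both the input- and the output-channel
    // dimension before the reformat; unlike the feature path, nothing is
    // sliced off afterwards.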
    size_t in_channels = 0, out_channels = 0;
    size_t input_channel_idx = 0, output_channel_idx = 0;
    // default; overwritten when an output-channel dimension is found below
    Dimension::Name out_channel_name = Dimension::Name::N;
    auto input_shape = tensor_formats_to_named_tensor_shape(key.input_format);
    for (size_t i = 0; i < input_shape.ndim; ++i) {
        if (input_shape[i].name() == Dimension::Name::C &&
            input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
            in_channels = orig_var->shape()[i];
            input_channel_idx = i;
            mgb_assert(input_shape[i].stride() == 1,
                       "unsupported weight format(got:%s)",
                       input_shape.to_string().c_str());
        } else if ((input_shape[i].name() == Dimension::Name::K ||
                    input_shape[i].name() == Dimension::Name::N) &&
                   input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
            out_channels = orig_var->shape()[i];
            out_channel_name = input_shape[i].name();
            output_channel_idx = i;
            mgb_assert(input_shape[i].stride() == 1,
                       "unsupported weight format(got:%s)",
                       input_shape.to_string().c_str());
        }
    }
    size_t in_channel_alignment = 0, out_channel_alignment = 1;
    auto output_shape = tensor_formats_to_named_tensor_shape(key.output_format);
    for (size_t i = 0; i < output_shape.ndim; ++i) {
        if (output_shape[i].name() == Dimension::Name::C &&
            output_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
            in_channel_alignment = output_shape[i].stride();
        } else if (output_shape[i].name() == out_channel_name &&
                   output_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
            out_channel_alignment = output_shape[i].stride();
        }
    }
    mgb_assert(in_channel_alignment > 0, "invalid output format(got:%s)",
               output_shape.to_string().c_str());
    size_t aligned_in_channel =
            divup(in_channels, in_channel_alignment) * in_channel_alignment;
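    // fold the caller-supplied extra alignment into the output-channel
    // alignment by taking the least common multiple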
    if (extra_alignment.name == out_channel_name) {
        out_channel_alignment =
                extra_alignment.alignment * out_channel_alignment /
                gcd(extra_alignment.alignment, out_channel_alignment);
    }
    size_t aligned_out_channel =
            divup(out_channels, out_channel_alignment) * out_channel_alignment;
    auto builder = [key, input_shape, in_channels, input_channel_idx,
                    aligned_in_channel, out_channels, output_channel_idx,
                    aligned_out_channel](const VarNodeArray& vars) {
        VarNode *x, *cur;
        x = cur = vars[0];
        if (aligned_in_channel > in_channels) {
            auto padding_shape = input_shape;
            auto&& dim = padding_shape[input_channel_idx];
            size_t const_extent =
                    (aligned_in_channel - in_channels) / dim.stride();
            padding_shape[input_channel_idx] =
                    Dimension(dim.name(), dim.stride(), const_extent);
            auto make_shape = std::get<0>(
                    MakeShapeEmitter{input_shape, padding_shape}.emit());
            auto padding_shp_var = make_shape({x});
            auto padding = std::get<0>(
                    PaddingEmitter{const_extent, input_channel_idx}.emit());
            cur = padding({cur, padding_shp_var});
        }
        if (aligned_out_channel > out_channels) {
            auto padding_shape = input_shape;
            auto&& dim = padding_shape[output_channel_idx];
            size_t const_extent =
                    (aligned_out_channel - out_channels) / dim.stride();
            padding_shape[output_channel_idx] =
                    Dimension(dim.name(), dim.stride(), const_extent);
            auto make_shape = std::get<0>(
                    MakeShapeEmitter{input_shape, padding_shape}.emit());
            auto padding_shp_var = make_shape({cur});
            auto padding = std::get<0>(
                    PaddingEmitter{const_extent, output_channel_idx}.emit());
            cur = padding({cur, padding_shp_var});
        }
        cur = ReformatManager::instance().get(key)({cur});
        return cur;
    };
    return builder;
}
const ReformatManager& ReformatManager::instance() {
    static ReformatManager inst;
    return inst;
@@ -77,6 +77,26 @@ private:
    };
    UnderlyingBuilders analyze() const;
};
class PaddingEmitter final : public Emitter {
public:
    PaddingEmitter(size_t const_extent, size_t axis)
            : m_const_extent{const_extent}, m_axis{axis} {}
    EmitResult emit() const override;

private:
    size_t m_const_extent, m_axis;
};
class SubtensorEmitter final : public Emitter {
public:
    SubtensorEmitter(size_t const_extent, size_t axis)
            : m_const_extent{const_extent}, m_axis{axis} {}
    EmitResult emit() const override;

private:
    size_t m_const_extent, m_axis;
};
} // namespace gopt
} // namespace mgb
@@ -101,12 +101,21 @@ public:
                          ReformatKey::Equal>;
    ReformatImpl get(const ReformatKey& key) const;
    ReformatImpl get(ReformatKey&& key) const { return get(key); }
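    //! Build a reformat that converts between feature formats whose channel
    //! alignments differ; orig_var supplies the original (unaligned) channel
    //! count, and the returned impl zero-pads and/or slices the channel
    //! dimension around the underlying reformat.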
    ReformatImpl auto_aligned_reformat_feature(const VarNode* orig_var,
                                               TensorFormats orig_format,
                                               const ReformatKey& key) const;
    struct AlignmentDesc {
        megdnn::Dimension::Name name;
        size_t alignment;
    };
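    //! Build a reformat for weights: the input- and output-channel
    //! dimensions are zero-padded to the alignments required by key, and
    //! extra_alignment folds an additional alignment into the dimension it
    //! names.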
    ReformatImpl auto_aligned_reformat_weight(
            const VarNode* orig_var, const ReformatKey& key,
            const AlignmentDesc& extra_alignment = {}) const;
    static const ReformatManager& instance();

private:
    ReformatCache m_cache;
};
} // namespace gopt
} // namespace mgb
@@ -13,7 +13,10 @@
#include "./helper.h"
#include "megbrain/gopt/reformat_manager.h"
#include "megbrain/graph/event.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/plugin/base.h"
#include "megbrain/plugin/profiler.h"
using namespace mgb;
using namespace gopt;
@@ -168,4 +171,287 @@ TEST(TestReformatManager, InputChannelSmall) {
    MGB_ASSERT_TENSOR_EQ(t1, t2);
}
TEST(TestReformatManager, AutoAlignedFeature) {
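    // C = 22 is not 32-aligned, so the builder must zero-pad the channel
    // dimension before the NCHWc4 -> NCHWc32 reformat; the test also reruns
    // with different batch sizes to check that the emitted graph follows
    // shape changes.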
    constexpr size_t N = 16, C = 22, H = 55, W = 55;
    HostTensorGenerator<> gen;
    using ReformatKey = ReformatManager::ReformatKey;
    auto src_format = TensorFormats::NCHWc4,
         dst_format = TensorFormats::NCHWc32;
    ReformatKey key{src_format, dst_format};
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    std::shared_ptr<HostTensorND> host_orig_x = gen({N, C, H, W});
    std::shared_ptr<HostTensorND> host_x = gen({N, (C + 3) / 4, H, W, 4});
    auto mkvar = [&](const char* name,
                     const std::shared_ptr<HostTensorND>& host_val) {
        return opr::Host2DeviceCopy::make(*graph, host_val).rename(name);
    };
    auto orig_x = mkvar("orig_x", host_orig_x);
    auto x = mkvar("x", host_x);
    auto builder = ReformatManager::instance().auto_aligned_reformat_feature(
            orig_x.node(), TensorFormats::NCHW, key);
    auto y = builder({x.node()});
    HostTensorND t;
    auto func = graph->compile({make_callback_copy(y, t)});
    func->execute();
    *host_x = *gen({(N + 5), (C + 3) / 4, H, W, 4});
    func->execute();
    *host_x = *gen({(N - 5), (C + 3) / 4, H, W, 4});
    func->execute();
    auto shp = TensorShape{(N - 5), (C + 31) / 32, H, W, 32};
    ASSERT_TRUE(shp.eq_shape(t.shape()));
}
TEST(TestReformatManager, AutoAlignedFeatureB4) {
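    // same as above but with 4-bit quantized data, where NCHWc64 requires a
    // 64-channel alignment; C = 94 must be padded up to a multiple of 64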
    constexpr size_t N = 16, C = 94, H = 55, W = 55;
    HostTensorGenerator<> gen;
    using ReformatKey = ReformatManager::ReformatKey;
    auto src_format = TensorFormats::NCHWc4,
         dst_format = TensorFormats::NCHWc64;
    ReformatKey key{src_format, dst_format};
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    std::shared_ptr<HostTensorND> host_orig_x = gen({N, C, H, W});
    std::shared_ptr<HostTensorND> host_x = gen({N, (C + 3) / 4, H, W, 4});
    auto mkvar = [&](const char* name,
                     const std::shared_ptr<HostTensorND>& host_val,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, host_val).rename(name),
                dtype);
    };
    auto orig_x = mkvar("orig_x", host_orig_x,
                        dtype::Quantized4Asymm(20.f, static_cast<uint8_t>(8)));
    auto x = mkvar("x", host_x,
                   dtype::Quantized4Asymm(25.f, static_cast<uint8_t>(4)));
    auto builder = ReformatManager::instance().auto_aligned_reformat_feature(
            orig_x.node(), TensorFormats::NCHW, key);
    auto y = builder({x.node()});
    HostTensorND t;
    auto func = graph->compile({make_callback_copy(y, t)});
    func->execute();
}
TEST(TestReformatManager, AutoAlignedWeight) {
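    // K = C = 32 while NCHWc64 plus the extra AlignmentDesc require
    // 64-aligned channels, so the weight is zero-padded on both the input-
    // and the output-channel dimension before the reformat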
    constexpr size_t K = 32, C = 32, R = 3, S = 3;
    HostTensorGenerator<> gen;
    using ReformatKey = ReformatManager::ReformatKey;
    auto src_format = TensorFormats::NCHW, dst_format = TensorFormats::NCHWc64;
    ReformatKey key{src_format, dst_format};
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto w = mkvar("w", {K, C, R, S});
    auto builder = ReformatManager::instance().auto_aligned_reformat_weight(
            w.node(), key,
            ReformatManager::AlignmentDesc{megdnn::Dimension::Name::N, 64});
    auto y = builder({w.node()});
    HostTensorND t;
    auto func = graph->compile({make_callback_copy(y, t)});
    func->execute();
}
#if MGB_CUDA
#include "megbrain/comp_node_env.h"
namespace {
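// Plugin that measures the wall time of the reformat subgraph by recording a
// timing event right before the kernel of `opr_start` and right after the
// kernel of `opr_end`.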
class ReformatProfiler : public PluginBase {
    using CompNodeEventPtr = std::unique_ptr<CompNode::Event>;

public:
    class MarkInputContiguous;
    ReformatProfiler(cg::ComputingGraph* graph, cg::OperatorNodeBase* opr_start,
                     cg::OperatorNodeBase* opr_end);
    ~ReformatProfiler() noexcept;
    double duration() const;

private:
    CompNodeEventPtr m_start, m_end;
    cg::OperatorNodeBase *m_opr_start, *m_opr_end;
};
ReformatProfiler::ReformatProfiler(cg::ComputingGraph* graph,
                                   cg::OperatorNodeBase* opr_start,
                                   cg::OperatorNodeBase* opr_end)
        : PluginBase(graph), m_opr_start(opr_start), m_opr_end(opr_end) {
    using namespace cg::event;
    auto on_reformat_start = [this](BeforeKernel const& event) {
        auto opr = event.opr;
        if (opr != m_opr_start)
            return;
        if (m_start == nullptr) {
            m_start = event.comp_node.create_event(CompNode::Event::NEED_TIMER);
        }
        m_start->record();
    };
    auto on_reformat_end = [this](AfterKernel const& event) {
        auto opr = event.opr;
        if (opr != m_opr_end)
            return;
        if (m_end == nullptr) {
            m_end = event.comp_node.create_event(CompNode::Event::NEED_TIMER);
        }
        m_end->record();
    };
    auto&& ev = graph->event();
    add_event_handler(ev.register_receiver<BeforeKernel>(on_reformat_start));
    add_event_handler(ev.register_receiver<AfterKernel>(on_reformat_end));
}
ReformatProfiler::~ReformatProfiler() noexcept {
    if (m_start)
        m_start->host_wait();
    if (m_end)
        m_end->host_wait();
}
double ReformatProfiler::duration() const {
    mgb_assert(m_start && m_end);
    m_end->host_wait();
    return m_start->elapsed_time_until(*m_end) -
           m_start->elapsed_time_until(*m_start);
}
MGB_DEFINE_OPR_CLASS(ReformatProfiler::MarkInputContiguous,
                     cg::SingleCNOperatorNodeBase) // {
    void scn_do_execute() override {}
    void init_output_static_infer_desc() override;
    void add_input_layout_constraint() override;

public:
    MarkInputContiguous(VarNode* node, const OperatorNodeConfig& config);
    static SymbolVar make(SymbolVar node, const OperatorNodeConfig& config = {});
};
MGB_DYN_TYPE_OBJ_FINAL_IMPL(ReformatProfiler::MarkInputContiguous);
ReformatProfiler::MarkInputContiguous::MarkInputContiguous(
        VarNode* node, const OperatorNodeConfig& config)
        : Super(node->owner_graph(), config, "mark_contiguous", {node}) {
    add_input({node});
    add_output(None);
}
SymbolVar ReformatProfiler::MarkInputContiguous::make(
        SymbolVar node, const OperatorNodeConfig& config) {
    return node.insert_single_output_opr<MarkInputContiguous>(node.node(),
                                                              config);
}
void ReformatProfiler::MarkInputContiguous::init_output_static_infer_desc() {
    using namespace cg::static_infer;
    auto&& mgr = owner_graph()->static_infer_manager();
    mgr.register_shape_infer(output(0),
                             ShapeInferDesc::make_identity(input(0)));
}
void ReformatProfiler::MarkInputContiguous::add_input_layout_constraint() {
    input(0)->add_layout_constraint_contiguous();
}
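// Simple wrapper around a pair of pre-created cudaEvent_t; times a region on
// the given CUDA stream as a reference measurement for the plugin above.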
class CUTimer {
public:
    CUTimer(cudaStream_t& stream, cudaEvent_t& evt0, cudaEvent_t& evt1)
            : m_stream{stream}, m_evt0{evt0}, m_evt1{evt1} {
        reset();
    }
    void reset() {
        m_started = false;
        m_stopped = false;
    }
    void start() {
        mgb_assert(!m_started);
        mgb_assert(!m_stopped);
        m_started = true;
        cudaEventRecord(m_evt0, m_stream);
    }
    void stop() {
        mgb_assert(m_started);
        mgb_assert(!m_stopped);
        m_stopped = true;
        cudaEventRecord(m_evt1, m_stream);
    }
    size_t get_time_in_us() const {
        cudaStreamSynchronize(m_stream);
        float t = -1;
        cudaEventElapsedTime(&t, m_evt0, m_evt1);
        return static_cast<size_t>(t * 1e3);
    }

private:
    bool m_started, m_stopped;
    cudaStream_t& m_stream;
    cudaEvent_t &m_evt0, &m_evt1;
};
} // namespace
TEST(TestReformatManager, AutoAlignedFeatureProfiling) {
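    // compares the duration reported by ReformatProfiler against a direct
    // CUDA-event measurement of the same reformat averaged over RUNS
    // executions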
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpux");
    using ReformatKey = ReformatManager::ReformatKey;
    auto dtype = dtype::Quantized4Asymm(20.f, static_cast<uint8_t>(4));
    HostTensorND hval(cn, dtype);
    constexpr size_t N = 16, C = 18, H = 55, W = 55;
    hval.resize({N, (C + 63) / 64, H, W, 64});
    std::shared_ptr<DeviceTensorND> dval =
            std::make_shared<DeviceTensorND>(cn, dtype);
    dval->copy_from(hval).sync();
    std::shared_ptr<DeviceTensorND> dprime =
            std::make_shared<DeviceTensorND>(cn, dtype);
    dprime->resize({N, C, H, W});
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    graph->options().var_sanity_check_first_run = false;
    auto x = opr::VolatileSharedDeviceTensor::make(*graph, dval);
    auto xprime = opr::VolatileSharedDeviceTensor::make(*graph, dprime);
    ReformatKey key{TensorFormats::NCHWc64, TensorFormats::NCHW};
    auto builder = ReformatManager::instance().auto_aligned_reformat_feature(
            xprime.node(), TensorFormats::NCHW, key);
    auto y = builder({x.node()});
    auto mark = ReformatProfiler::MarkInputContiguous::make(SymbolVar(y));
    auto cb = [](DeviceTensorND& d) { MGB_MARK_USED_VAR(d); };
    auto output_spec = std::make_pair(mark, cb);
    auto func = graph->compile({output_spec});
    static constexpr size_t RUNS = 100;
    cn.activate();
    auto stream = CompNodeEnv::from_comp_node(cn).cuda_env().stream;
    cudaEvent_t evt0;
    cudaEvent_t evt1;
    MGB_CUDA_CHECK(cudaEventCreate(&evt0));
    MGB_CUDA_CHECK(cudaEventCreate(&evt1));
    CUTimer timer(stream, evt0, evt1);
    timer.start();
    for (size_t i = 0; i < RUNS; ++i)
        func->execute();
    timer.stop();
    double time_cuda_evt = timer.get_time_in_us() / static_cast<double>(RUNS);
    OperatorNodeBase* start = x.node()->owner_opr();
    OperatorNodeBase* end = y->owner_opr();
    std::unique_ptr<ReformatProfiler> profiler =
            std::make_unique<ReformatProfiler>(graph.get(), start, end);
    ASSERT_TRUE(y->shape().eq_shape(TensorShape{N, C, H, W}));
    for (size_t i = 0; i < RUNS; ++i)
        func->execute();
    double time_profiler = profiler->duration() * 1e6;
    printf("%f, %f\n", time_profiler, time_cuda_evt);
    // the two timings come from independent event pairs, so exact
    // floating-point equality cannot be expected; allow a small relative
    // tolerance instead
    ASSERT_NEAR(time_cuda_evt, time_profiler, time_cuda_evt * 0.05);
    MGB_CUDA_CHECK(cudaEventDestroy(evt0));
    MGB_CUDA_CHECK(cudaEventDestroy(evt1));
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}