
fix(mgb/gopt): fix profiler cache when build without opr type info name

GitOrigin-RevId: 0eba678f2b
Branch: release-1.7
Author: Megvii Engine Team, 3 years ago
Parent commit: a4ac5e7e8f
11 changed files with 11245 additions and 3779 deletions
  1. +20 -18        src/core/impl/utils/infile_persistent_cache.cpp
  2. +5 -4          src/core/include/megbrain/utils/infile_persistent_cache.h
  3. +1 -1          src/core/include/megbrain/utils/persistent_cache.h
  4. +27 -18        src/gopt/impl/global_layout_transform/opr_safe_dump.cpp
  5. +10 -4         src/gopt/impl/global_layout_transform/opr_safe_dump.h
  6. +19 -23        src/gopt/impl/global_layout_transform/profiler_cache.cpp
  7. +22 -31        src/gopt/impl/global_layout_transform/profiler_impl.cpp
  8. +52 -50        src/gopt/include/megbrain/gopt/profiler.h
  9. +11042 -3581   src/gopt/test/cache_data.h
  10. +9 -7         src/gopt/test/embed_cache.py
  11. +38 -42       src/gopt/test/layout_transform_pass.cpp

+20 -18  src/core/impl/utils/infile_persistent_cache.cpp

@@ -13,7 +13,7 @@
 #if defined(_WIN32)
 #include <io.h>
-#define F_OK 0
+#define F_OK 0
 #define access(a, b) _access(a, b)
 #elif __linux__ || __unix__ || __APPLE__
 #include <unistd.h>
@@ -32,8 +32,9 @@ public:
 
     template <typename T>
     void read(T& val) {
-        static_assert(std::is_trivially_copyable<T>::value,
-                      "only support trivially copyable type");
+        static_assert(
+                std::is_trivially_copyable<T>::value,
+                "only support trivially copyable type");
         mgb_assert(m_offset + sizeof(T) <= m_size);
         memcpy(&val, m_ptr, sizeof(T));
         m_offset += sizeof(T);
@@ -42,8 +43,9 @@ public:
 
     template <typename T>
     void read(T* buf, size_t size) {
-        static_assert(std::is_trivially_copyable<T>::value && sizeof(T) == 1,
-                      "only support read bytes");
+        static_assert(
+                std::is_trivially_copyable<T>::value && sizeof(T) == 1,
+                "only support read bytes");
         mgb_assert(m_offset + size <= m_size);
         memcpy(buf, m_ptr, size);
         m_offset += size;
@@ -67,20 +69,21 @@ public:
 
     template <typename T>
     void read(T& val) {
-        static_assert(std::is_trivially_copyable<T>::value,
-                      "only support trivially copyable type");
+        static_assert(
+                std::is_trivially_copyable<T>::value,
+                "only support trivially copyable type");
         auto ret = fread(&val, sizeof(T), 1, m_fp);
         mgb_assert(ret == 1);
     }
 
     template <typename T>
     void read(T* buf, size_t size) {
-        static_assert(std::is_trivially_copyable<T>::value && sizeof(T) == 1,
-                      "only support read bytes");
+        static_assert(
+                std::is_trivially_copyable<T>::value && sizeof(T) == 1,
+                "only support read bytes");
         auto ret = fread(buf, size, 1, m_fp);
         mgb_assert(ret == 1);
     }
 };
 
 //////////////////////// InFilePersistentCache::OutputFile ///////////////
@@ -114,8 +117,8 @@ public:
 //////////////////////// InFilePersistentCache::BlobStorage ///////////////
 
 template <typename Input>
-InFilePersistentCache::BlobStorage&
-InFilePersistentCache::BlobStorage::init_from_input(Input& inp) {
+InFilePersistentCache::BlobStorage& InFilePersistentCache::BlobStorage::init_from_input(
+        Input& inp) {
     uint32_t data_size;
     inp.read(data_size);
     size = data_size;
@@ -125,15 +128,14 @@ InFilePersistentCache::BlobStorage::init_from_input(Input& inp) {
     return *this;
 }
 
-void InFilePersistentCache::BlobStorage::write_to_file(
-        OutputFile& out_file) const {
+void InFilePersistentCache::BlobStorage::write_to_file(OutputFile& out_file) const {
     uint32_t u_size = size;
     out_file.write(u_size);
     out_file.write(data_refhold.get(), u_size);
 }
 
-InFilePersistentCache::BlobStorage&
-InFilePersistentCache::BlobStorage::init_data_ref(const Blob& b) {
+InFilePersistentCache::BlobStorage& InFilePersistentCache::BlobStorage::init_data_ref(
+        const Blob& b) {
     data_refhold = std::make_unique<uint8_t[]>(b.size + 1);
     memcpy(data_refhold.get(), b.ptr, b.size);
     data_refhold.get()[b.size] = 0;  // for C-string safety
@@ -227,8 +229,8 @@ Maybe<InFilePersistentCache::Blob> InFilePersistentCache::get(
     return iter1->second;
 }
 
-void InFilePersistentCache::put(const std::string& category, const Blob& key,
-                                const Blob& value) {
+void InFilePersistentCache::put(
+        const std::string& category, const Blob& key, const Blob& value) {
     BlobStorage key_storage;
     key_storage.init_data_ref(key).init_hash();




+5 -4  src/core/include/megbrain/utils/infile_persistent_cache.h

@@ -49,13 +49,15 @@ class InFilePersistentCache final : public PersistentCache {
             size_t operator()(const BlobStorage& b) const { return b.hash; }
         };
     };
-    std::unordered_map<std::string, std::unordered_map<BlobStorage, BlobStorage,
-                                                       BlobStorage::Hash>>
+    std::unordered_map<
+            std::string,
+            std::unordered_map<BlobStorage, BlobStorage, BlobStorage::Hash>>
             m_cache;
     MGB_MUTEX m_mtx;
 
     template <typename Input>
     void read_cache(Input& inp);
 
 public:
     InFilePersistentCache() = default;
     InFilePersistentCache(const char* path);
@@ -68,8 +70,7 @@ public:
     void dump_cache(const char* path);
 
     Maybe<Blob> get(const std::string& category, const Blob& key) override;
-    void put(const std::string& category, const Blob& key,
-             const Blob& value) override;
+    void put(const std::string& category, const Blob& key, const Blob& value) override;
     bool support_dump_cache() override { return true; }
 };
 }  // namespace mgb


+1 -1  src/core/include/megbrain/utils/persistent_cache.h

@@ -40,7 +40,7 @@ public:
             const std::string& category, const Blob& key, const Blob& value) = 0;
 
     virtual bool support_dump_cache() { return false; }
 
     //! set an implementation; return the original implementation
     static std::shared_ptr<PersistentCache> set_impl(
             std::shared_ptr<PersistentCache> impl);


+27 -18  src/gopt/impl/global_layout_transform/opr_safe_dump.cpp

@@ -18,6 +18,7 @@
 #include "megbrain/opr/nn_int.h"
 #include "megbrain/opr/tensor_manip.h"
 
+#include "megbrain/utils/hash_ct.h"
 #include "midout.h"
 MIDOUT_DECL(megbrain_opr_safe_dump)
 #define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_safe_dump, __VA_ARGS__) {
@@ -38,24 +39,34 @@ template <>
 void write_param(std::string& /* data */, const DType& /* dtype */) {}
 
 template <class Opr>
-struct OprDumpImpl {
-    static std::string dump(const cg::OperatorNodeBase* opr_) {
-        MIDOUT_B(Opr)
-        auto&& opr = opr_->cast_final_safe<Opr>();
-        std::string data;
-        write_param(data, opr.param());
-        return data;
-        MIDOUT_E
-    }
-};
+struct OprDumpImpl;
 
-#define INST(_Opr)                                                   \
+#define cb(_Opr)                                                     \
+    template <>                                                      \
+    struct OprDumpImpl<_Opr> {                                       \
+        static std::string dump(const cg::OperatorNodeBase* opr_) {  \
+            MIDOUT_B(_Opr)                                           \
+            auto&& opr = opr_->cast_final_safe<_Opr>();              \
+            std::string data;                                        \
+            auto opr_hash = MGB_HASH_STR(#_Opr);                     \
+            write_param(data, opr_hash);                             \
+            write_param(data, opr.param());                          \
+            return data;                                             \
+            MIDOUT_E                                                 \
+        }                                                            \
+    };
+FOREACH_SUPPORTED_OPR_WITHOUT_EXECUTION_POLICY(cb)
+#undef cb
+
+#define cb(_Opr)                                                     \
     template <>                                                      \
     struct OprDumpImpl<_Opr> {                                       \
         static std::string dump(const cg::OperatorNodeBase* opr_) {  \
             MIDOUT_B(_Opr)                                           \
             auto&& opr = opr_->cast_final_safe<_Opr>();              \
             std::string data;                                        \
+            auto opr_hash = MGB_HASH_STR(#_Opr);                     \
+            write_param(data, opr_hash);                             \
             write_param(data, opr.param());                          \
             using ExecutionPolicy = megdnn::param::ExecutionPolicy;  \
             ExecutionPolicy policy{                                  \
@@ -66,11 +77,8 @@ struct OprDumpImpl {
             MIDOUT_E                                                 \
         }                                                            \
     };
-INST(Convolution);
-INST(ConvBiasForward);
-INST(ConvolutionBackwardData);
-INST(PoolingForward);
-#undef INST
+FOREACH_SUPPORTED_OPR_WITH_EXECUTION_POLICY(cb)
+#undef cb
 }  // namespace
@@ -83,8 +91,9 @@ std::string opr_safe_dump(const cg::OperatorNodeBase* opr) {
         return OprDumpImpl<_Opr>::dump(opr);                         \
     } else
     FOREACH_SUPPORTED_OPR(cb) {
-        mgb_throw(InternalError, "unsupported operator(got:%s)",
-                  opr->dyn_typeinfo()->name);
+        mgb_throw(
+                InternalError, "unsupported operator(got:%s)",
+                opr->dyn_typeinfo()->name);
     }
 #undef cb
 }

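The hunk above carries the substance of the fix: every generated OprDumpImpl specialization now starts the dumped data with MGB_HASH_STR(#_Opr) — judging by the newly included megbrain/utils/hash_ct.h, a hash of the operator class name computed at compile time — so the profiler-cache key no longer has to rely on the runtime type-info name (which is exactly what the removed code in profiler_cache.cpp below used). The snippet that follows is a minimal, self-contained illustration of that idea only; it is not MegEngine code, and the names fnv1a, TypeTag and DEF_TAG are invented for the example (the real MGB_HASH_STR may use a different hash function).

    #include <cstdint>
    #include <cstdio>

    // Compile-time FNV-1a hash of a string literal (illustrative stand-in for a
    // macro such as MGB_HASH_STR; the real implementation may differ).
    constexpr uint64_t fnv1a(const char* s, uint64_t h = 0xcbf29ce484222325ull) {
        return *s ? fnv1a(s + 1, (h ^ static_cast<uint64_t>(*s)) * 0x100000001b3ull) : h;
    }

    // The tag is baked into the binary at compile time, so it stays available
    // even when runtime type-info names are stripped from the build.
    template <class Opr>
    struct TypeTag;

    struct Convolution {};
    struct PoolingForward {};

    #define DEF_TAG(Opr)                                    \
        template <>                                         \
        struct TypeTag<Opr> {                               \
            static constexpr uint64_t value = fnv1a(#Opr);  \
        };
    DEF_TAG(Convolution)
    DEF_TAG(PoolingForward)
    #undef DEF_TAG

    int main() {
        // Different operator types yield different, stable key prefixes.
        std::printf("Convolution    -> %llx\n",
                    static_cast<unsigned long long>(TypeTag<Convolution>::value));
        std::printf("PoolingForward -> %llx\n",
                    static_cast<unsigned long long>(TypeTag<PoolingForward>::value));
    }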

+10 -4  src/gopt/impl/global_layout_transform/opr_safe_dump.h

@@ -16,10 +16,16 @@
 namespace mgb {
 namespace gopt {
 namespace intl {
-#define FOREACH_SUPPORTED_OPR(cb)                                          \
-    cb(Convolution) cb(ConvBiasForward) cb(ConvolutionBackwardData)        \
-            cb(PoolingForward) cb(WarpPerspective) cb(Resize) cb(Elemwise) \
-                    cb(ElemwiseMultiType) cb(Concat) cb(PowC) cb(TypeCvt)
+#define FOREACH_SUPPORTED_OPR_WITHOUT_EXECUTION_POLICY(cb)                       \
+    cb(WarpPerspective) cb(Resize) cb(Elemwise) cb(ElemwiseMultiType) cb(Concat) \
+            cb(PowC) cb(TypeCvt)
+
+#define FOREACH_SUPPORTED_OPR_WITH_EXECUTION_POLICY(cb) \
+    cb(Convolution) cb(ConvBiasForward) cb(ConvolutionBackwardData) cb(PoolingForward)
+
+#define FOREACH_SUPPORTED_OPR(cb)                      \
+    FOREACH_SUPPORTED_OPR_WITHOUT_EXECUTION_POLICY(cb) \
+    FOREACH_SUPPORTED_OPR_WITH_EXECUTION_POLICY(cb)
 
 std::string opr_safe_dump(const cg::OperatorNodeBase* opr);


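The header above now keeps two X-macro lists — operators whose dump also records an ExecutionPolicy, and operators that only need their param — and rebuilds FOREACH_SUPPORTED_OPR by concatenating them; opr_safe_dump.cpp instantiates a different OprDumpImpl body for each list. Below is a small, self-contained sketch of this X-macro dispatch pattern; all names in it (FOREACH_PLAIN_OPR, dump_*, and the dummy operator set) are invented for illustration and are not MegEngine identifiers.

    #include <cstdio>
    #include <string>

    // Two operator groups, each expressed as an X-macro list.
    #define FOREACH_PLAIN_OPR(cb) cb(Resize) cb(TypeCvt)
    #define FOREACH_POLICY_OPR(cb) cb(Convolution) cb(PoolingForward)
    // The full list is just the concatenation of the two groups.
    #define FOREACH_OPR(cb) FOREACH_PLAIN_OPR(cb) FOREACH_POLICY_OPR(cb)

    // One dump function per plain operator: name only.
    #define cb(Opr) \
        std::string dump_##Opr() { return std::string(#Opr); }
    FOREACH_PLAIN_OPR(cb)
    #undef cb

    // One dump function per policy-carrying operator: name plus a policy field.
    #define cb(Opr) \
        std::string dump_##Opr() { return std::string(#Opr) + "+policy"; }
    FOREACH_POLICY_OPR(cb)
    #undef cb

    int main() {
        // A single dispatcher can still be generated from the combined list.
    #define cb(Opr) std::printf("%s\n", dump_##Opr().c_str());
        FOREACH_OPR(cb)
    #undef cb
    }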


+19 -23  src/gopt/impl/global_layout_transform/profiler_cache.cpp

@@ -11,8 +11,8 @@
  */
 
 #include "./opr_safe_dump.h"
-#include "megbrain/gopt/profiler.h"
 #include "megbrain/comp_node_env.h"
+#include "megbrain/gopt/profiler.h"
 
 using namespace mgb;
 using namespace gopt;
@@ -21,9 +21,6 @@ using ReformatKey = ReformatManager::ReformatKey;
 // =================== ProfilerCache ======================
 void ProfilerCache::Key::build_blob_from_opr() {
     auto&& opr = m_key_impl.opr_key.opr;
-    // process opr type
-    auto type = opr->dyn_typeinfo()->name;
-    size_t type_size = strlen(type);
 
     // process opr param
     auto data = intl::opr_safe_dump(opr);
@@ -32,11 +29,7 @@ void ProfilerCache::Key::build_blob_from_opr() {
     size_t nr_inputs = opr->input().size();
     size_t nr_outputs = opr->usable_output().size();
     size_t nr_layouts = nr_inputs + nr_outputs;
-    m_blob_storage.reserve(sizeof(TensorLayout) * 3 * nr_layouts + type_size +
-                           param_size);
-
-    // serialize opr type
-    m_blob_storage.append(type, type_size);
+    m_blob_storage.reserve(sizeof(TensorLayout) * 3 * nr_layouts + param_size);
 
     // serialize param
     const char* data_ptr = reinterpret_cast<const char*>(data.data());
@@ -70,12 +63,12 @@ void ProfilerCache::Key::build_blob_from_opr() {
     }
 
     // serialize opr_format
-    m_blob_storage.append(std::to_string(
-            static_cast<uint32_t>(m_key_impl.opr_key.opr_format)));
+    m_blob_storage.append(
+            std::to_string(static_cast<uint32_t>(m_key_impl.opr_key.opr_format)));
 
     // serialize extra_attribute
-    m_blob_storage.append(std::to_string(
-            static_cast<uint32_t>(m_key_impl.opr_key.extra_attribute)));
+    m_blob_storage.append(
+            std::to_string(static_cast<uint32_t>(m_key_impl.opr_key.extra_attribute)));
 }
 
 void ProfilerCache::Key::build_category(CompNode cn) {
@@ -85,8 +78,8 @@ void ProfilerCache::Key::build_category(CompNode cn) {
 #if MGB_CUDA
         case CompNode::DeviceType::CUDA: {
             auto&& prop = env.cuda_env().device_prop;
-            m_category += ssprintf("plat=cuda;dev=%s;cap=%d.%d", prop.name,
-                                   prop.major, prop.minor);
+            m_category += ssprintf(
+                    "plat=cuda;dev=%s;cap=%d.%d", prop.name, prop.major, prop.minor);
             break;
         }
 #endif
@@ -94,9 +87,10 @@ void ProfilerCache::Key::build_category(CompNode cn) {
             m_category += "plat=cpu";
             break;
         default:
-            mgb_throw(MegBrainError,
-                      "unsupported comp node for global layout transform "
-                      "profiler cache category");
+            mgb_throw(
+                    MegBrainError,
+                    "unsupported comp node for global layout transform "
+                    "profiler cache category");
     }
 }
@@ -151,9 +145,10 @@ ProfilerCache& ProfilerCache::set_impl(std::unique_ptr<PersistentCache> impl) {
 }
 
 void ProfilerCache::dump_cache(const char* path) {
-    mgb_assert(m_impl->support_dump_cache(),
-               "current impl of ProfilerCache does not support dump cache to "
-               "file.");
+    mgb_assert(
+            m_impl->support_dump_cache(),
+            "current impl of ProfilerCache does not support dump cache to "
+            "file.");
     auto cache = static_cast<InFilePersistentCache*>(m_impl.get());
     cache->dump_cache(path);
 }
@@ -165,8 +160,9 @@ Maybe<ProfilerCache::Result> ProfilerCache::get(const Key& key) {
     // data type of cost is float
     auto buf = static_cast<const uint8_t*>(raw_buf->ptr);
     auto size = raw_buf->size;
-    mgb_assert(buf && size == sizeof(float),
-               "ProfileCache invalid value: ptr=%p, size=%zu", buf, size);
+    mgb_assert(
+            buf && size == sizeof(float),
+            "ProfileCache invalid value: ptr=%p, size=%zu", buf, size);
     auto read_f32 = [&]() {
         auto ret = *reinterpret_cast<const float*>(buf);
         return ret;

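As the last hunk shows, ProfilerCache stores the profiled cost as a single float, and get() asserts that the cached value is exactly sizeof(float) bytes before reading it back. The snippet below is a minimal, self-contained sketch of that float-in-a-blob round trip under those assumptions; it uses a plain std::string in place of the cache's Blob type, and the helper names encode_cost/decode_cost are invented for the example.

    #include <cassert>
    #include <cstring>
    #include <string>

    // Store a profiled cost (a single float) as a raw byte blob.
    std::string encode_cost(float cost) {
        std::string blob(sizeof(float), '\0');
        std::memcpy(&blob[0], &cost, sizeof(float));
        return blob;
    }

    // Read it back, checking the size invariant the cache relies on.
    float decode_cost(const std::string& blob) {
        assert(blob.size() == sizeof(float) && "invalid cached value");
        float cost;
        std::memcpy(&cost, blob.data(), sizeof(float));
        return cost;
    }

    int main() {
        auto blob = encode_cost(1.25f);
        assert(decode_cost(blob) == 1.25f);
    }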

+22 -31  src/gopt/impl/global_layout_transform/profiler_impl.cpp

@@ -154,33 +154,30 @@ void MarkInputContiguous::init_output_static_infer_desc() {
 }  // namespace
 
 /* ================== ProfilerImpl =================*/
-ProfilerImpl::ProfilerImpl(int runs, float opr_threshold,
-                           float var_node_threshold)
+ProfilerImpl::ProfilerImpl(int runs, float opr_threshold, float var_node_threshold)
         : m_opr_threshold{opr_threshold},
          m_var_node_threshold{var_node_threshold},
          m_runs{runs} {
-    m_opr_filter = [this](const OperatorNodeBase* opr,
-                          OperatorNodeBase* new_opr) {
+    m_opr_filter = [this](const OperatorNodeBase* opr, OperatorNodeBase* new_opr) {
         /// \note: for the considerations of performance, we skip nchw(naive)
         /// kernels for conv bias on CUDA platform. to remove this later
         if (auto conv = try_cast_as_op<opr::ConvBiasForward>(new_opr)) {
            if (conv->output(0)->comp_node().device_type() ==
                        CompNode::DeviceType::CUDA &&
-                conv->input(0)->dtype().category() ==
-                        DTypeCategory::QUANTIZED &&
+                conv->input(0)->dtype().category() == DTypeCategory::QUANTIZED &&
                conv->param().format == OprFormat::NCHW) {
                return false;
            }
        }
-        float comp1 = m_opr_footprint.get_computation(
-                const_cast<OperatorNodeBase*>(opr));
+        float comp1 =
+                m_opr_footprint.get_computation(const_cast<OperatorNodeBase*>(opr));
        float comp2 = m_opr_footprint.get_computation(new_opr);
        if (comp2 > m_opr_threshold * comp1)
            return false;
        return true;
    };
-    m_var_node_filter = [this](const VarNode* var, TensorShape from,
-                               TensorShape to, ReformatKey key) {
+    m_var_node_filter = [this](const VarNode* var, TensorShape from, TensorShape to,
+                               ReformatKey key) {
        /// \note: due to the alignment requirement of low-bit tensor, we skip
        /// some layout transform for low-bit tensors. The skipped layout
        /// transforms do not have corresponding dnn kernel and cannot be
@@ -202,8 +199,7 @@ ProfilerImpl::ProfilerImpl(int runs, float opr_threshold,
        TensorLayout orig_ly = {var->shape(), var->dtype()},
                     from_ly = {from, var->dtype()}, to_ly = {to, var->dtype()};
        float orig_memory = orig_ly.span().dist_byte() * 2.f;
-        float reformat_memory =
-                from_ly.span().dist_byte() + to_ly.span().dist_byte();
+        float reformat_memory = from_ly.span().dist_byte() + to_ly.span().dist_byte();
        if (reformat_memory > orig_memory * m_var_node_threshold)
            return false;
        return true;
@@ -537,23 +533,20 @@ std::unique_ptr<ProfilerBase> ProfilerBase::make_profiler() {
     return std::make_unique<ProfilerImpl>();
 }
 
-std::unique_ptr<ProfilerBase> ProfilerBase::make_cached_profiler(
-        const char* path) {
+std::unique_ptr<ProfilerBase> ProfilerBase::make_cached_profiler(const char* path) {
     return std::make_unique<CachedProfiler>(path);
 }
 
 /* ================== CachedProfiler =================*/
-CachedProfiler::CachedProfiler(const char* path, int runs, float opr_threshold,
-                               float var_node_threshold)
+CachedProfiler::CachedProfiler(
+        const char* path, int runs, float opr_threshold, float var_node_threshold)
         : ProfilerImpl(runs, opr_threshold, var_node_threshold), m_path{path} {
     if (m_path != nullptr) {  // file cache
-        ProfilerCache::inst().set_impl(
-                std::make_unique<InFilePersistentCache>(m_path));
+        ProfilerCache::inst().set_impl(std::make_unique<InFilePersistentCache>(m_path));
     }
 }
 
-CachedProfiler::ProfilingResult CachedProfiler::profile(
-        const Problem& problem) const {
+CachedProfiler::ProfilingResult CachedProfiler::profile(const Problem& problem) const {
     auto ret = ProfilerImpl::profile(problem);
     if (m_path != nullptr)
         ProfilerCache::inst().dump_cache(m_path);
@@ -563,35 +556,33 @@ CachedProfiler::ProfilingResult CachedProfiler::profile(
 float CachedProfiler::profile_operator(
         const OperatorNodeBase* opr, TensorFormats base_format,
         TensorFormats tensor_format, ReformatAttribute extra_attribute) const {
-    ProfilerCache::Key key{opr, tensor_formats_to_opr_format(tensor_format),
-                           extra_attribute};
+    ProfilerCache::Key key{
+            opr, tensor_formats_to_opr_format(tensor_format), extra_attribute};
     auto ret = ProfilerCache::inst().get(key);
     if (ret.valid())
         return ret.val();
-    auto rst = ProfilerImpl::profile_operator(opr, base_format, tensor_format,
-                                              extra_attribute);
+    auto rst = ProfilerImpl::profile_operator(
+            opr, base_format, tensor_format, extra_attribute);
     ProfilerCache::inst().put(key, rst);
     return rst;
 }
 
 float CachedProfiler::profile_operator(
-        const OperatorNodeBase* opr,
-        const OprTensorFormatsConfiguration& base_config,
+        const OperatorNodeBase* opr, const OprTensorFormatsConfiguration& base_config,
         const OprTensorFormatsConfiguration& config,
         ReformatAttribute extra_attribute) const {
     ProfilerCache::Key key{opr, config.opr_format, extra_attribute};
     auto ret = ProfilerCache::inst().get(key);
     if (ret.valid())
         return ret.val();
-    auto rst = ProfilerImpl::profile_operator(opr, base_config, config,
-                                              extra_attribute);
+    auto rst =
+            ProfilerImpl::profile_operator(opr, base_config, config, extra_attribute);
     ProfilerCache::inst().put(key, rst);
     return rst;
 }
 
-float CachedProfiler::profile_var_node(const VarNode* var,
-                                       TensorFormats base_format,
-                                       const ReformatKey& key) const {
+float CachedProfiler::profile_var_node(
+        const VarNode* var, TensorFormats base_format, const ReformatKey& key) const {
     ProfilerCache::Key pf_key{var, key};
     auto ret = ProfilerCache::inst().get(pf_key);
     if (ret.valid())


+52 -50  src/gopt/include/megbrain/gopt/profiler.h

@@ -78,7 +78,7 @@ public:
             const VarNode*, TensorShape, TensorShape, ReformatManager::ReformatKey)>;
 
     ProfilerBase() = default;
     virtual ~ProfilerBase() = default;
 
     virtual ProfilingResult profile(const Problem& problem) const = 0;
@@ -102,13 +102,12 @@ protected:
     VarNodeFilter m_var_node_filter;
 };
 
-
 /*! \brief A default profiler impl
  */
 class ProfilerImpl : public ProfilerBase {
 public:
-    ProfilerImpl(int runs = 10, float opr_threshold = 2.f,
-                 float var_node_threshold = 2.f);
+    ProfilerImpl(
+            int runs = 10, float opr_threshold = 2.f, float var_node_threshold = 2.f);
     ~ProfilerImpl() = default;
     ProfilingResult profile(const Problem& problem) const override;
 
@@ -128,22 +127,22 @@ protected:
     OperatorNodeRecord profile_operator(
             const OperatorNodeBase* opr, TensorFormats base_format,
             const SmallVector<TensorFormats>& available_tensor_formats,
-            ReformatAttribute extra_attribute =
-                    ReformatAttribute::DEFAULT) const;
+            ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) const;
     /*!
-     * \brief prfile opr format agnostic operators (like elemwise, elemwise multi type, typecvt etc.)
+     * \brief prfile opr format agnostic operators (like elemwise, elemwise multi type,
+     * typecvt etc.)
     *
    * \param opr pointer to the operator to be profiled
    * \param base_format the original tensor format of the operator node.
    * \param tensor_format the tensor format to be profiled
-     * \param extra_attribute identify whether to use image object for OpenCL or automatically padding nhwc layout
-     * \return elapsed time of operator in the given tensor format configuration
+     * \param extra_attribute identify whether to use image object for OpenCL or
+     * automatically padding nhwc layout \return elapsed time of operator in the given
+     * tensor format configuration
    */
    virtual float profile_operator(
            const OperatorNodeBase* opr, TensorFormats base_format,
            TensorFormats tensor_format,
-            ReformatAttribute extra_attribute =
-                    ReformatAttribute::DEFAULT) const;
+            ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) const;
    /*!
     * \brief profile opr format aware operators (like conv, deconv, conv_bias,
     * etc.)
@@ -157,28 +156,29 @@ protected:
             const OperatorNodeBase* opr,
             const OprTensorFormatsConfiguration& base_config,
             const SmallVector<OprTensorFormatsConfiguration>& available_configs,
-            ReformatAttribute extra_attribute =
-                    ReformatAttribute::DEFAULT) const;
+            ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) const;
     /*!
-     * \brief prfile opr format aware operators (like conv, deconv, conv_bias, resize, warp etc.)
+     * \brief prfile opr format aware operators (like conv, deconv, conv_bias, resize,
+     * warp etc.)
    *
    * \param opr pointer to the operator to be profiled
-     * \param base_config the original opr format configuration of the operator node,
+     * \param base_config the original opr format configuration of the operator node,
    * \param config the opr format configuration to be profiled
-     * \param extra_attribute identify whether to use image object for OpenCL or automatically padding nhwc layout
-     * \return elapsed time of operator in the given opr format configuration
+     * \param extra_attribute identify whether to use image object for OpenCL or
+     * automatically padding nhwc layout \return elapsed time of operator in the given
+     * opr format configuration
    */
-    virtual float profile_operator(const OperatorNodeBase* opr,
-                                   const OprTensorFormatsConfiguration& base_config,
-                                   const OprTensorFormatsConfiguration& config,
-                                   ReformatAttribute extra_attribute =
-                                           ReformatAttribute::DEFAULT) const;
+    virtual float profile_operator(
+            const OperatorNodeBase* opr,
+            const OprTensorFormatsConfiguration& base_config,
+            const OprTensorFormatsConfiguration& config,
+            ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) const;
    /*!
    * \brief profile layout transform of the var node
    *
    * \param var pointer to the var node to be profiled
-     * \param base_format the original tensor formats in which the var node is
-     * stored
+     * \param base_format the original tensor formats in which the var node is
+     * stored
    * \param available_tensor_formats the available tensor formats
    * \param extra_attribute the extra attributes (options) of the problem
    * \return the var node record
@@ -186,27 +186,26 @@ protected:
     VarNodeRecord profile_var_node(
             const VarNode* var, TensorFormats base_format,
             const SmallVector<TensorFormats>& available_tensor_formats,
-            ReformatAttribute extra_attribute =
-                    ReformatAttribute::DEFAULT) const;
+            ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) const;
    /*!
    * \brief profile layout transform of the var node
    *
    * \param var pointer to the var node to be profiled
    * \param base_format the original tensor formats in which the var node is
    * stored
-     * \param key type of ReformatKey, identify the information/attributes of the layout transoform
-     * \return elapsed time of the layout transform
+     * \param key type of ReformatKey, identify the information/attributes of the layout
+     * transoform \return elapsed time of the layout transform
    */
-    virtual float profile_var_node(const VarNode* var,
-                                   TensorFormats base_format,
-                                   const ReformatKey& key) const;
+    virtual float profile_var_node(
+            const VarNode* var, TensorFormats base_format,
+            const ReformatKey& key) const;
    OprFootprint m_opr_footprint;
-    float m_opr_threshold;  /// a threshold, when the computation of the newly
-                            /// created operator that is built in some opr
-                            /// format configuration is as greater as
-                            /// m_opr_threshold times of the original operator,
-                            /// the opr format configuration will be skipped
-                            /// (i.e. the cost is infinite)
+    float m_opr_threshold;  /// a threshold, when the computation of the newly
+                            /// created operator that is built in some opr
+                            /// format configuration is as greater as
+                            /// m_opr_threshold times of the original operator,
+                            /// the opr format configuration will be skipped
+                            /// (i.e. the cost is infinite)
    float m_var_node_threshold;  /// a threshold, when the memory footprint of
                                 /// the layout transform of the var node is as
                                 /// larger as m_var_node_threshold as the var
@@ -298,23 +297,26 @@ private:
 
 class CachedProfiler final : public ProfilerImpl {
 public:
-    CachedProfiler(const char* path = nullptr, int runs = 10,
-                   float opr_threshold = 2.f, float var_node_threshold = 2.f);
+    CachedProfiler(
+            const char* path = nullptr, int runs = 10, float opr_threshold = 2.f,
+            float var_node_threshold = 2.f);
     ProfilingResult profile(const Problem& problem) const override;
 
 private:
-    float profile_operator(const OperatorNodeBase* opr,
-                           TensorFormats base_format,
-                           TensorFormats tensor_format,
-                           ReformatAttribute extra_attribute =
-                                   ReformatAttribute::DEFAULT) const override;
-    float profile_operator(const OperatorNodeBase* opr,
-                           const OprTensorFormatsConfiguration& base_config,
-                           const OprTensorFormatsConfiguration& config,
-                           ReformatAttribute extra_attribute =
-                                   ReformatAttribute::DEFAULT) const override;
-    float profile_var_node(const VarNode* var, TensorFormats base_format,
-                           const ReformatKey& key) const override;
+    float profile_operator(
+            const OperatorNodeBase* opr, TensorFormats base_format,
+            TensorFormats tensor_format,
+            ReformatAttribute extra_attribute =
+                    ReformatAttribute::DEFAULT) const override;
+    float profile_operator(
+            const OperatorNodeBase* opr,
+            const OprTensorFormatsConfiguration& base_config,
+            const OprTensorFormatsConfiguration& config,
+            ReformatAttribute extra_attribute =
+                    ReformatAttribute::DEFAULT) const override;
+    float profile_var_node(
+            const VarNode* var, TensorFormats base_format,
+            const ReformatKey& key) const override;
     const char* m_path;
 };




+11042 -3581  src/gopt/test/cache_data.h  (file diff suppressed because it is too large)


+9 -7  src/gopt/test/embed_cache.py

@@ -7,19 +7,21 @@
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 
-# To keep the profiling results used by the global graph optimization unaffected by the CI environment, the hard-coded profiling results are stored in a cache;
-# each test run reads the cached profiling results from memory and performs the global graph optimization based on them.
-# This script converts the dumped cache files into a cache header, from which the tests read their data.
-# If you add tests related to the global graph optimization in src/gopt/test/layout_transform_pass.cpp, consider using this script to
-# process the profiling data.
+# To keep the profiling results used by the global graph optimization unaffected by the CI environment, the hard-coded profiling data are stored in a cache;
+# each test run reads the profiling results from the in-memory cache and performs the global graph optimization based on them, so every run
+# produces consistent results.
+# ProfilerCache supports dumping the profiling data cached in memory to a file.
+# This script packs the dumped cache files into a cache header, from which the tests read their data to build an in-memory ProfilerCache.
+# If you add new tests related to the global graph optimization in src/gopt/test/layout_transform_pass.cpp, consider using this script to
+# update the profiling data in the cache header.
 # 1. First, change `#define MGB_WITH_CACHED_TEST 1` in src/gopt/test/layout_transform_pass.cpp to
 #    `#define MGB_WITH_CACHED_TEST 0`
 # 2. Build megbrain_test and run all tests related to the global graph optimization:
 #    ./megbrain_test --gtest_filter="*LayoutTransform*"
 # 3. Use this script to pack all the cache files together:
 #    python3 embed_cache.py -o cache_data.h $(ls /path/to/cache/*.cache)
-# 4. Change the define from step 1 back, so the profiling step uses the cached data. Then you can rebuild megbrain_test
-#    and verify that the tests pass.
+# 4. Change the define statement from step 1 back to its original form, so the profiling step will use the cached data.
+# 5. Finally, rebuild megbrain_test to make sure the test results are correct.
 import os.path
 import logging
 import hashlib


+38 -42  src/gopt/test/layout_transform_pass.cpp

@@ -78,8 +78,9 @@ OprFormat tensor_formats_to_opr_format(TensorFormats tensor_format) {
         case TensorFormats::CHWNc4:
             return OprFormat::CHWN4;
         default:
-            mgb_throw(MegBrainError, "tensor format(%u) is not supported",
-                      static_cast<uint32_t>(tensor_format));
+            mgb_throw(
+                    MegBrainError, "tensor format(%u) is not supported",
+                    static_cast<uint32_t>(tensor_format));
     }
 }
@@ -92,28 +93,28 @@ public:
     }
     ~ProfilerMock() {
         // reset in memory cache
-        ProfilerCache::inst().set_impl(
-                std::make_unique<InMemoryPersistentCache>());
+        ProfilerCache::inst().set_impl(std::make_unique<InMemoryPersistentCache>());
     }
 
 private:
-    float profile_operator(const OperatorNodeBase* opr,
-                           TensorFormats base_format,
-                           TensorFormats tensor_format,
-                           ReformatAttribute extra_attribute =
-                                   ReformatAttribute::DEFAULT) const override {
-        ProfilerCache::Key key{opr, tensor_formats_to_opr_format(tensor_format),
-                               extra_attribute};
+    float profile_operator(
+            const OperatorNodeBase* opr, TensorFormats base_format,
+            TensorFormats tensor_format,
+            ReformatAttribute extra_attribute =
+                    ReformatAttribute::DEFAULT) const override {
+        ProfilerCache::Key key{
+                opr, tensor_formats_to_opr_format(tensor_format), extra_attribute};
         auto ret = ProfilerCache::inst().get(key);
         if (ret.valid())
             return ret.val();
         mgb_assert(false);
     }
-    float profile_operator(const OperatorNodeBase* opr,
-                           const OprTensorFormatsConfiguration& base_config,
-                           const OprTensorFormatsConfiguration& config,
-                           ReformatAttribute extra_attribute =
-                                   ReformatAttribute::DEFAULT) const override {
+    float profile_operator(
+            const OperatorNodeBase* opr,
+            const OprTensorFormatsConfiguration& base_config,
+            const OprTensorFormatsConfiguration& config,
+            ReformatAttribute extra_attribute =
+                    ReformatAttribute::DEFAULT) const override {
         ProfilerCache::Key key{opr, config.opr_format, extra_attribute};
         std::string tmp;
         tmp.reserve(key.blob().size);
@@ -122,8 +123,9 @@ private:
             return ret.val();
         mgb_assert(false);
     }
-    float profile_var_node(const VarNode* var, TensorFormats base_format,
-                           const ReformatKey& key) const override {
+    float profile_var_node(
+            const VarNode* var, TensorFormats base_format,
+            const ReformatKey& key) const override {
         ProfilerCache::Key pf_key{var, key};
         auto ret = ProfilerCache::inst().get(pf_key);
         if (ret.valid())
@@ -174,18 +176,17 @@ TEST(TestLayoutTransform, Resnet18_QS8) {
             OprFormat::NCHW, TensorFormats::NCHW, Target::UNSPEC,
             ReformatAttribute::AUTO_PADDING_NHWC};
     auto ctx = std::make_unique<LayoutTransformContext>(
-            std::move(opr_list), std::move(available_tensor_formats),
-            attribute);
-    ctx->add_opr_config(opr::ConvBiasForward::typeinfo(),
-                        {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4,
-                         OprFormat::NHWC})
-            .add_opr_config(opr::PoolingForward::typeinfo(),
-                            {OprFormat::NCHW4, OprFormat::NCHW32,
-                             OprFormat::NHWC, OprFormat::CHWN4});
+            std::move(opr_list), std::move(available_tensor_formats), attribute);
+    ctx->add_opr_config(
+               opr::ConvBiasForward::typeinfo(),
+               {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4, OprFormat::NHWC})
+            .add_opr_config(
+                    opr::PoolingForward::typeinfo(),
+                    {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NHWC,
+                     OprFormat::CHWN4});
 #if MGB_WITH_CACHED_TEST
     auto profiler = std::make_unique<ProfilerMock>(
-            static_cast<const uint8_t*>(
-                    TestLayoutTransform_Resnet18_QS8.data()),
+            static_cast<const uint8_t*>(TestLayoutTransform_Resnet18_QS8.data()),
             TestLayoutTransform_Resnet18_QS8.size());
 #else
     auto profiler = ProfilerBase::make_cached_profiler(
@@ -278,8 +279,7 @@ TEST(TestLayoutTransform, Resnet18_QS4) {
                             OprFormat::NHWC, OprFormat::CHWN4});
 #if MGB_WITH_CACHED_TEST
     auto profiler = std::make_unique<ProfilerMock>(
-            static_cast<const uint8_t*>(
-                    TestLayoutTransform_Resnet18_QS4.data()),
+            static_cast<const uint8_t*>(TestLayoutTransform_Resnet18_QS4.data()),
             TestLayoutTransform_Resnet18_QS4.size());
 #else
     auto profiler = ProfilerBase::make_cached_profiler(
@@ -401,8 +401,7 @@ TEST(TestLayoutTransform, Detection_QS8) {
                             OprFormat::NHWC, OprFormat::CHWN4});
 #if MGB_WITH_CACHED_TEST
     auto profiler = std::make_unique<ProfilerMock>(
-            static_cast<const uint8_t*>(
-                    TestLayoutTransform_Detection_QS8.data()),
+            static_cast<const uint8_t*>(TestLayoutTransform_Detection_QS8.data()),
             TestLayoutTransform_Detection_QS8.size());
 #else
     auto profiler = ProfilerBase::make_cached_profiler(
@@ -479,8 +478,7 @@ TEST(TestLayoutTransform, Detection_QS4) {
                             OprFormat::NHWC, OprFormat::CHWN4});
 #if MGB_WITH_CACHED_TEST
     auto profiler = std::make_unique<ProfilerMock>(
-            static_cast<const uint8_t*>(
-                    TestLayoutTransform_Detection_QS4.data()),
+            static_cast<const uint8_t*>(TestLayoutTransform_Detection_QS4.data()),
             TestLayoutTransform_Detection_QS4.size());
 #else
     auto profiler = ProfilerBase::make_cached_profiler(
@@ -553,17 +551,16 @@ TEST(TestLayoutTransform, Wide) {
             OprFormat::NCHW, TensorFormats::NCHW, Target::UNSPEC,
             ReformatAttribute::DEFAULT};
     auto ctx = std::make_unique<LayoutTransformContext>(
-            std::move(opr_list), std::move(available_tensor_formats),
-            attribute);
-    ctx->add_opr_config(opr::ConvBiasForward::typeinfo(),
-                        {OprFormat::NCHW, OprFormat::NHWC});
+            std::move(opr_list), std::move(available_tensor_formats), attribute);
+    ctx->add_opr_config(
+            opr::ConvBiasForward::typeinfo(), {OprFormat::NCHW, OprFormat::NHWC});
 #if MGB_WITH_CACHED_TEST
     auto profiler = std::make_unique<ProfilerMock>(
             static_cast<const uint8_t*>(TestLayoutTransform_Wide.data()),
             TestLayoutTransform_Wide.size());
 #else
-    auto profiler = ProfilerBase::make_cached_profiler(
-            "TestLayoutTransform.Wide.cache");
+    auto profiler =
+            ProfilerBase::make_cached_profiler("TestLayoutTransform.Wide.cache");
 #endif
     std::unique_ptr<SolverBase> solver{
             new DynamicProgrammingSolver(std::move(profiler))};
@@ -674,8 +671,7 @@ TEST(TestLayoutTransform, DetectionHead) {
                     {OprFormat::NHWC, OprFormat::NCHW4, OprFormat::NCHW64});
 #if MGB_WITH_CACHED_TEST
     auto profiler = std::make_unique<ProfilerMock>(
-            static_cast<const uint8_t*>(
-                    TestLayoutTransform_DetectionHead.data()),
+            static_cast<const uint8_t*>(TestLayoutTransform_DetectionHead.data()),
             TestLayoutTransform_DetectionHead.size());
 #else
     auto profiler = ProfilerBase::make_cached_profiler(

