GitOrigin-RevId: 0eba678f2b
release-1.7
@@ -13,7 +13,7 @@
#if defined(_WIN32)
#include <io.h>
#define F_OK 0
#define F_OK 0
#define access(a, b) _access(a, b)
#elif __linux__ || __unix__ || __APPLE__
#include <unistd.h>
@@ -32,8 +32,9 @@ public:
template <typename T>
void read(T& val) {
static_assert(std::is_trivially_copyable<T>::value,
"only support trivially copyable type");
static_assert(
std::is_trivially_copyable<T>::value,
"only support trivially copyable type");
mgb_assert(m_offset + sizeof(T) <= m_size);
memcpy(&val, m_ptr, sizeof(T));
m_offset += sizeof(T);
@@ -42,8 +43,9 @@ public:
template <typename T>
void read(T* buf, size_t size) {
static_assert(std::is_trivially_copyable<T>::value && sizeof(T) == 1,
"only support read bytes");
static_assert(
std::is_trivially_copyable<T>::value && sizeof(T) == 1,
"only support read bytes");
mgb_assert(m_offset + size <= m_size);
memcpy(buf, m_ptr, size);
m_offset += size;
@@ -67,20 +69,21 @@ public:
template <typename T>
void read(T& val) {
static_assert(std::is_trivially_copyable<T>::value,
"only support trivially copyable type");
static_assert(
std::is_trivially_copyable<T>::value,
"only support trivially copyable type");
auto ret = fread(&val, sizeof(T), 1, m_fp);
mgb_assert(ret == 1);
}
template <typename T>
void read(T* buf, size_t size) {
static_assert(std::is_trivially_copyable<T>::value && sizeof(T) == 1,
"only support read bytes");
static_assert(
std::is_trivially_copyable<T>::value && sizeof(T) == 1,
"only support read bytes");
auto ret = fread(buf, size, 1, m_fp);
mgb_assert(ret == 1);
}
};
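Both reader hunks above hinge on the same constraint: read<T>() only accepts trivially copyable types, so filling the destination object with a raw byte copy is well-defined. A minimal standalone sketch of the in-memory reader pattern (illustrative only, not MegBrain's InputMemory class; names are hypothetical):

#include <cassert>
#include <cstdint>
#include <cstring>
#include <type_traits>

class MemReader {
    const uint8_t* m_ptr;
    size_t m_size;
    size_t m_offset = 0;

public:
    MemReader(const void* ptr, size_t size)
            : m_ptr{static_cast<const uint8_t*>(ptr)}, m_size{size} {}

    template <typename T>
    void read(T& val) {
        // the static_assert is what makes the memcpy below legal for T
        static_assert(std::is_trivially_copyable<T>::value,
                      "only support trivially copyable type");
        assert(m_offset + sizeof(T) <= m_size);
        std::memcpy(&val, m_ptr + m_offset, sizeof(T));
        m_offset += sizeof(T);
    }
};

int main() {
    uint32_t buf[2] = {42, 7}, a = 0, b = 0;
    MemReader reader{buf, sizeof(buf)};
    reader.read(a);
    reader.read(b);
    assert(a == 42 && b == 7);
    return 0;
}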
//////////////////////// InFilePersistentCache::OutputFile ///////////////
@@ -114,8 +117,8 @@ public:
//////////////////////// InFilePersistentCache::BlobStorage ///////////////
template <typename Input>
InFilePersistentCache::BlobStorage&
InFilePersistentCache::BlobStorage::init_from_input(Input& inp) {
InFilePersistentCache::BlobStorage& InFilePersistentCache::BlobStorage::init_from_input(
Input& inp) {
uint32_t data_size;
inp.read(data_size);
size = data_size;
@@ -125,15 +128,14 @@ InFilePersistentCache::BlobStorage::init_from_input(Input& inp) {
return *this;
}
void InFilePersistentCache::BlobStorage::write_to_file(
OutputFile& out_file) const {
void InFilePersistentCache::BlobStorage::write_to_file(OutputFile& out_file) const {
uint32_t u_size = size;
out_file.write(u_size);
out_file.write(data_refhold.get(), u_size);
}
InFilePersistentCache::BlobStorage&
InFilePersistentCache::BlobStorage::init_data_ref(const Blob& b) {
InFilePersistentCache::BlobStorage& InFilePersistentCache::BlobStorage::init_data_ref(
const Blob& b) {
data_refhold = std::make_unique<uint8_t[]>(b.size + 1);
memcpy(data_refhold.get(), b.ptr, b.size);
data_refhold.get()[b.size] = 0; // for C-string safety
@@ -227,8 +229,8 @@ Maybe<InFilePersistentCache::Blob> InFilePersistentCache::get(
return iter1->second;
}
void InFilePersistentCache::put(const std::string& category, const Blob& key,
const Blob& value) {
void InFilePersistentCache::put(
const std::string& category, const Blob& key, const Blob& value) {
BlobStorage key_storage;
key_storage.init_data_ref(key).init_hash();
@@ -49,13 +49,15 @@ class InFilePersistentCache final : public PersistentCache {
size_t operator()(const BlobStorage& b) const { return b.hash; }
};
};
std::unordered_map<std::string, std::unordered_map<BlobStorage, BlobStorage,
BlobStorage::Hash>>
std::unordered_map<
std::string,
std::unordered_map<BlobStorage, BlobStorage, BlobStorage::Hash>>
m_cache;
MGB_MUTEX m_mtx;
template <typename Input>
void read_cache(Input& inp);
public:
InFilePersistentCache() = default;
InFilePersistentCache(const char* path);
@@ -68,8 +70,7 @@ public:
void dump_cache(const char* path);
Maybe<Blob> get(const std::string& category, const Blob& key) override;
void put(const std::string& category, const Blob& key,
const Blob& value) override;
void put(const std::string& category, const Blob& key, const Blob& value) override;
bool support_dump_cache() override { return true; }
};
} // namespace mgb
@@ -40,7 +40,7 @@ public:
const std::string& category, const Blob& key, const Blob& value) = 0;
virtual bool support_dump_cache() { return false; }
//! set an implementation; return the original implementation
static std::shared_ptr<PersistentCache> set_impl(
std::shared_ptr<PersistentCache> impl);
@@ -18,6 +18,7 @@
#include "megbrain/opr/nn_int.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/utils/hash_ct.h"
#include "midout.h"
MIDOUT_DECL(megbrain_opr_safe_dump)
#define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_safe_dump, __VA_ARGS__) {
@@ -38,24 +39,34 @@ template <>
void write_param(std::string& /* data */, const DType& /* dtype */) {}
template <class Opr>
struct OprDumpImpl {
static std::string dump(const cg::OperatorNodeBase* opr_) {
MIDOUT_B(Opr)
auto&& opr = opr_->cast_final_safe<Opr>();
std::string data;
write_param(data, opr.param());
return data;
MIDOUT_E
}
};
struct OprDumpImpl;
#define INST(_Opr) \
#define cb(_Opr) \
template <> \
struct OprDumpImpl<_Opr> { \
static std::string dump(const cg::OperatorNodeBase* opr_) { \
MIDOUT_B(_Opr) \
auto&& opr = opr_->cast_final_safe<_Opr>(); \
std::string data; \
auto opr_hash = MGB_HASH_STR(#_Opr); \
write_param(data, opr_hash); \
write_param(data, opr.param()); \
return data; \
MIDOUT_E \
} \
};
FOREACH_SUPPORTED_OPR_WITHOUT_EXECUTION_POLICY(cb)
#undef cb
#define cb(_Opr) \
template <> \
struct OprDumpImpl<_Opr> { \
static std::string dump(const cg::OperatorNodeBase* opr_) { \
MIDOUT_B(_Opr) \
auto&& opr = opr_->cast_final_safe<_Opr>(); \
std::string data; \
auto opr_hash = MGB_HASH_STR(#_Opr); \
write_param(data, opr_hash); \
write_param(data, opr.param()); \
using ExecutionPolicy = megdnn::param::ExecutionPolicy; \
ExecutionPolicy policy{ \
@@ -66,11 +77,8 @@ struct OprDumpImpl {
MIDOUT_E \
} \
};
INST(Convolution);
INST(ConvBiasForward);
INST(ConvolutionBackwardData);
INST(PoolingForward);
#undef INST
FOREACH_SUPPORTED_OPR_WITH_EXECUTION_POLICY(cb)
#undef cb
} // namespace
namespace mgb {
@@ -83,8 +91,9 @@ std::string opr_safe_dump(const cg::OperatorNodeBase* opr) {
return OprDumpImpl<_Opr>::dump(opr); \
} else
FOREACH_SUPPORTED_OPR(cb) {
mgb_throw(InternalError, "unsupported operator(got:%s)",
opr->dyn_typeinfo()->name);
mgb_throw(
InternalError, "unsupported operator(got:%s)",
opr->dyn_typeinfo()->name);
}
#undef cb
}
@@ -16,10 +16,16 @@
namespace mgb {
namespace gopt {
namespace intl {
#define FOREACH_SUPPORTED_OPR(cb) \
cb(Convolution) cb(ConvBiasForward) cb(ConvolutionBackwardData) \
cb(PoolingForward) cb(WarpPerspective) cb(Resize) cb(Elemwise) \
cb(ElemwiseMultiType) cb(Concat) cb(PowC) cb(TypeCvt)
#define FOREACH_SUPPORTED_OPR_WITHOUT_EXECUTION_POLICY(cb) \
cb(WarpPerspective) cb(Resize) cb(Elemwise) cb(ElemwiseMultiType) cb(Concat) \
cb(PowC) cb(TypeCvt)
#define FOREACH_SUPPORTED_OPR_WITH_EXECUTION_POLICY(cb) \
cb(Convolution) cb(ConvBiasForward) cb(ConvolutionBackwardData) cb(PoolingForward)
#define FOREACH_SUPPORTED_OPR(cb) \
FOREACH_SUPPORTED_OPR_WITHOUT_EXECUTION_POLICY(cb) \
FOREACH_SUPPORTED_OPR_WITH_EXECUTION_POLICY(cb)
std::string opr_safe_dump(const cg::OperatorNodeBase* opr);
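The macro split above drives an X-macro expansion: each cb(...) entry stamps out one OprDumpImpl specialization, and the WITH_EXECUTION_POLICY list additionally serializes the execution policy. A tiny self-contained sketch of the same technique, using a hypothetical FOREACH_DEMO_OPR list rather than the real operator lists:

#include <cstdio>

// Hypothetical operator list, standing in for the FOREACH_* macros above.
#define FOREACH_DEMO_OPR(cb) cb(Convolution) cb(PoolingForward) cb(Elemwise)

int main() {
// Each expansion of cb(...) stamps out one statement per listed operator.
#define cb(_Opr) std::printf("supported opr: %s\n", #_Opr);
    FOREACH_DEMO_OPR(cb)
#undef cb
    return 0;
}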
@@ -11,8 +11,8 @@
*/
#include "./opr_safe_dump.h"
#include "megbrain/gopt/profiler.h"
#include "megbrain/comp_node_env.h"
#include "megbrain/gopt/profiler.h"
using namespace mgb;
using namespace gopt;
@@ -21,9 +21,6 @@ using ReformatKey = ReformatManager::ReformatKey;
// =================== ProfilerCache ======================
void ProfilerCache::Key::build_blob_from_opr() {
auto&& opr = m_key_impl.opr_key.opr;
// process opr type
auto type = opr->dyn_typeinfo()->name;
size_t type_size = strlen(type);
// process opr param
auto data = intl::opr_safe_dump(opr);
@@ -32,11 +29,7 @@ void ProfilerCache::Key::build_blob_from_opr() {
size_t nr_inputs = opr->input().size();
size_t nr_outputs = opr->usable_output().size();
size_t nr_layouts = nr_inputs + nr_outputs;
m_blob_storage.reserve(sizeof(TensorLayout) * 3 * nr_layouts + type_size +
param_size);
// serialize opr type
m_blob_storage.append(type, type_size);
m_blob_storage.reserve(sizeof(TensorLayout) * 3 * nr_layouts + param_size);
// serialize param
const char* data_ptr = reinterpret_cast<const char*>(data.data());
@@ -70,12 +63,12 @@ void ProfilerCache::Key::build_blob_from_opr() {
}
// serialize opr_format
m_blob_storage.append(std::to_string(
static_cast<uint32_t>(m_key_impl.opr_key.opr_format)));
m_blob_storage.append(
std::to_string(static_cast<uint32_t>(m_key_impl.opr_key.opr_format)));
// serialize extra_attribute
m_blob_storage.append(std::to_string(
static_cast<uint32_t>(m_key_impl.opr_key.extra_attribute)));
m_blob_storage.append(
std::to_string(static_cast<uint32_t>(m_key_impl.opr_key.extra_attribute)));
}
void ProfilerCache::Key::build_category(CompNode cn) {
@@ -85,8 +78,8 @@ void ProfilerCache::Key::build_category(CompNode cn) {
#if MGB_CUDA
case CompNode::DeviceType::CUDA: {
auto&& prop = env.cuda_env().device_prop;
m_category += ssprintf("plat=cuda;dev=%s;cap=%d.%d", prop.name,
prop.major, prop.minor);
m_category += ssprintf(
"plat=cuda;dev=%s;cap=%d.%d", prop.name, prop.major, prop.minor);
break;
}
#endif
@@ -94,9 +87,10 @@ void ProfilerCache::Key::build_category(CompNode cn) {
m_category += "plat=cpu";
break;
default:
mgb_throw(MegBrainError,
"unsupported comp node for global layout transform "
"profiler cache category");
mgb_throw(
MegBrainError,
"unsupported comp node for global layout transform "
"profiler cache category");
}
}
@@ -151,9 +145,10 @@ ProfilerCache& ProfilerCache::set_impl(std::unique_ptr<PersistentCache> impl) {
}
void ProfilerCache::dump_cache(const char* path) {
mgb_assert(m_impl->support_dump_cache(),
"current impl of ProfilerCache does not support dump cache to "
"file.");
mgb_assert(
m_impl->support_dump_cache(),
"current impl of ProfilerCache does not support dump cache to "
"file.");
auto cache = static_cast<InFilePersistentCache*>(m_impl.get());
cache->dump_cache(path);
}
@@ -165,8 +160,9 @@ Maybe<ProfilerCache::Result> ProfilerCache::get(const Key& key) {
// data type of cost is float
auto buf = static_cast<const uint8_t*>(raw_buf->ptr);
auto size = raw_buf->size;
mgb_assert(buf && size == sizeof(float),
"ProfileCache invalid value: ptr=%p, size=%zu", buf, size);
mgb_assert(
buf && size == sizeof(float),
"ProfileCache invalid value: ptr=%p, size=%zu", buf, size);
auto read_f32 = [&]() {
auto ret = *reinterpret_cast<const float*>(buf);
return ret;
@@ -154,33 +154,30 @@ void MarkInputContiguous::init_output_static_infer_desc() {
} // namespace
/* ================== ProfilerImpl =================*/
ProfilerImpl::ProfilerImpl(int runs, float opr_threshold,
float var_node_threshold)
ProfilerImpl::ProfilerImpl(int runs, float opr_threshold, float var_node_threshold)
: m_opr_threshold{opr_threshold},
m_var_node_threshold{var_node_threshold},
m_runs{runs} {
m_opr_filter = [this](const OperatorNodeBase* opr,
OperatorNodeBase* new_opr) {
m_opr_filter = [this](const OperatorNodeBase* opr, OperatorNodeBase* new_opr) {
/// \note: for the considerations of performance, we skip nchw(naive)
/// kernels for conv bias on CUDA platform. to remove this later
if (auto conv = try_cast_as_op<opr::ConvBiasForward>(new_opr)) {
if (conv->output(0)->comp_node().device_type() ==
CompNode::DeviceType::CUDA &&
conv->input(0)->dtype().category() ==
DTypeCategory::QUANTIZED &&
conv->input(0)->dtype().category() == DTypeCategory::QUANTIZED &&
conv->param().format == OprFormat::NCHW) {
return false;
}
}
float comp1 = m_opr_footprint.get_computation(
const_cast<OperatorNodeBase*>(opr));
float comp1 =
m_opr_footprint.get_computation(const_cast<OperatorNodeBase*>(opr));
float comp2 = m_opr_footprint.get_computation(new_opr);
if (comp2 > m_opr_threshold * comp1)
return false;
return true;
};
m_var_node_filter = [this](const VarNode* var, TensorShape from,
TensorShape to, ReformatKey key) {
m_var_node_filter = [this](const VarNode* var, TensorShape from, TensorShape to,
ReformatKey key) {
/// \note: due to the alignment requirement of low-bit tensor, we skip
/// some layout transform for low-bit tensors. The skipped layout
/// transforms do not have corresponding dnn kernel and cannot be
@@ -202,8 +199,7 @@ ProfilerImpl::ProfilerImpl(int runs, float opr_threshold,
TensorLayout orig_ly = {var->shape(), var->dtype()},
from_ly = {from, var->dtype()}, to_ly = {to, var->dtype()};
float orig_memory = orig_ly.span().dist_byte() * 2.f;
float reformat_memory =
from_ly.span().dist_byte() + to_ly.span().dist_byte();
float reformat_memory = from_ly.span().dist_byte() + to_ly.span().dist_byte();
if (reformat_memory > orig_memory * m_var_node_threshold)
return false;
return true;
@@ -537,23 +533,20 @@ std::unique_ptr<ProfilerBase> ProfilerBase::make_profiler() {
return std::make_unique<ProfilerImpl>();
}
std::unique_ptr<ProfilerBase> ProfilerBase::make_cached_profiler(
const char* path) {
std::unique_ptr<ProfilerBase> ProfilerBase::make_cached_profiler(const char* path) {
return std::make_unique<CachedProfiler>(path);
}
/* ================== CachedProfiler =================*/
CachedProfiler::CachedProfiler(const char* path, int runs, float opr_threshold,
float var_node_threshold)
CachedProfiler::CachedProfiler(
const char* path, int runs, float opr_threshold, float var_node_threshold)
: ProfilerImpl(runs, opr_threshold, var_node_threshold), m_path{path} {
if (m_path != nullptr) { // file cache
ProfilerCache::inst().set_impl(
std::make_unique<InFilePersistentCache>(m_path));
ProfilerCache::inst().set_impl(std::make_unique<InFilePersistentCache>(m_path));
}
}
CachedProfiler::ProfilingResult CachedProfiler::profile(
const Problem& problem) const {
CachedProfiler::ProfilingResult CachedProfiler::profile(const Problem& problem) const {
auto ret = ProfilerImpl::profile(problem);
if (m_path != nullptr)
ProfilerCache::inst().dump_cache(m_path);
@@ -563,35 +556,33 @@ CachedProfiler::ProfilingResult CachedProfiler::profile(
float CachedProfiler::profile_operator(
const OperatorNodeBase* opr, TensorFormats base_format,
TensorFormats tensor_format, ReformatAttribute extra_attribute) const {
ProfilerCache::Key key{opr, tensor_formats_to_opr_format(tensor_format),
extra_attribute};
ProfilerCache::Key key{
opr, tensor_formats_to_opr_format(tensor_format), extra_attribute};
auto ret = ProfilerCache::inst().get(key);
if (ret.valid())
return ret.val();
auto rst = ProfilerImpl::profile_operator(opr, base_format, tensor_format,
extra_attribute);
auto rst = ProfilerImpl::profile_operator(
opr, base_format, tensor_format, extra_attribute);
ProfilerCache::inst().put(key, rst);
return rst;
}
float CachedProfiler::profile_operator(
const OperatorNodeBase* opr,
const OprTensorFormatsConfiguration& base_config,
const OperatorNodeBase* opr, const OprTensorFormatsConfiguration& base_config,
const OprTensorFormatsConfiguration& config,
ReformatAttribute extra_attribute) const {
ProfilerCache::Key key{opr, config.opr_format, extra_attribute};
auto ret = ProfilerCache::inst().get(key);
if (ret.valid())
return ret.val();
auto rst = ProfilerImpl::profile_operator(opr, base_config, config,
extra_attribute);
auto rst =
ProfilerImpl::profile_operator(opr, base_config, config, extra_attribute);
ProfilerCache::inst().put(key, rst);
return rst;
}
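Both profile_operator overloads above follow the same look-up-then-measure pattern: build a cache key, return the cached cost on a hit, otherwise run the real profiler and store the result. A minimal sketch of that memoization pattern with hypothetical types (not the MegBrain classes, which key on ProfilerCache::Key and store float costs in a PersistentCache):

#include <functional>
#include <string>
#include <unordered_map>

class TimingCache {
    std::unordered_map<std::string, float> m_cache;

public:
    float get_or_profile(const std::string& key,
                         const std::function<float()>& do_profile) {
        auto it = m_cache.find(key);
        if (it != m_cache.end())
            return it->second;      // cache hit: reuse the stored cost
        float cost = do_profile();  // cache miss: run the expensive measurement
        m_cache.emplace(key, cost);
        return cost;
    }
};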
float CachedProfiler::profile_var_node(const VarNode* var,
TensorFormats base_format,
const ReformatKey& key) const {
float CachedProfiler::profile_var_node(
const VarNode* var, TensorFormats base_format, const ReformatKey& key) const {
ProfilerCache::Key pf_key{var, key};
auto ret = ProfilerCache::inst().get(pf_key);
if (ret.valid())
@@ -78,7 +78,7 @@ public:
const VarNode*, TensorShape, TensorShape, ReformatManager::ReformatKey)>;
ProfilerBase() = default;
virtual ~ProfilerBase() = default;
virtual ProfilingResult profile(const Problem& problem) const = 0;
@@ -102,13 +102,12 @@ protected:
VarNodeFilter m_var_node_filter;
};
/*! \brief A default profiler impl
*/
class ProfilerImpl : public ProfilerBase {
public:
ProfilerImpl(int runs = 10, float opr_threshold = 2.f,
float var_node_threshold = 2.f);
ProfilerImpl(
int runs = 10, float opr_threshold = 2.f, float var_node_threshold = 2.f);
~ProfilerImpl() = default;
ProfilingResult profile(const Problem& problem) const override;
@@ -128,22 +127,22 @@ protected:
OperatorNodeRecord profile_operator(
const OperatorNodeBase* opr, TensorFormats base_format,
const SmallVector<TensorFormats>& available_tensor_formats,
ReformatAttribute extra_attribute =
ReformatAttribute::DEFAULT) const;
ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) const;
/*!
* \brief profile opr format agnostic operators (like elemwise, elemwise multi type, typecvt etc.)
* \brief profile opr format agnostic operators (like elemwise, elemwise multi type,
* typecvt etc.)
*
* \param opr pointer to the operator to be profiled
* \param base_format the original tensor format of the operator node.
* \param tensor_format the tensor format to be profiled
* \param extra_attribute identify whether to use image object for OpenCL or automatically padding nhwc layout
* \return elapsed time of operator in the given tensor format configuration
* \param extra_attribute identify whether to use image object for OpenCL or
* automatically padding nhwc layout \return elapsed time of operator in the given
* tensor format configuration
*/
virtual float profile_operator(
const OperatorNodeBase* opr, TensorFormats base_format,
TensorFormats tensor_format,
ReformatAttribute extra_attribute =
ReformatAttribute::DEFAULT) const;
ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) const;
/*!
* \brief profile opr format aware operators (like conv, deconv, conv_bias,
* etc.)
@@ -157,28 +156,29 @@ protected:
const OperatorNodeBase* opr,
const OprTensorFormatsConfiguration& base_config,
const SmallVector<OprTensorFormatsConfiguration>& available_configs,
ReformatAttribute extra_attribute =
ReformatAttribute::DEFAULT) const;
ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) const;
/*!
* \brief profile opr format aware operators (like conv, deconv, conv_bias, resize, warp etc.)
* \brief profile opr format aware operators (like conv, deconv, conv_bias, resize,
* warp etc.)
*
* \param opr pointer to the operator to be profiled
* \param base_config the original opr format configuration of the operator node,
* \param base_config the original opr format configuration of the operator node,
* \param config the opr format configuration to be profiled
* \param extra_attribute identify whether to use image object for OpenCL or automatically padding nhwc layout
* \return elapsed time of operator in the given opr format configuration
* \param extra_attribute identify whether to use image object for OpenCL or
* automatically padding nhwc layout \return elapsed time of operator in the given
* opr format configuration
*/
virtual float profile_operator(const OperatorNodeBase* opr,
const OprTensorFormatsConfiguration& base_config,
const OprTensorFormatsConfiguration& config,
ReformatAttribute extra_attribute =
ReformatAttribute::DEFAULT) const;
virtual float profile_operator(
const OperatorNodeBase* opr,
const OprTensorFormatsConfiguration& base_config,
const OprTensorFormatsConfiguration& config,
ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) const;
/*!
* \brief profile layout transform of the var node
*
* \param var pointer to the var node to be profiled
* \param base_format the original tensor formats in which the var node is
* stored
* stored
* \param available_tensor_formats the available tensor formats
* \param extra_attribute the extra attributes (options) of the problem
* \return the var node record
@@ -186,27 +186,26 @@ protected:
VarNodeRecord profile_var_node(
const VarNode* var, TensorFormats base_format,
const SmallVector<TensorFormats>& available_tensor_formats,
ReformatAttribute extra_attribute =
ReformatAttribute::DEFAULT) const;
ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) const;
/*!
* \brief profile layout transform of the var node
*
* \param var pointer to the var node to be profiled
* \param base_format the original tensor formats in which the var node is
* stored
* \param key type of ReformatKey, identify the information/attributes of the layout transform
* \return elapsed time of the layout transform
* \param key type of ReformatKey, identify the information/attributes of the layout
* transform \return elapsed time of the layout transform
*/
virtual float profile_var_node(const VarNode* var,
TensorFormats base_format,
const ReformatKey& key) const;
virtual float profile_var_node(
const VarNode* var, TensorFormats base_format,
const ReformatKey& key) const;
OprFootprint m_opr_footprint;
float m_opr_threshold; /// a threshold: when the computation of the newly
/// created operator that is built in some opr
/// format configuration is greater than
/// m_opr_threshold times that of the original operator,
/// the opr format configuration will be skipped
/// (i.e. the cost is infinite)
float m_opr_threshold; /// a threshold: when the computation of the newly
/// created operator that is built in some opr
/// format configuration is greater than
/// m_opr_threshold times that of the original operator,
/// the opr format configuration will be skipped
/// (i.e. the cost is infinite)
float m_var_node_threshold; /// a threshold: when the memory footprint of
/// the layout transform of the var node is
/// larger than m_var_node_threshold times that of the var
@@ -298,23 +297,26 @@ private:
class CachedProfiler final : public ProfilerImpl {
public:
CachedProfiler(const char* path = nullptr, int runs = 10,
float opr_threshold = 2.f, float var_node_threshold = 2.f);
CachedProfiler(
const char* path = nullptr, int runs = 10, float opr_threshold = 2.f,
float var_node_threshold = 2.f);
ProfilingResult profile(const Problem& problem) const override;
private:
float profile_operator(const OperatorNodeBase* opr,
TensorFormats base_format,
TensorFormats tensor_format,
ReformatAttribute extra_attribute =
ReformatAttribute::DEFAULT) const override;
float profile_operator(const OperatorNodeBase* opr,
const OprTensorFormatsConfiguration& base_config,
const OprTensorFormatsConfiguration& config,
ReformatAttribute extra_attribute =
ReformatAttribute::DEFAULT) const override;
float profile_var_node(const VarNode* var, TensorFormats base_format,
const ReformatKey& key) const override;
float profile_operator(
const OperatorNodeBase* opr, TensorFormats base_format,
TensorFormats tensor_format,
ReformatAttribute extra_attribute =
ReformatAttribute::DEFAULT) const override;
float profile_operator(
const OperatorNodeBase* opr,
const OprTensorFormatsConfiguration& base_config,
const OprTensorFormatsConfiguration& config,
ReformatAttribute extra_attribute =
ReformatAttribute::DEFAULT) const override;
float profile_var_node(
const VarNode* var, TensorFormats base_format,
const ReformatKey& key) const override;
const char* m_path;
};
@@ -7,19 +7,21 @@
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
# To keep the profiling results used by global graph optimization from being affected by the CI environment, the hard-coded profiling results are stored in a cache,
# and every test run reads the profiling results from the in-memory cache and performs global graph optimization based on them.
# This script converts the dumped cache files into a cache header file, from which the tests read their data.
# If you add global graph optimization related tests in src/gopt/test/layout_transform_pass.cpp, you need to consider using this script to
# process the profiling data.
# To keep the profiling results used by global graph optimization from being affected by the CI environment, the hard-coded profiling data is stored in a cache,
# and every test run reads the profiling results from the in-memory cache and performs global graph optimization based on them, which guarantees that
# every run produces consistent results.
# ProfilerCache supports dumping the profiling data cached in memory to a file.
# This script packs the dumped cache files into a cache header file, which the tests read to build an in-memory ProfilerCache.
# If you add new global graph optimization related tests in src/gopt/test/layout_transform_pass.cpp, you need to consider using this script to
# update the profiling data in the cache header file.
# 1. First, change `#define MGB_WITH_CACHED_TEST 1` in src/gopt/test/layout_transform_pass.cpp to
# `#define MGB_WITH_CACHED_TEST 0`
# 2. Build megbrain_test and run all the global graph optimization related tests:
# ./megbrain_test --gtest_filter="*LayoutTransform*"
# 3. Use this script to pack all the cache files together:
# python3 embed_cache.py -o cache_data.h $(ls /path/to/cache/*.cache)
# 4. Revert the define from step 1 so that the profiling step uses the cached data. Then rebuild megbrain_test
# and verify that the tests pass.
# 4. Revert the define statement from step 1 to its original form so that the profiling step uses the cached data.
# 5. Finally, rebuild megbrain_test to make sure the test results are correct.
import os.path
import logging
import hashlib
@@ -78,8 +78,9 @@ OprFormat tensor_formats_to_opr_format(TensorFormats tensor_format) {
case TensorFormats::CHWNc4:
return OprFormat::CHWN4;
default:
mgb_throw(MegBrainError, "tensor format(%u) is not supported",
static_cast<uint32_t>(tensor_format));
mgb_throw(
MegBrainError, "tensor format(%u) is not supported",
static_cast<uint32_t>(tensor_format));
}
}
@@ -92,28 +93,28 @@ public:
}
~ProfilerMock() {
// reset in memory cache
ProfilerCache::inst().set_impl(
std::make_unique<InMemoryPersistentCache>());
ProfilerCache::inst().set_impl(std::make_unique<InMemoryPersistentCache>());
}
private:
float profile_operator(const OperatorNodeBase* opr,
TensorFormats base_format,
TensorFormats tensor_format,
ReformatAttribute extra_attribute =
ReformatAttribute::DEFAULT) const override {
ProfilerCache::Key key{opr, tensor_formats_to_opr_format(tensor_format),
extra_attribute};
float profile_operator(
const OperatorNodeBase* opr, TensorFormats base_format,
TensorFormats tensor_format,
ReformatAttribute extra_attribute =
ReformatAttribute::DEFAULT) const override {
ProfilerCache::Key key{
opr, tensor_formats_to_opr_format(tensor_format), extra_attribute};
auto ret = ProfilerCache::inst().get(key);
if (ret.valid())
return ret.val();
mgb_assert(false);
}
float profile_operator(const OperatorNodeBase* opr,
const OprTensorFormatsConfiguration& base_config,
const OprTensorFormatsConfiguration& config,
ReformatAttribute extra_attribute =
ReformatAttribute::DEFAULT) const override {
float profile_operator(
const OperatorNodeBase* opr,
const OprTensorFormatsConfiguration& base_config,
const OprTensorFormatsConfiguration& config,
ReformatAttribute extra_attribute =
ReformatAttribute::DEFAULT) const override {
ProfilerCache::Key key{opr, config.opr_format, extra_attribute};
std::string tmp;
tmp.reserve(key.blob().size);
@@ -122,8 +123,9 @@ private:
return ret.val();
mgb_assert(false);
}
float profile_var_node(const VarNode* var, TensorFormats base_format,
const ReformatKey& key) const override {
float profile_var_node(
const VarNode* var, TensorFormats base_format,
const ReformatKey& key) const override {
ProfilerCache::Key pf_key{var, key};
auto ret = ProfilerCache::inst().get(pf_key);
if (ret.valid())
@@ -174,18 +176,17 @@ TEST(TestLayoutTransform, Resnet18_QS8) {
OprFormat::NCHW, TensorFormats::NCHW, Target::UNSPEC,
ReformatAttribute::AUTO_PADDING_NHWC};
auto ctx = std::make_unique<LayoutTransformContext>(
std::move(opr_list), std::move(available_tensor_formats),
attribute);
ctx->add_opr_config(opr::ConvBiasForward::typeinfo(),
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4,
OprFormat::NHWC})
.add_opr_config(opr::PoolingForward::typeinfo(),
{OprFormat::NCHW4, OprFormat::NCHW32,
OprFormat::NHWC, OprFormat::CHWN4});
std::move(opr_list), std::move(available_tensor_formats), attribute);
ctx->add_opr_config(
opr::ConvBiasForward::typeinfo(),
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4, OprFormat::NHWC})
.add_opr_config(
opr::PoolingForward::typeinfo(),
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NHWC,
OprFormat::CHWN4});
#if MGB_WITH_CACHED_TEST
auto profiler = std::make_unique<ProfilerMock>(
static_cast<const uint8_t*>(
TestLayoutTransform_Resnet18_QS8.data()),
static_cast<const uint8_t*>(TestLayoutTransform_Resnet18_QS8.data()),
TestLayoutTransform_Resnet18_QS8.size());
#else
auto profiler = ProfilerBase::make_cached_profiler(
@@ -278,8 +279,7 @@ TEST(TestLayoutTransform, Resnet18_QS4) {
OprFormat::NHWC, OprFormat::CHWN4});
#if MGB_WITH_CACHED_TEST
auto profiler = std::make_unique<ProfilerMock>(
static_cast<const uint8_t*>(
TestLayoutTransform_Resnet18_QS4.data()),
static_cast<const uint8_t*>(TestLayoutTransform_Resnet18_QS4.data()),
TestLayoutTransform_Resnet18_QS4.size());
#else
auto profiler = ProfilerBase::make_cached_profiler(
@@ -401,8 +401,7 @@ TEST(TestLayoutTransform, Detection_QS8) {
OprFormat::NHWC, OprFormat::CHWN4});
#if MGB_WITH_CACHED_TEST
auto profiler = std::make_unique<ProfilerMock>(
static_cast<const uint8_t*>(
TestLayoutTransform_Detection_QS8.data()),
static_cast<const uint8_t*>(TestLayoutTransform_Detection_QS8.data()),
TestLayoutTransform_Detection_QS8.size());
#else
auto profiler = ProfilerBase::make_cached_profiler(
@@ -479,8 +478,7 @@ TEST(TestLayoutTransform, Detection_QS4) {
OprFormat::NHWC, OprFormat::CHWN4});
#if MGB_WITH_CACHED_TEST
auto profiler = std::make_unique<ProfilerMock>(
static_cast<const uint8_t*>(
TestLayoutTransform_Detection_QS4.data()),
static_cast<const uint8_t*>(TestLayoutTransform_Detection_QS4.data()),
TestLayoutTransform_Detection_QS4.size());
#else
auto profiler = ProfilerBase::make_cached_profiler(
@@ -553,17 +551,16 @@ TEST(TestLayoutTransform, Wide) {
OprFormat::NCHW, TensorFormats::NCHW, Target::UNSPEC,
ReformatAttribute::DEFAULT};
auto ctx = std::make_unique<LayoutTransformContext>(
std::move(opr_list), std::move(available_tensor_formats),
attribute);
ctx->add_opr_config(opr::ConvBiasForward::typeinfo(),
{OprFormat::NCHW, OprFormat::NHWC});
std::move(opr_list), std::move(available_tensor_formats), attribute);
ctx->add_opr_config(
opr::ConvBiasForward::typeinfo(), {OprFormat::NCHW, OprFormat::NHWC});
#if MGB_WITH_CACHED_TEST
auto profiler = std::make_unique<ProfilerMock>(
static_cast<const uint8_t*>(TestLayoutTransform_Wide.data()),
TestLayoutTransform_Wide.size());
#else
auto profiler = ProfilerBase::make_cached_profiler(
"TestLayoutTransform.Wide.cache");
auto profiler =
ProfilerBase::make_cached_profiler("TestLayoutTransform.Wide.cache");
#endif
std::unique_ptr<SolverBase> solver{
new DynamicProgrammingSolver(std::move(profiler))};
@@ -674,8 +671,7 @@ TEST(TestLayoutTransform, DetectionHead) {
{OprFormat::NHWC, OprFormat::NCHW4, OprFormat::NCHW64});
#if MGB_WITH_CACHED_TEST
auto profiler = std::make_unique<ProfilerMock>(
static_cast<const uint8_t*>(
TestLayoutTransform_DetectionHead.data()),
static_cast<const uint8_t*>(TestLayoutTransform_DetectionHead.data()),
TestLayoutTransform_DetectionHead.size());
#else
auto profiler = ProfilerBase::make_cached_profiler(