GitOrigin-RevId: 743846f645
tags/v1.3.0
@@ -330,6 +330,8 @@ struct TensorLayout : public TensorShape { | |||
/* =================== properties =================== */ | |||
std::string to_string() const; | |||
std::string serialize() const; | |||
#endif // MEGDNN_CC_HOST | |||
/*! | |||
@@ -11,6 +11,7 @@ | |||
*/ | |||
#pragma once | |||
#include <type_traits> | |||
#include "megdnn/basic_types.h" | |||
#include "megdnn/handle.h" | |||
@@ -144,8 +145,11 @@ public: | |||
return {{handle_type(), type(), param()}, name(), is_reproducible()}; | |||
} | |||
Info::Desc desc() const { return {handle_type(), type(), param()}; } | |||
template <typename T> | |||
static void serialize_write_pod(const T& val, std::string& result) { | |||
static_assert(std::is_standard_layout<T>::value, "invalid type"); | |||
result.append(reinterpret_cast<const char*>(&val), sizeof(T)); | |||
} | |||
@@ -155,6 +159,7 @@ public: | |||
template <typename T> | |||
static T deserialize_read_pod(const std::string& data, size_t offset = 0) { | |||
static_assert(std::is_standard_layout<T>::value, "invalid type"); | |||
T ret; | |||
//! A pointer to an object or incomplete type may be converted to a | |||
//! pointer to a different object or incomplete type. If the resulting | |||
@@ -167,10 +172,69 @@ public: | |||
return ret; | |||
} | |||
template <typename T> | |||
static T deserialize_read_pod(const char* data, size_t offset = 0) { | |||
static_assert(std::is_standard_layout<T>::value, "invalid type"); | |||
T ret; | |||
//! A pointer to an object or incomplete type may be converted to a | |||
//! pointer to a different object or incomplete type. If the resulting | |||
//! pointer is not correctly aligned for the pointed-to type, the | |||
//! behavior is undefined. | |||
//! | |||
//! so here we should use memcpy instead of | |||
//! *reinterpret_cast<const T*>(&data[offset]); | |||
memcpy(&ret, data + offset, sizeof(T)); | |||
return ret; | |||
} | |||
enum class OprType : uint32_t { | |||
MATRIX_MUL_FORWARD, | |||
BATCHED_MATRIX_MUL_FORWARD, | |||
CONVOLUTION_FORWARD, | |||
CONVOLUTION_BACKWARD_DATA, | |||
CONVOLUTION_BACKWARD_FILTER, | |||
CONVOLUTION3D_FORWARD, | |||
CONVOLUTION3D_BACKWARD_DATA, | |||
CONVOLUTION3D_BACKWARD_FILTER, | |||
LOCAL_SHARE_FORWARD, | |||
LOCAL_SHARE_BACKWARD_DATA, | |||
LOCAL_SHARE_BACKWARD_FILTER, | |||
DEFORMABLE_CONV_FORWARD, | |||
DEFORMABLE_CONV_BACKWARD_DATA, | |||
DEFORMABLE_CONV_BACKWARD_FILTER, | |||
CONVBIAS_FORWARD, | |||
BATCH_CONV_FORWARD, | |||
}; | |||
struct SearchItem { | |||
OprType opr_type; | |||
//! serialized param | |||
std::string param; | |||
TensorLayoutArray layouts; | |||
}; | |||
/** | |||
* \brief get subopr list of the algo | |||
* | |||
* \param layouts origin layouts of the parent opr | |||
* \param opr parent opr | |||
*/ | |||
virtual std::vector<SearchItem> get_subopr_list(const TensorLayoutArray&, | |||
const OperatorBase*) const { | |||
return {}; | |||
} | |||
protected: | |||
Handle::HandleType m_handle_type = Handle::HandleType::NAIVE; | |||
}; | |||
//! policy for executing the operator | |||
struct ExecutionPolicy { | |||
//! INVALID_ALGO_TYPE algo_type means using heuristic | |||
Algorithm::Info::Desc algo; | |||
std::vector<ExecutionPolicy> sub_policy; | |||
}; | |||
/*! | |||
* \brief define Algorithm and ExecutionPolicy for oprs that have | |||
* multiple impl algos | |||
@@ -198,12 +262,6 @@ public: | |||
*/ | |||
virtual const char* get_algorithm_set_name() const = 0; | |||
//! policy for executing the operator | |||
struct ExecutionPolicy { | |||
//! INVALID_ALGO_TYPE algo_type means using heuristic | |||
AlgorithmInfo algo; | |||
}; | |||
ExecutionPolicy& execution_policy() { return m_execution_policy; } | |||
const ExecutionPolicy& execution_policy() const { | |||
@@ -464,6 +522,9 @@ protected: | |||
bool reproducible = false) = 0; | |||
}; | |||
} // namespace detail | |||
using Algorithm = detail::Algorithm; | |||
using ExecutionPolicy = detail::ExecutionPolicy; | |||
} // namespace megdnn | |||
#include "megdnn/internal/visibility_epilogue.h" | |||
@@ -25,17 +25,17 @@ namespace megdnn { | |||
*/ | |||
template <class Opr, typename... Args> | |||
typename Opr::AlgoBase* get_algorithm(Opr* opr, Args&&... args) { | |||
typename Opr::AlgorithmInfo ret; | |||
typename Opr::AlgorithmDesc ret; | |||
auto set = opr->execution_policy().algo; | |||
if (set.valid()) { | |||
ret = set; | |||
} else { | |||
ret = opr->get_algorithm_info_heuristic( | |||
std::forward<Args>(args)..., std::numeric_limits<size_t>::max(), | |||
false); | |||
false).desc; | |||
} | |||
return static_cast<typename Opr::AlgoBase*>( | |||
opr->get_algorithm_from_desc(ret.desc)); | |||
opr->get_algorithm_from_desc(ret)); | |||
} | |||
/*! | |||
@@ -46,7 +46,7 @@ template <class Opr, typename... Args> | |||
typename Opr::AlgoBase* get_algorithm_or_construct(Opr* opr, Args&&... args) { | |||
auto set = opr->execution_policy().algo; | |||
if (set.valid()) { | |||
return opr->algo_pack().construct_and_get_algo(set.desc); | |||
return opr->algo_pack().construct_and_get_algo(set); | |||
} else { | |||
return static_cast<typename Opr::AlgoBase*>( | |||
opr->get_algorithm_heuristic(std::forward<Args>(args)..., | |||
@@ -20,6 +20,7 @@ | |||
#include <mutex> | |||
#include <numeric> | |||
#include <tuple> | |||
#include <type_traits> | |||
using namespace megdnn; | |||
@@ -35,6 +36,26 @@ class DefaultErrorHandler final : public ErrorHandler { | |||
#endif | |||
} | |||
}; | |||
template <typename T> | |||
void serialize_pod(const T& val, std::string& result) { | |||
static_assert(std::is_standard_layout<T>::value, "invalid type"); | |||
result.append(reinterpret_cast<const char*>(&val), sizeof(T)); | |||
} | |||
template <typename T> | |||
void serialize_vec(const T* val, size_t size, std::string& result) { | |||
result.append(reinterpret_cast<const char*>(val), sizeof(T) * size); | |||
} | |||
template <typename T> | |||
T deserialize_pod(const std::string& data, size_t& offset) { | |||
T ret; | |||
memcpy(&ret, data.data() + offset, sizeof(T)); | |||
offset += sizeof(T); | |||
return ret; | |||
} | |||
} // namespace | |||
ErrorHandler* ErrorHandler::sm_inst; | |||
@@ -126,17 +147,23 @@ bool TensorShape::eq_shape(const TensorShape& rhs) const { | |||
size_t eq = 0; | |||
switch (ndim) { | |||
case 7: | |||
eq += shape[6] == rhs.shape[6]; MEGDNN_FALLTHRU | |||
eq += shape[6] == rhs.shape[6]; | |||
MEGDNN_FALLTHRU | |||
case 6: | |||
eq += shape[5] == rhs.shape[5]; MEGDNN_FALLTHRU | |||
eq += shape[5] == rhs.shape[5]; | |||
MEGDNN_FALLTHRU | |||
case 5: | |||
eq += shape[4] == rhs.shape[4]; MEGDNN_FALLTHRU | |||
eq += shape[4] == rhs.shape[4]; | |||
MEGDNN_FALLTHRU | |||
case 4: | |||
eq += shape[3] == rhs.shape[3]; MEGDNN_FALLTHRU | |||
eq += shape[3] == rhs.shape[3]; | |||
MEGDNN_FALLTHRU | |||
case 3: | |||
eq += shape[2] == rhs.shape[2]; MEGDNN_FALLTHRU | |||
eq += shape[2] == rhs.shape[2]; | |||
MEGDNN_FALLTHRU | |||
case 2: | |||
eq += shape[1] == rhs.shape[1]; MEGDNN_FALLTHRU | |||
eq += shape[1] == rhs.shape[1]; | |||
MEGDNN_FALLTHRU | |||
case 1: | |||
eq += shape[0] == rhs.shape[0]; | |||
} | |||
@@ -435,8 +462,8 @@ bool TensorLayout::try_reshape(TensorLayout& result, | |||
for (size_t i = 0; i < tshp.ndim; ++i) { | |||
if (!tshp.shape[i]) { | |||
megdnn_throw_if(!format.is_default(), tensor_reshape_error, | |||
megdnn_mangle(ssprintf("bad target tshp: %s", | |||
tshp.to_string().c_str()))); | |||
megdnn_mangle(ssprintf("bad target tshp: %s", | |||
tshp.to_string().c_str()))); | |||
is_empty_shape = true; | |||
break; | |||
} | |||
@@ -510,8 +537,36 @@ std::string TensorLayout::to_string() const { | |||
rst.append(" @ "); | |||
rst.append(format.impl()->to_string()); | |||
} | |||
rst.append(std::string(" ") + dtype.name()); | |||
rst.append("}"); | |||
return rst; | |||
} | |||
std::string TensorLayout::serialize() const { | |||
std::string rst; | |||
serialize_pod<size_t>(ndim, rst); | |||
serialize_vec<size_t>(shape, ndim, rst); | |||
serialize_vec<ptrdiff_t>(stride, ndim, rst); | |||
rst.append(format.impl()->to_string()); | |||
//! serialize dtype | |||
serialize_pod(dtype.enumv(), rst); | |||
if (dtype.has_param()) { | |||
switch (dtype.enumv()) { | |||
#define cb(_dt) \ | |||
case DTypeTrait<dtype::_dt>::enumv: \ | |||
serialize_pod(dtype::_dt::downcast_from(dtype).param(), rst); \ | |||
break; | |||
MEGDNN_FOREACH_PARAMETERIZED_DTYPE(cb) | |||
#undef cb | |||
default: | |||
megdnn_assert(false, | |||
"cannot serialize unknown parameterized DType"); | |||
break; | |||
} | |||
} | |||
return rst; | |||
} | |||
// vim: syntax=cpp.doxygen |
@@ -24,7 +24,7 @@ bool BatchedMatrixMulForwardImpl::AlgoBruteForce::is_available( | |||
const SizeArgs& args) const { | |||
MatrixMulForwardImpl mm{args.opr->handle()}; | |||
mm.param() = {args.opr->param().transposeA, args.opr->param().transposeB}; | |||
mm.execution_policy() = {m_algorithm->info()}; | |||
mm.execution_policy() = {m_algorithm->desc(), {}}; | |||
auto mm_layout_a = args.layout_a.remove_axis(0); | |||
auto mm_layout_b = args.layout_b.remove_axis(0); | |||
@@ -39,7 +39,7 @@ size_t BatchedMatrixMulForwardImpl::AlgoBruteForce::get_workspace_in_bytes( | |||
auto mm_opr = args.opr->handle()->create_operator<MatrixMulForward>(); | |||
mm_opr->param() = {args.opr->param().transposeA, | |||
args.opr->param().transposeB}; | |||
mm_opr->execution_policy() = {m_algorithm->info()}; | |||
mm_opr->execution_policy() = {m_algorithm->desc(), {}}; | |||
return mm_opr->get_workspace_in_bytes(args.layout_a, args.layout_b, | |||
args.layout_c); | |||
@@ -50,7 +50,7 @@ void BatchedMatrixMulForwardImpl::AlgoBruteForce::exec( | |||
auto&& mm_opr = args.opr->handle()->create_operator<MatrixMulForward>(); | |||
mm_opr->param() = {args.opr->param().transposeA, | |||
args.opr->param().transposeB}; | |||
mm_opr->execution_policy() = {m_algorithm->info()}; | |||
mm_opr->execution_policy() = {m_algorithm->desc(), {}}; | |||
rep(n, N) { | |||
TensorND A_, B_, C_; | |||
auto tensor_n_from_batch = [n](const TensorND& in, TensorND& out) { | |||
@@ -47,7 +47,7 @@ ConvBiasForwardImpl::AlgoBFloat16::float_args( | |||
change_dtype(fdst); | |||
opr->param() = args.opr->param(); | |||
opr->param().compute_mode = Param::ComputeMode::DEFAULT; | |||
opr->execution_policy() = {m_impl->info()}; | |||
opr->execution_policy() = {m_impl->desc(), {}}; | |||
return SizeArgs(opr, fsrc, ffilter, fbias, fz, fdst); | |||
} | |||
@@ -110,7 +110,7 @@ void ConvBiasForwardImpl::AlgoBFloat16::exec(const ExecArgs& args) const { | |||
auto convbias_opr = args.handle->create_operator<ConvBias>(); | |||
convbias_opr->param() = args.opr->param(); | |||
convbias_opr->param().compute_mode = Param::ComputeMode::DEFAULT; | |||
convbias_opr->execution_policy() = {m_impl->info()}; | |||
convbias_opr->execution_policy() = {m_impl->desc(), {}}; | |||
convbias_opr->exec(fsrc_tensor, ffilter_tensor, fbias_tensor, fz_tensor, | |||
fdst_tensor, nullptr, cvter.workspace()); | |||
} | |||
@@ -46,12 +46,8 @@ ConvolutionBackwardDataImpl::AlgoPack::AlgoPack() { | |||
megdnn_assert(all_algos_data == all_algos.data()); | |||
non_cudnn_algos.push_back(all_algos.rbegin()[0]); // group matmul | |||
size_t algo_size = all_algos.size(); | |||
for (size_t i=0; i<algo_size; ++i) { | |||
bfloat16_refhold.emplace_back(new AlgoBFloat16(all_algos[i])); | |||
all_algos.push_back(bfloat16_refhold.back().get()); | |||
bfloat16_algos.push_back(bfloat16_refhold.back().get()); | |||
} | |||
all_algos.push_back(&bfloat16); | |||
bfloat16_algos.push_back(&bfloat16); | |||
for (auto&& algo : all_algos) { | |||
m_all_algos_map.emplace(algo->info().desc, algo); | |||
@@ -170,28 +170,22 @@ public: | |||
class ConvolutionBackwardDataImpl::AlgoBFloat16 final : public AlgoBase { | |||
public: | |||
AlgoBFloat16(ConvolutionBackwardDataImpl::AlgoBase*); | |||
bool is_available(const SizeArgs& args) const override; | |||
size_t get_workspace_in_bytes(const SizeArgs& args) const override; | |||
void exec(const ExecArgs& args) const override; | |||
const char* name() const override { return m_name.c_str(); } | |||
std::vector<SearchItem> get_subopr_list( | |||
const TensorLayoutArray& layouts, | |||
const OperatorBase* opr) const override; | |||
const char* name() const override { | |||
return "CONVOLUTION_BACKWARD_DATD_BFLOAT16"; | |||
} | |||
bool is_reproducible() const override { return true; } | |||
private: | |||
std::string m_name; | |||
ConvolutionBackwardDataImpl::AlgoBase* m_algorithm = nullptr; | |||
SizeArgs float_args(const SizeArgs& args, ConvolutionBackwardDataImpl* opr, | |||
TensorLayout& fsrc, TensorLayout& ffilter, | |||
TensorLayout& fdst) const; | |||
WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const; | |||
MEGDNN_DECL_ALGO_TYPE(CUDA_BFLOAT16) | |||
std::string param() const override { | |||
std::string ret; | |||
serialize_write_pod(m_algorithm, ret); | |||
return ret; | |||
} | |||
}; | |||
//! implement group conv by another algo | |||
@@ -237,7 +231,7 @@ public: | |||
AlgoChanwiseSmall chanwise_small; | |||
std::vector<AlgoGroupConvGeneral> gconv; | |||
std::unordered_map<AlgoBase*, AlgoGroupConvGeneral*> algo2gconv; | |||
std::vector<std::unique_ptr<AlgoBFloat16>> bfloat16_refhold; | |||
AlgoBFloat16 bfloat16; | |||
std::vector<AlgoBase*> | |||
//! all algorithms | |||
@@ -17,33 +17,39 @@ using namespace megdnn; | |||
using namespace cuda; | |||
using namespace convolution; | |||
ConvolutionBackwardDataImpl::AlgoBFloat16::AlgoBFloat16( | |||
ConvolutionBackwardDataImpl::AlgoBase* algorithm) | |||
: m_algorithm(algorithm) { | |||
megdnn_assert_internal(algorithm); | |||
m_name = ssprintf("CONVOLUTION_BACKWARD_DATD_BFLOAT16:%s", | |||
m_algorithm->name()); | |||
} | |||
ConvolutionBackwardDataImpl::AlgoBase::SizeArgs | |||
ConvolutionBackwardDataImpl::AlgoBFloat16::float_args( | |||
const SizeArgs& args, ConvolutionBackwardDataImpl* opr, | |||
TensorLayout& ffilter, TensorLayout& fdiff, TensorLayout& fgrad) const { | |||
ffilter = *args.filter_layout; | |||
fdiff = *args.diff_layout; | |||
fgrad = *args.grad_layout; | |||
namespace { | |||
std::pair<TensorLayoutArray, ConvolutionBackwardDataImpl::Param> sub_opr_config( | |||
const TensorLayoutArray& layouts, | |||
const ConvolutionBackwardDataImpl* opr) { | |||
megdnn_assert(layouts.size() >= 3); | |||
std::pair<TensorLayoutArray, ConvolutionBackwardDataImpl::Param> ret; | |||
ret.first = layouts; | |||
auto change_dtype = [](TensorLayout& layout) { | |||
if (layout.dtype == dtype::BFloat16()) { | |||
layout.dtype = dtype::Float32(); | |||
} | |||
}; | |||
change_dtype(ffilter); | |||
change_dtype(fdiff); | |||
change_dtype(fgrad); | |||
opr->param() = args.opr->param(); | |||
opr->param().compute_mode = Param::ComputeMode::DEFAULT; | |||
opr->execution_policy() = {m_algorithm->info()}; | |||
return SizeArgs(opr, ffilter, fdiff, fgrad); | |||
change_dtype(ret.first[0]); | |||
change_dtype(ret.first[1]); | |||
change_dtype(ret.first[2]); | |||
ret.second = opr->param(); | |||
ret.second.compute_mode = | |||
ConvolutionBackwardData::Param::ComputeMode::DEFAULT; | |||
return ret; | |||
} | |||
} | |||
std::vector<Algorithm::SearchItem> | |||
ConvolutionBackwardDataImpl::AlgoBFloat16::get_subopr_list( | |||
const TensorLayoutArray& layouts, const OperatorBase* opr) const { | |||
auto&& config = sub_opr_config( | |||
layouts, static_cast<const ConvolutionBackwardDataImpl*>(opr)); | |||
std::string param_str; | |||
Algorithm::serialize_write_pod(config.second, param_str); | |||
return {{Algorithm::OprType::CONVOLUTION_BACKWARD_DATA, param_str, | |||
config.first}}; | |||
} | |||
bool ConvolutionBackwardDataImpl::AlgoBFloat16::is_available( | |||
@@ -51,24 +57,30 @@ bool ConvolutionBackwardDataImpl::AlgoBFloat16::is_available( | |||
TensorLayout ffilter, fdiff, fgrad; | |||
auto conv_back_data_opr = | |||
args.handle->create_operator<ConvolutionBackwardData>(); | |||
SizeArgs fargs = float_args( | |||
args, | |||
static_cast<ConvolutionBackwardDataImpl*>(conv_back_data_opr.get()), | |||
ffilter, fdiff, fgrad); | |||
auto&& config = sub_opr_config( | |||
{*args.filter_layout, *args.diff_layout, *args.grad_layout}, | |||
args.opr); | |||
conv_back_data_opr->param() = config.second; | |||
return args.diff_layout->dtype == args.filter_layout->dtype && | |||
args.diff_layout->dtype == dtype::BFloat16() && | |||
m_algorithm->is_available(fargs); | |||
get_algorithm(static_cast<ConvolutionBackwardDataImpl*>( | |||
conv_back_data_opr.get()), | |||
config.first[0], config.first[1], config.first[2]); | |||
} | |||
WorkspaceBundle ConvolutionBackwardDataImpl::AlgoBFloat16::get_workspace_bundle( | |||
void* ptr, const SizeArgs& args) const { | |||
TensorLayout ffilter, fdiff, fgrad; | |||
auto conv_back_data_opr = | |||
args.handle->create_operator<ConvolutionBackwardData>(); | |||
SizeArgs fargs = float_args( | |||
args, | |||
static_cast<ConvolutionBackwardDataImpl*>(conv_back_data_opr.get()), | |||
ffilter, fdiff, fgrad); | |||
if (args.opr->execution_policy().algo.valid()) { | |||
megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1); | |||
conv_back_data_opr->execution_policy() = | |||
args.opr->execution_policy().sub_policy[0]; | |||
} | |||
auto&& config = sub_opr_config( | |||
{*args.filter_layout, *args.diff_layout, *args.grad_layout}, | |||
args.opr); | |||
conv_back_data_opr->param() = config.second; | |||
SmallVector<size_t> sizes; | |||
auto get_workspace = [&sizes](const TensorLayout& src, | |||
const TensorLayout& dst) { | |||
@@ -76,10 +88,12 @@ WorkspaceBundle ConvolutionBackwardDataImpl::AlgoBFloat16::get_workspace_bundle( | |||
sizes.push_back(dst.span().dist_byte()); | |||
} | |||
}; | |||
get_workspace(*args.filter_layout, ffilter); | |||
get_workspace(*args.diff_layout, fdiff); | |||
get_workspace(*args.grad_layout, fgrad); | |||
sizes.push_back(m_algorithm->get_workspace_in_bytes(fargs)); | |||
get_workspace(*args.filter_layout, config.first[0]); | |||
get_workspace(*args.diff_layout, config.first[1]); | |||
get_workspace(*args.grad_layout, config.first[2]); | |||
sizes.push_back(conv_back_data_opr->get_workspace_in_bytes( | |||
config.first[0], config.first[1], config.first[2])); | |||
return {ptr, std::move(sizes)}; | |||
} | |||
@@ -103,9 +117,13 @@ void ConvolutionBackwardDataImpl::AlgoBFloat16::exec( | |||
{ | |||
auto conv_back_data_opr = | |||
args.handle->create_operator<ConvolutionBackwardData>(); | |||
if (args.opr->execution_policy().algo.valid()) { | |||
megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1); | |||
conv_back_data_opr->execution_policy() = | |||
args.opr->execution_policy().sub_policy[0]; | |||
} | |||
conv_back_data_opr->param() = args.opr->param(); | |||
conv_back_data_opr->param().compute_mode = Param::ComputeMode::DEFAULT; | |||
conv_back_data_opr->execution_policy() = {m_algorithm->info()}; | |||
conv_back_data_opr->exec(ffilter_tensor, fdiff_tensor, fgrad_tensor, | |||
cvter.workspace()); | |||
} | |||
@@ -42,7 +42,7 @@ ConvolutionBackwardFilterImpl::AlgoBFloat16::float_args( | |||
change_dtype(fgrad); | |||
opr->param() = args.opr->param(); | |||
opr->param().compute_mode = Param::ComputeMode::DEFAULT; | |||
opr->execution_policy() = {m_algorithm->info()}; | |||
opr->execution_policy() = {m_algorithm->desc(), {}}; | |||
return SizeArgs(opr, fsrc, fdiff, fgrad); | |||
} | |||
@@ -107,7 +107,7 @@ void ConvolutionBackwardFilterImpl::AlgoBFloat16::exec( | |||
conv_back_filter_opr->param() = args.opr->param(); | |||
conv_back_filter_opr->param().compute_mode = | |||
Param::ComputeMode::DEFAULT; | |||
conv_back_filter_opr->execution_policy() = {m_algorithm->info()}; | |||
conv_back_filter_opr->execution_policy() = {m_algorithm->desc(), {}}; | |||
conv_back_filter_opr->exec(fsrc_tensor, fdiff_tensor, fgrad_tensor, | |||
cvter.workspace()); | |||
} | |||
@@ -69,7 +69,7 @@ ConvolutionForwardImpl::conv_bias_extra_data(const TensorLayout& src, | |||
conv_param.dilate_h, | |||
conv_param.dilate_w, | |||
conv_param.compute_mode}; | |||
ret.convbias_opr->execution_policy() = {this->execution_policy().algo}; | |||
ret.convbias_opr->execution_policy() = {this->execution_policy().algo, {}}; | |||
return ret; | |||
} | |||
@@ -102,7 +102,7 @@ ConvolutionForwardImpl::get_algorithm_from_desc( | |||
conv_param.dilate_h, | |||
conv_param.dilate_w, | |||
conv_param.compute_mode}; | |||
convbias_opr->execution_policy() = {this->execution_policy().algo}; | |||
convbias_opr->execution_policy() = {this->execution_policy().algo, {}}; | |||
return static_cast<ConvBiasForwardImpl*>(convbias_opr.get()) | |||
->get_algorithm_from_desc(desc); | |||
@@ -160,7 +160,7 @@ void ConvolutionBackwardDataImpl::exec(_megdnn_tensor_in filter, | |||
_megdnn_tensor_out grad, | |||
_megdnn_workspace workspace) { | |||
AlgoBase::ExecArgs args(this, filter, diff, grad, workspace); | |||
auto algo = get_algorithm(this, filter.layout, args.filter_meta, | |||
auto algo = get_algorithm(this, filter.layout, | |||
diff.layout, grad.layout); | |||
algo->check_workspace(args, workspace).exec(args); | |||
} | |||
@@ -83,6 +83,17 @@ public: | |||
workspace_limit_in_bytes, reproducible) | |||
->info(); | |||
} | |||
AlgorithmInfo get_algorithm_info_heuristic(const TensorLayout& filter, | |||
const TensorLayout& diff, | |||
const TensorLayout& grad, | |||
size_t workspace_limit_in_bytes, | |||
bool reproducible) { | |||
return get_algorithm_heuristic(filter, diff, grad, | |||
workspace_limit_in_bytes, reproducible) | |||
->info(); | |||
} | |||
size_t get_workspace_in_bytes(const TensorLayout& filter, | |||
const TensorLayout& diff, | |||
const TensorLayout& grad) override; | |||
@@ -82,7 +82,7 @@ void MatrixMulForwardImpl::AlgoBFloat16::exec(const ExecArgs& args) const { | |||
args.opr->handle()->create_operator<MatrixMulForward>(); | |||
matmul_opr->param() = args.opr->param(); | |||
matmul_opr->param().compute_mode = Param::ComputeMode::DEFAULT; | |||
matmul_opr->execution_policy() = {m_algorithm->info()}; | |||
matmul_opr->execution_policy() = {m_algorithm->desc(), {}}; | |||
matmul_opr->exec(a, b, c, ctypecvt.workspace()); | |||
} | |||
ctypecvt.comp_to_dst_type(c, args.tensor_c); | |||
@@ -1,6 +1,5 @@ | |||
/** | |||
* \file dnn/src/fallback/conv_bias/opr_impl.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
g * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
* | |||
@@ -367,7 +366,7 @@ ConvBiasImpl::NCBKernSizeParam ConvBiasImpl::make_ncb_kern_size_param( | |||
"should be equal"); | |||
auto&& fm = check_layout_fwd(src, filter, dst); | |||
auto& conv_fm = reinterpret_cast<ConvolutionImpl::CanonizedFilterMeta&>(fm); | |||
size_t nr_threads = static_cast<naive::HandleImpl*>(handle()) | |||
->megcore_dispatcher() | |||
->nr_threads(); | |||
@@ -495,7 +494,7 @@ ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm_from_desc( | |||
ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm( | |||
const NCBKernSizeParam& param, size_t workspace_size) { | |||
if (auto algo = get_algorithm_from_desc(execution_policy().algo.desc)) { | |||
if (auto algo = get_algorithm_from_desc(execution_policy().algo)) { | |||
return algo; | |||
} | |||
if (!m_prev_selected_algo || | |||
@@ -387,7 +387,7 @@ ConvolutionImpl::Algorithm* ConvolutionImpl::get_algorithm_from_desc( | |||
ConvolutionImpl::Algorithm* ConvolutionImpl::get_algorithm( | |||
const NCBKernSizeParam& param, size_t workspace_size) { | |||
if (auto algo = get_algorithm_from_desc(execution_policy().algo.desc)) { | |||
if (auto algo = get_algorithm_from_desc(execution_policy().algo)) { | |||
return algo; | |||
} | |||
if (!m_prev_selected_algo || | |||
@@ -783,7 +783,7 @@ ConvolutionBackwardDataImpl::get_algorithm_from_desc( | |||
ConvolutionBackwardDataImpl::Algorithm* | |||
ConvolutionBackwardDataImpl::get_algorithm(const NCBKernSizeParam& param) { | |||
if (auto algo = get_algorithm_from_desc(execution_policy().algo.desc)) { | |||
if (auto algo = get_algorithm_from_desc(execution_policy().algo)) { | |||
return algo; | |||
} | |||
if (!m_prev_selected_algo || | |||
@@ -134,7 +134,7 @@ MatrixMul::Algorithm* MatrixMulImpl::get_algorithm_heuristic( | |||
size_t workspace_limit_in_bytes, bool reproducible) { | |||
auto kern_size_param = make_kern_size_param(A, B, C); | |||
if (auto algo = static_cast<AlgoBase*>( | |||
get_algorithm_from_desc(execution_policy().algo.desc))) { | |||
get_algorithm_from_desc(execution_policy().algo))) { | |||
megdnn_assert(algo->get_workspace(kern_size_param) < | |||
workspace_limit_in_bytes); | |||
auto cur = megdnn::get_reproducible_algo<MatrixMulImpl>(algo, | |||
@@ -382,7 +382,7 @@ float algo_benchmark(Benchmarker<Opr, T>& benchmark, TensorLayoutArray layouts, | |||
for (auto i : algos) { | |||
if (std::regex_match(i.name, | |||
std::regex("(" + algo_base + ")(.*)"))) { | |||
opr->execution_policy().algo = i; | |||
opr->execution_policy().algo = i.desc; | |||
auto used = benchmark.exec(layouts); | |||
min_used = std::min(min_used, used); | |||
printf("run algo: %s used: %f ms min_used: %f ms\n", i.name.c_str(), | |||
@@ -242,6 +242,11 @@ public: | |||
return *this; | |||
} | |||
Checker& reset_before_exec_callback() { | |||
m_before_exec_callback = nullptr; | |||
return *this; | |||
} | |||
//! set a tensors constraints function, for the purpose of manipulating | |||
//! tensors when testing. | |||
Checker& set_tensors_constraint( | |||
@@ -435,6 +440,17 @@ public: | |||
Testcase operator=(const Testcase&) = delete; | |||
}; | |||
struct ExecutionPolicyAlgoName { | |||
std::string name; | |||
std::vector<ExecutionPolicyAlgoName> sub_policy_names; | |||
ExecutionPolicyAlgoName(const char* name) : name{name} {} | |||
ExecutionPolicyAlgoName( | |||
const char* name, | |||
const std::vector<ExecutionPolicyAlgoName>& sub_policy) | |||
: name{name}, sub_policy_names{sub_policy} {} | |||
}; | |||
/*! | |||
* \brief a callable to check that given algorithm is used for heuristic | |||
* \param require_algo if its value is true, then requires | |||
@@ -444,48 +460,76 @@ public: | |||
*/ | |||
template <class Opr, typename OprAlgoProxy = OprAlgoProxy<Opr>> | |||
class AlgoChecker { | |||
std::string m_name; | |||
typename Opr::Algorithm* m_algo = nullptr; | |||
bool* m_require_algo; | |||
public: | |||
AlgoChecker(const char* name, bool* require_algo = nullptr) | |||
: m_name{name}, m_require_algo{require_algo} {} | |||
AlgoChecker(typename Opr::Algorithm* algo, bool* require_algo = nullptr) | |||
: m_algo{algo}, m_require_algo{require_algo} {} | |||
AlgoChecker(ExecutionPolicyAlgoName name, bool* require_algo = nullptr) | |||
: m_policy_name{name}, m_require_algo{require_algo} {} | |||
AlgoChecker(ExecutionPolicy policy, bool* require_algo = nullptr) | |||
: m_policy{policy}, m_require_algo{require_algo} {} | |||
static ExecutionPolicy construct_execution_policy_from_name( | |||
const ExecutionPolicyAlgoName& policy_name, | |||
const TensorLayoutArray& layouts, const std::string& param, | |||
Handle* handle) { | |||
ExecutionPolicy ret; | |||
megdnn_assert(layouts.size() == OprTrait<Opr>::arity); | |||
auto opr = handle->create_operator<Opr>(); | |||
opr->param() = | |||
Algorithm::deserialize_read_pod<typename Opr::Param>(param); | |||
for (auto algo_info : | |||
AlgoProxy<Opr, OprTrait<Opr>::arity>::get_all_algorithms_info( | |||
opr.get(), layouts)) { | |||
if (std::regex_match( | |||
algo_info.name, | |||
std::regex("(" + policy_name.name + ")(.*)"))) { | |||
ret.algo = algo_info.desc; | |||
} else { | |||
continue; | |||
} | |||
Algorithm* algo = opr->get_algorithm_from_desc(algo_info.desc); | |||
std::vector<Algorithm::SearchItem>&& sub_items = | |||
algo->get_subopr_list(layouts, opr.get()); | |||
FOREACH_OPR_TYPE_DISPATCH(sub_items, { | |||
ExecutionPolicy policy = | |||
AlgoChecker<_Opr>::construct_execution_policy_from_name( | |||
policy_name.sub_policy_names[_item_idx], | |||
_item.layouts, _item.param, handle); | |||
ret.sub_policy.push_back(policy); | |||
}); | |||
return ret; | |||
} | |||
return ret; | |||
} | |||
void operator()(Opr* opr, const CheckerHelper::TensorValueArray& arr) { | |||
TensorLayoutArray layouts; | |||
for (auto&& val : arr) { | |||
layouts.push_back(val.layout); | |||
} | |||
if (!m_policy_name.name.empty()) { | |||
std::string param_str; | |||
Algorithm::serialize_write_pod(opr->param(), param_str); | |||
m_policy = construct_execution_policy_from_name( | |||
m_policy_name, layouts, param_str, opr->handle()); | |||
ASSERT_TRUE(m_policy.algo.valid()) | |||
<< "algorithm " << m_policy_name.name << " not found"; | |||
} | |||
if (m_require_algo && *m_require_algo) { | |||
auto algo = | |||
OprAlgoProxy::get_algorithm_info_heuristic(opr, layouts); | |||
if (m_name.empty()) { | |||
ASSERT_EQ(m_algo->name(), algo.name.c_str()); | |||
} else { | |||
ASSERT_TRUE(std::regex_match( | |||
algo.name.c_str(), std::regex("(" + m_name + ")(.*)"))); | |||
} | |||
ASSERT_STREQ(opr->get_algorithm_from_desc(m_policy.algo)->name(), | |||
algo.name.c_str()); | |||
} else { | |||
if (m_name.empty()) { | |||
opr->execution_policy().algo = m_algo->info(); | |||
return; | |||
} else { | |||
for (auto i : | |||
OprAlgoProxy::get_all_algorithms_info(opr, layouts)) { | |||
if (std::regex_match(i.name, | |||
std::regex("(" + m_name + ")(.*)"))) { | |||
opr->execution_policy().algo = i; | |||
return; | |||
} | |||
} | |||
} | |||
ASSERT_TRUE(false) << "algorithm " << m_name << " not found"; | |||
opr->execution_policy() = m_policy; | |||
} | |||
} | |||
private: | |||
ExecutionPolicyAlgoName m_policy_name; | |||
ExecutionPolicy m_policy; | |||
bool* m_require_algo; | |||
}; | |||
} // namespace test | |||
@@ -580,7 +580,7 @@ void convolution::test_conv_config_combinations(int k_size, | |||
checker.set_rng(0, &rng).set_rng(1, &rng); | |||
for (auto algo : opr->get_all_algorithms_info(ily, fly, oly)) { | |||
used_algos.insert(algo.desc); | |||
opr->execution_policy().algo = algo; | |||
opr->execution_policy().algo = algo.desc; | |||
checker | |||
.set_epsilon(eps_getter(dtype == 1, 0, algo.name.c_str())) | |||
.execs({ishp, fshp, {}}); | |||
@@ -599,7 +599,7 @@ void convolution::test_conv_config_combinations(int k_size, | |||
opr->param() = param; | |||
for (auto algo: opr->get_all_algorithms_info(fly, oly, ily)) { | |||
used_algos_bwd_data.insert(algo.desc); | |||
opr->execution_policy().algo = algo; | |||
opr->execution_policy().algo = algo.desc; | |||
checker_bwd_data | |||
.set_epsilon(eps_getter(dtype == 1, 1, algo.name.c_str())) | |||
.execl({fly, oly, ily}); | |||
@@ -620,7 +620,7 @@ void convolution::test_conv_config_combinations(int k_size, | |||
opr->param() = param; | |||
for (auto algo: opr->get_all_algorithms_info(ily, oly, fly)) { | |||
used_algos_bwd_flt.insert(algo.desc); | |||
opr->execution_policy().algo = algo; | |||
opr->execution_policy().algo = algo.desc; | |||
checker_bwd_filter | |||
.set_epsilon(eps_getter(dtype == 1, 2, algo.name.c_str())) | |||
.execl({ily, oly, fly}); | |||
@@ -0,0 +1,47 @@ | |||
/** | |||
* \file dnn/test/common/fast_run_cache.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#include "test/common/fast_run_cache.h" | |||
#include "src/common/utils.h" | |||
using namespace megdnn; | |||
using namespace test; | |||
FastRunCache::SearchItemStorage::SearchItemStorage( | |||
const Algorithm::SearchItem& item) { | |||
Algorithm::serialize_write_pod(item.opr_type, data_hold); | |||
for (auto&& layout : item.layouts) { | |||
data_hold += layout.serialize(); | |||
} | |||
data_hold += item.param; | |||
} | |||
Algorithm::Info::Desc FastRunCache::get(const Algorithm::SearchItem& key) { | |||
SearchItemStorage key_storage(key); | |||
key_storage.init_hash(); | |||
auto iter = m_cache.find(key_storage); | |||
if (iter == m_cache.end()) { | |||
return {}; | |||
} | |||
return iter->second; | |||
} | |||
void FastRunCache::put(const Algorithm::SearchItem& key, | |||
const Algorithm::Info::Desc& val) { | |||
SearchItemStorage key_storage(key); | |||
key_storage.init_hash(); | |||
megdnn_assert(m_cache.find(key_storage) == m_cache.end()); | |||
m_cache[std::move(key_storage)] = val; | |||
} | |||
// vim: syntax=cpp.doxygen |
@@ -0,0 +1,58 @@ | |||
/** | |||
* \file dnn/test/common/fast_run_cache.h | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#pragma once | |||
#include "megdnn/oprs.h" | |||
#include "src/common/hash_ct.h" | |||
#include <unordered_map> | |||
namespace megdnn { | |||
namespace test { | |||
class FastRunCache { | |||
struct SearchItemStorage { | |||
std::string data_hold; | |||
size_t hash = 0; | |||
SearchItemStorage(const Algorithm::SearchItem& item); | |||
SearchItemStorage& init_hash() { | |||
hash = XXHash64CT::hash(data_hold.data(), data_hold.size(), | |||
20201225); | |||
return *this; | |||
} | |||
bool operator==(const SearchItemStorage& rhs) const { | |||
return data_hold == rhs.data_hold; | |||
} | |||
struct Hash { | |||
size_t operator()(const SearchItemStorage& s) const { | |||
return s.hash; | |||
} | |||
}; | |||
}; | |||
std::unordered_map<SearchItemStorage, Algorithm::Info::Desc, | |||
SearchItemStorage::Hash> | |||
m_cache; | |||
public: | |||
Algorithm::Info::Desc get(const Algorithm::SearchItem& key); | |||
void put(const Algorithm::SearchItem& key, | |||
const Algorithm::Info::Desc& val); | |||
}; | |||
} // namespace test | |||
} // namespace megdnn | |||
// vim: syntax=cpp.doxygen |
@@ -13,6 +13,7 @@ | |||
#include "test/common/deduce_layout_proxy.h" | |||
#include "test/common/exec_proxy.h" | |||
#include "test/common/fast_run_cache.h" | |||
#include "test/common/inspect_type.h" | |||
#include "test/common/opr_algo_proxy.h" | |||
#include "test/common/opr_trait.h" | |||
@@ -20,11 +21,104 @@ | |||
#include "test/common/workspace_wrapper.h" | |||
#include <algorithm> | |||
#include <limits> | |||
#include <memory> | |||
#include <unordered_map> | |||
namespace megdnn { | |||
namespace test { | |||
template <Algorithm::OprType> | |||
struct OprFromOprTypeTrait; | |||
template <typename Opr> | |||
struct OprTypeFromOprTrait; | |||
#define cb(_opr_type, _opr) \ | |||
template <> \ | |||
struct OprFromOprTypeTrait<Algorithm::OprType::_opr_type> { \ | |||
using Opr = megdnn::_opr; \ | |||
}; \ | |||
template <> \ | |||
struct OprTypeFromOprTrait<megdnn::_opr> { \ | |||
constexpr static Algorithm::OprType opr_type = \ | |||
Algorithm::OprType::_opr_type; \ | |||
} | |||
cb(MATRIX_MUL_FORWARD, MatrixMulForward); | |||
cb(CONVOLUTION_FORWARD, ConvolutionForward); | |||
cb(CONVOLUTION_BACKWARD_DATA, ConvolutionBackwardData); | |||
cb(CONVOLUTION_BACKWARD_FILTER, ConvolutionBackwardFilter); | |||
cb(CONVOLUTION3D_FORWARD, Convolution3DForward); | |||
cb(CONVOLUTION3D_BACKWARD_DATA, Convolution3DBackwardData); | |||
cb(CONVOLUTION3D_BACKWARD_FILTER, Convolution3DBackwardFilter); | |||
cb(LOCAL_SHARE_FORWARD, LocalShareForward); | |||
cb(LOCAL_SHARE_BACKWARD_DATA, LocalShareBackwardData); | |||
cb(LOCAL_SHARE_BACKWARD_FILTER, LocalShareBackwardFilter); | |||
cb(DEFORMABLE_CONV_FORWARD, DeformableConvForward); | |||
cb(DEFORMABLE_CONV_BACKWARD_DATA, DeformableConvBackwardData); | |||
cb(DEFORMABLE_CONV_BACKWARD_FILTER, DeformableConvBackwardFilter); | |||
cb(BATCH_CONV_FORWARD, BatchConvBiasForward); | |||
cb(CONVBIAS_FORWARD, ConvBiasForward); | |||
#undef cb | |||
// clang-format off | |||
#define FOREACH_OPR_TYPE(cb) \ | |||
cb(MATRIX_MUL_FORWARD) \ | |||
cb(CONVOLUTION_FORWARD) \ | |||
cb(CONVOLUTION_BACKWARD_DATA) \ | |||
cb(CONVOLUTION_BACKWARD_FILTER) \ | |||
cb(CONVOLUTION3D_FORWARD) \ | |||
cb(CONVOLUTION3D_BACKWARD_DATA) \ | |||
cb(CONVOLUTION3D_BACKWARD_FILTER) \ | |||
cb(LOCAL_SHARE_FORWARD) \ | |||
cb(LOCAL_SHARE_BACKWARD_DATA) \ | |||
cb(LOCAL_SHARE_BACKWARD_FILTER) \ | |||
cb(DEFORMABLE_CONV_FORWARD) \ | |||
cb(DEFORMABLE_CONV_BACKWARD_DATA) \ | |||
cb(DEFORMABLE_CONV_BACKWARD_FILTER) \ | |||
cb(BATCH_CONV_FORWARD) \ | |||
cb(CONVBIAS_FORWARD) | |||
#define FOREACH_OPR_TYPE_WITH_STMT(cb, stmt) \ | |||
cb(MATRIX_MUL_FORWARD, stmt) \ | |||
cb(CONVOLUTION_FORWARD, stmt) \ | |||
cb(CONVOLUTION_BACKWARD_DATA, stmt) \ | |||
cb(CONVOLUTION_BACKWARD_FILTER, stmt) \ | |||
cb(CONVOLUTION3D_FORWARD, stmt) \ | |||
cb(CONVOLUTION3D_BACKWARD_DATA, stmt) \ | |||
cb(CONVOLUTION3D_BACKWARD_FILTER, stmt) \ | |||
cb(LOCAL_SHARE_FORWARD, stmt) \ | |||
cb(LOCAL_SHARE_BACKWARD_DATA, stmt) \ | |||
cb(LOCAL_SHARE_BACKWARD_FILTER, stmt) \ | |||
cb(DEFORMABLE_CONV_FORWARD, stmt) \ | |||
cb(DEFORMABLE_CONV_BACKWARD_DATA, stmt) \ | |||
cb(DEFORMABLE_CONV_BACKWARD_FILTER, stmt) \ | |||
cb(BATCH_CONV_FORWARD, stmt) \ | |||
cb(CONVBIAS_FORWARD, stmt) | |||
// clang-format on | |||
#define _OPR_TYPE_CASE(_opr_type, _stmt) \ | |||
case Algorithm::OprType::_opr_type: { \ | |||
using _Opr = typename OprFromOprTypeTrait< \ | |||
Algorithm::OprType::_opr_type>::Opr; \ | |||
_stmt; \ | |||
break; \ | |||
} | |||
#define FOREACH_OPR_TYPE_DISPATCH(_search_items, _stmt) \ | |||
for (size_t _item_idx = 0; _item_idx < _search_items.size(); \ | |||
_item_idx++) { \ | |||
auto&& _item = _search_items[_item_idx]; \ | |||
switch (_item.opr_type) { \ | |||
FOREACH_OPR_TYPE_WITH_STMT(_OPR_TYPE_CASE, _stmt) \ | |||
default: \ | |||
megdnn_throw("unknown opr_type"); \ | |||
} \ | |||
} | |||
template <typename Opr, size_t arity = OprTrait<Opr>::arity, | |||
bool has_workspace = OprTrait<Opr>::has_workspace, | |||
bool can_deduce_layout = OprTrait<Opr>::can_deduce_layout> | |||
@@ -130,10 +224,11 @@ struct OprProxy<SplitForward> : DeduceLayoutProxy<SplitForward, 0, false> { | |||
}; | |||
//! OprProxy impl for tenary oprs with profiling support | |||
template <class Opr, int arity> | |||
template <class Opr> | |||
struct OprProxyProfilingBase | |||
: public DeduceLayoutProxy<Opr, arity, | |||
: public DeduceLayoutProxy<Opr, OprTrait<Opr>::arity, | |||
OprTrait<Opr>::can_deduce_layout> { | |||
static constexpr int arity = OprTrait<Opr>::arity; | |||
size_t warmup_times = 10, exec_times = 100; | |||
//! whether to enable profiling | |||
@@ -142,7 +237,7 @@ struct OprProxyProfilingBase | |||
//! target algo setup by profiler; it can also be directly specified by the | |||
//! caller | |||
typename Opr::AlgorithmInfo target_algo_info; | |||
ExecutionPolicy target_execution_policy; | |||
OprProxyProfilingBase(bool profile = false) { m_profiling = profile; } | |||
@@ -168,6 +263,154 @@ struct OprProxyProfilingBase | |||
return ret; | |||
} | |||
/** | |||
* flatten search space in postorder traversal | |||
* The subopr search construct a search tree | |||
* | |||
* A | |||
* / \ | |||
* B1B2 C | |||
* / \ | |||
* D1D2D3 E | |||
* We use postorder traverse the search tree. | |||
* D1 -> D2 -> D3 -> E -> B1 -> B2 -> C -> A | |||
*/ | |||
static std::vector<Algorithm::SearchItem> flatten_search_space( | |||
const TensorLayoutArray layouts, const std::string& param, | |||
Handle* handle) { | |||
megdnn_assert(layouts.size() == arity); | |||
auto opr = handle->create_operator<Opr>(); | |||
opr->param() = | |||
Algorithm::deserialize_read_pod<typename Opr::Param>(param); | |||
std::vector<Algorithm::SearchItem> ret; | |||
for (auto algo_info : AlgoProxy<Opr, arity>::get_all_algorithms_info( | |||
opr.get(), layouts)) { | |||
Algorithm* algo = opr->get_algorithm_from_desc(algo_info.desc); | |||
std::vector<Algorithm::SearchItem>&& sub_items = | |||
algo->get_subopr_list(layouts, opr.get()); | |||
FOREACH_OPR_TYPE_DISPATCH(sub_items, { | |||
auto space = OprProxyProfilingBase<_Opr>::flatten_search_space( | |||
_item.layouts, _item.param, handle); | |||
ret.insert(ret.end(), space.begin(), space.end()); | |||
}); | |||
} | |||
ret.push_back({OprTypeFromOprTrait<Opr>::opr_type, param, layouts}); | |||
return ret; | |||
} | |||
static void construct_execution_policy( | |||
const TensorLayoutArray& layouts, const std::string& param, | |||
Handle* handle, FastRunCache& cache, | |||
ExecutionPolicy& policy) { | |||
megdnn_assert(layouts.size() == arity); | |||
auto opr = handle->create_operator<Opr>(); | |||
opr->param() = | |||
Algorithm::deserialize_read_pod<typename Opr::Param>(param); | |||
if (!policy.algo.valid()) { | |||
policy.algo = cache.get(Algorithm::SearchItem{ | |||
OprTypeFromOprTrait<Opr>::opr_type, param, layouts}); | |||
megdnn_assert(policy.algo.valid(), | |||
"No cache found, maybe some error occured in " | |||
"flatten_search_space or get_subopr_list"); | |||
} | |||
policy.sub_policy.clear(); | |||
Algorithm* algo = opr->get_algorithm_from_desc(policy.algo); | |||
std::vector<Algorithm::SearchItem>&& sub_items = | |||
algo->get_subopr_list(layouts, opr.get()); | |||
FOREACH_OPR_TYPE_DISPATCH(sub_items, { | |||
policy.sub_policy.push_back({}); | |||
OprProxyProfilingBase<_Opr>::construct_execution_policy( | |||
_item.layouts, _item.param, handle, cache, | |||
policy.sub_policy.back()); | |||
}); | |||
return; | |||
} | |||
/** | |||
* \brief search and get the best execution_policy | |||
*/ | |||
static void search(const TensorLayoutArray& layouts, | |||
const std::string& param, | |||
WorkspaceWrapper& workspace_wrapper, Handle* handle, | |||
size_t warmup_times, size_t exec_times, | |||
FastRunCache& cache) { | |||
megdnn_assert(layouts.size() == arity); | |||
auto opr = handle->create_operator<Opr>(); | |||
opr->param() = | |||
Algorithm::deserialize_read_pod<typename Opr::Param>(param); | |||
SmallVector<size_t> sizes_in_bytes; | |||
for (const auto& layout : layouts) { | |||
sizes_in_bytes.push_back(layout.span().dist_byte()); | |||
} | |||
float min_time = std::numeric_limits<float>::max(); | |||
Algorithm::Info::Desc best_algo; | |||
std::string log_info = "Profiling start: "; | |||
for (auto&& layout : layouts) { | |||
log_info += layout.to_string() + " "; | |||
} | |||
megdnn_log("%s", log_info.c_str()); | |||
best_algo = cache.get(Algorithm::SearchItem{ | |||
OprTypeFromOprTrait<Opr>::opr_type, param, layouts}); | |||
if (best_algo.valid()) { | |||
auto&& algo = opr->get_algorithm_from_desc(best_algo); | |||
MEGDNN_MARK_USED_VAR(algo); | |||
megdnn_log("Find best algo %s in cache", algo->name()); | |||
return; | |||
} | |||
for (auto algo : AlgoProxy<Opr, arity>::get_all_algorithms_info( | |||
opr.get(), layouts)) { | |||
//! construct execution_policy | |||
opr->execution_policy().algo = algo.desc; | |||
construct_execution_policy(layouts, param, handle, cache, | |||
opr->execution_policy()); | |||
auto workspace_size = AlgoProxy<Opr, arity>::get_workspace_in_bytes( | |||
opr.get(), layouts); | |||
sizes_in_bytes.push_back(workspace_size); | |||
WorkspaceBundle wb(nullptr, sizes_in_bytes); | |||
workspace_wrapper.update(wb.total_size_in_bytes()); | |||
wb.set(workspace_wrapper.workspace().raw_ptr); | |||
TensorNDArray tensors; | |||
for (size_t i = 0; i < arity; i++) { | |||
tensors.push_back({wb.get(i), layouts[i]}); | |||
} | |||
for (size_t times = 0; times < warmup_times; ++times) { | |||
AlgoProxy<Opr, arity>::exec(opr.get(), tensors, | |||
wb.get_workspace(arity)); | |||
} | |||
megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||
Timer timer; | |||
timer.start(); | |||
for (size_t times = 0; times < exec_times; ++times) { | |||
AlgoProxy<Opr, arity>::exec(opr.get(), tensors, | |||
wb.get_workspace(arity)); | |||
} | |||
megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||
timer.stop(); | |||
megdnn_log("%.3fms %s", timer.get_time_in_us() / 1e3, | |||
algo.name.c_str()); | |||
if (min_time > timer.get_time_in_us()) { | |||
min_time = timer.get_time_in_us(); | |||
best_algo = algo.desc; | |||
} | |||
sizes_in_bytes.pop_back(); | |||
} | |||
auto&& algo = opr->get_algorithm_from_desc(best_algo); | |||
MEGDNN_MARK_USED_VAR(algo); | |||
megdnn_log("Profiling end, got best algo: %s", algo->name()); | |||
cache.put(Algorithm::SearchItem{OprTypeFromOprTrait<Opr>::opr_type, | |||
param, layouts}, | |||
best_algo); | |||
} | |||
void exec(Opr* opr, const TensorNDArray& tensors) { | |||
megdnn_assert(tensors.size() == arity); | |||
if (!W.valid()) { | |||
@@ -177,39 +420,26 @@ struct OprProxyProfilingBase | |||
for (auto&& tensor : tensors) { | |||
layouts.push_back(tensor.layout); | |||
} | |||
if (m_profiling && !target_algo_info.valid()) { | |||
size_t min_time = std::numeric_limits<size_t>::max(); | |||
for (auto algo : | |||
AlgoProxy<Opr, arity>::get_all_algorithms_info(opr, layouts)) { | |||
opr->execution_policy().algo = algo; | |||
auto workspace_size = | |||
AlgoProxy<Opr, arity>::get_workspace_in_bytes(opr, | |||
layouts); | |||
W.update(workspace_size); | |||
for (size_t times = 0; times < warmup_times; ++times) | |||
AlgoProxy<Opr, arity>::exec(opr, tensors, W.workspace()); | |||
megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||
Timer timer; | |||
timer.start(); | |||
for (size_t times = 0; times < exec_times; ++times) { | |||
AlgoProxy<Opr, arity>::exec(opr, tensors, W.workspace()); | |||
} | |||
megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||
timer.stop(); | |||
printf("%.3fms %s\n", timer.get_time_in_us() / 1e3, | |||
algo.name.c_str()); | |||
if (min_time > timer.get_time_in_us()) { | |||
min_time = timer.get_time_in_us(); | |||
target_algo_info = algo; | |||
} | |||
} | |||
opr->execution_policy().algo = target_algo_info; | |||
if (m_profiling && !target_execution_policy.algo.valid()) { | |||
FastRunCache cache; | |||
std::string param_str; | |||
Algorithm::serialize_write_pod(opr->param(), param_str); | |||
auto&& search_items = | |||
flatten_search_space(layouts, param_str, opr->handle()); | |||
FOREACH_OPR_TYPE_DISPATCH(search_items, { | |||
OprProxyProfilingBase<_Opr>::search(_item.layouts, param_str, W, | |||
opr->handle(), warmup_times, | |||
exec_times, cache); | |||
}); | |||
construct_execution_policy(layouts, param_str, opr->handle(), cache, | |||
opr->execution_policy()); | |||
target_execution_policy = opr->execution_policy(); | |||
auto workspace_size = | |||
AlgoProxy<Opr, arity>::get_workspace_in_bytes(opr, layouts); | |||
W.update(workspace_size); | |||
} | |||
if (!target_algo_info.valid()) { | |||
if (!target_execution_policy.algo.valid()) { | |||
auto workspace_size = | |||
AlgoProxy<Opr, arity>::get_workspace_in_bytes(opr, layouts); | |||
W.update(workspace_size); | |||
@@ -218,30 +448,32 @@ struct OprProxyProfilingBase | |||
} | |||
}; | |||
#define DEF_PROF(c, arity) \ | |||
template <> \ | |||
struct OprProxy<c> : public OprProxyProfilingBase<c, arity> { \ | |||
using OprProxyProfilingBase<c, arity>::OprProxyProfilingBase; \ | |||
#define DEF_PROF(c) \ | |||
template <> \ | |||
struct OprProxy<c> : public OprProxyProfilingBase<c> { \ | |||
using OprProxyProfilingBase<c>::OprProxyProfilingBase; \ | |||
} | |||
DEF_PROF(ConvolutionForward, 3); | |||
DEF_PROF(ConvolutionBackwardData, 3); | |||
DEF_PROF(ConvolutionBackwardFilter, 3); | |||
DEF_PROF(LocalShareForward, 3); | |||
DEF_PROF(LocalShareBackwardData, 3); | |||
DEF_PROF(LocalShareBackwardFilter, 3); | |||
DEF_PROF(MatrixMulForward); | |||
DEF_PROF(ConvolutionForward); | |||
DEF_PROF(ConvolutionBackwardData); | |||
DEF_PROF(ConvolutionBackwardFilter); | |||
DEF_PROF(LocalShareForward); | |||
DEF_PROF(LocalShareBackwardData); | |||
DEF_PROF(LocalShareBackwardFilter); | |||
DEF_PROF(DeformableConvForward, 5); | |||
DEF_PROF(DeformableConvBackwardFilter, 5); | |||
DEF_PROF(BatchConvBiasForward, 5); | |||
DEF_PROF(ConvBiasForward, 5); | |||
DEF_PROF(DeformableConvForward); | |||
DEF_PROF(DeformableConvBackwardFilter); | |||
DEF_PROF(BatchConvBiasForward); | |||
DEF_PROF(ConvBiasForward); | |||
DEF_PROF(DeformableConvBackwardData, 8); | |||
DEF_PROF(DeformableConvBackwardData); | |||
#undef DEF_PROF | |||
template <class Opr, int arity> | |||
struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr, arity> { | |||
using Base = OprProxyProfilingBase<Opr, arity>; | |||
template <class Opr> | |||
struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr> { | |||
using Base = OprProxyProfilingBase<Opr>; | |||
static constexpr int arity = OprTrait<Opr>::arity; | |||
void exec(Opr* opr, const TensorNDArray& tensors) { | |||
megdnn_assert(tensors.size() == arity); | |||
if (!Base::W.valid()) { | |||
@@ -252,11 +484,11 @@ struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr, arity> { | |||
for (auto&& tensor : tensors) { | |||
layouts.push_back(tensor.layout); | |||
} | |||
if (Base::m_profiling && !Base::target_algo_info.desc.valid()) { | |||
if (Base::m_profiling && !Base::target_execution_policy.algo.valid()) { | |||
size_t min_time = std::numeric_limits<size_t>::max(); | |||
for (auto algo : | |||
AlgoProxy<Opr, arity>::get_all_algorithms_info(opr, layouts)) { | |||
opr->execution_policy().algo = algo; | |||
opr->execution_policy().algo = algo.desc; | |||
auto preprocess_tensors = | |||
weight_prerocess(opr, tensors, algo.desc); | |||
@@ -288,12 +520,12 @@ struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr, arity> { | |||
algo.name.c_str()); | |||
if (min_time > timer.get_time_in_us()) { | |||
min_time = timer.get_time_in_us(); | |||
Base::target_algo_info = algo; | |||
Base::target_execution_policy.algo = algo.desc; | |||
} | |||
} | |||
opr->execution_policy().algo = Base::target_algo_info; | |||
auto preprocess_tensors = | |||
weight_prerocess(opr, tensors, Base::target_algo_info.desc); | |||
opr->execution_policy() = Base::target_execution_policy; | |||
auto preprocess_tensors = weight_prerocess( | |||
opr, tensors, Base::target_execution_policy.algo); | |||
megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||
typename Opr::PreprocessedFilter preprocessed_filter{ | |||
nullptr, *preprocess_tensors}; | |||
@@ -301,12 +533,12 @@ struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr, arity> { | |||
opr, layouts, &preprocessed_filter); | |||
Base::W.update(workspace_size); | |||
} | |||
auto preprocess_tensors = | |||
weight_prerocess(opr, tensors, Base::target_algo_info.desc); | |||
auto preprocess_tensors = weight_prerocess( | |||
opr, tensors, Base::target_execution_policy.algo); | |||
megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||
typename Opr::PreprocessedFilter preprocessed_filter{ | |||
nullptr, *preprocess_tensors}; | |||
if (!Base::target_algo_info.valid()) { | |||
if (!Base::target_execution_policy.algo.valid()) { | |||
auto workspace_size = AlgoProxy<Opr, arity>::get_workspace_in_bytes( | |||
opr, layouts, &preprocessed_filter); | |||
Base::W.update(workspace_size); | |||
@@ -342,16 +574,15 @@ struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr, arity> { | |||
} | |||
}; | |||
#define DEF_PROF(c, arity) \ | |||
template <> \ | |||
struct OprWeightPreprocessProxy<c> \ | |||
: public OprWeightPreprocessProxyImpl<c, arity> { \ | |||
using OprWeightPreprocessProxyImpl< \ | |||
c, arity>::OprWeightPreprocessProxyImpl; \ | |||
#define DEF_PROF(c) \ | |||
template <> \ | |||
struct OprWeightPreprocessProxy<c> \ | |||
: public OprWeightPreprocessProxyImpl<c> { \ | |||
using OprWeightPreprocessProxyImpl<c>::OprWeightPreprocessProxyImpl; \ | |||
} | |||
DEF_PROF(ConvolutionForward, 3); | |||
DEF_PROF(ConvBias, 5); | |||
DEF_PROF(ConvolutionForward); | |||
DEF_PROF(ConvBias); | |||
#undef DEF_PROF | |||
} // namespace test | |||
@@ -279,7 +279,7 @@ void benchmark_target_algo(Handle* handle, const std::vector<BenchArgs>& args, | |||
benchmarker.set_param(bparam); | |||
if (!algo) { | |||
benchmarker.proxy()->target_algo_info.reset(); | |||
benchmarker.proxy()->target_execution_policy.algo.reset(); | |||
} | |||
auto time_in_ms = | |||
benchmarker.execs( | |||
@@ -514,7 +514,7 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_FWD) { | |||
auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, | |||
size_t FW) { | |||
checker.proxy()->target_algo_info.reset(); | |||
checker.proxy()->target_execution_policy.algo.reset(); | |||
checker.execs({{N, C, IH, IW}, {C, 1, 1, FH, FW}, {}}); | |||
}; | |||
@@ -538,7 +538,7 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_BWD_DATA) { | |||
auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, | |||
size_t FW) { | |||
checker.proxy()->target_algo_info.reset(); | |||
checker.proxy()->target_execution_policy.algo.reset(); | |||
checker.execs({{C, 1, 1, FH, FW}, | |||
{N, C, IH - FH + 1, IW - FW + 1}, | |||
{N, C, IH, IW}}); | |||
@@ -564,7 +564,7 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_BWD_FILTER) { | |||
auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, | |||
size_t FW) { | |||
checker.proxy()->target_algo_info.reset(); | |||
checker.proxy()->target_execution_policy.algo.reset(); | |||
checker.execs({{N, C, IH, IW}, | |||
{N, C, IH - FH + 1, IW - FW + 1}, | |||
{C, 1, 1, FH, FW}}); | |||
@@ -614,7 +614,7 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_ALL_ALGO_FORWARD) { | |||
.set_dtype(2, dtype::Float32()) | |||
.set_rng(0, &rng) | |||
.set_rng(1, &rng); | |||
bencher.proxy()->target_algo_info.reset(); | |||
bencher.proxy()->target_execution_policy.algo.reset(); | |||
auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS; | |||
bencher.set_param(param) | |||
@@ -623,10 +623,10 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_ALL_ALGO_FORWARD) { | |||
.set_dtype(2, dtype::Float16()) | |||
.set_rng(0, &rng) | |||
.set_rng(1, &rng); | |||
bencher.proxy()->target_algo_info.reset(); | |||
bencher.proxy()->target_execution_policy.algo.reset(); | |||
auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS; | |||
bencher.proxy()->target_algo_info.reset(); | |||
bencher.proxy()->target_execution_policy.algo.reset(); | |||
param.compute_mode = param::Convolution::ComputeMode::FLOAT32; | |||
bencher.set_param(param); | |||
auto time_in_ms_pseudo_fp16 = bencher.execs({src, filter, {}}) / RUNS; | |||
@@ -168,7 +168,7 @@ void benchmark_target_algo( | |||
benchmarker.set_param(param); | |||
if (!algo) { | |||
benchmarker.proxy()->target_algo_info.reset(); | |||
benchmarker.proxy()->target_execution_policy.algo.reset(); | |||
} | |||
TensorShape src{arg.n, arg.ci, arg.hi, arg.wi}, | |||
filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1}, | |||
@@ -327,7 +327,7 @@ void benchmark_target_algo_with_cudnn_tsc( | |||
benchmarker.set_param(param); | |||
if (!algo) { | |||
benchmarker.proxy()->target_algo_info.reset(); | |||
benchmarker.proxy()->target_execution_policy.algo.reset(); | |||
} | |||
TensorShape src{arg.n, arg.ci, arg.hi, arg.wi}, | |||
filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1}, | |||
@@ -8,6 +8,7 @@ | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
*/ | |||
#include "megdnn/dtype.h" | |||
#include "megdnn/oprs.h" | |||
#include "megdnn/opr_param_defs.h" | |||
#include "test/cuda/fixture.h" | |||
@@ -223,14 +224,19 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) | |||
.set_epsilon(1e-1) | |||
.set_param(arg.param) | |||
.exec(TensorLayoutArray{filter, dst, src}); | |||
src.dtype = dst.dtype = filter.dtype = dtype::BFloat16(); | |||
checker. | |||
set_rng(0, &rng). | |||
set_rng(1, &rng). | |||
set_epsilon(1e-1). | |||
set_param(arg.param). | |||
exec(TensorLayoutArray{filter, dst, src}); | |||
} | |||
checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>( | |||
ExecutionPolicyAlgoName{"CONVOLUTION_BACKWARD_DATD_BFLOAT16", | |||
{{"MATMUL", {}}}})); | |||
src.dtype = dst.dtype = filter.dtype = dtype::BFloat16(); | |||
arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32; | |||
checker.set_rng(0, &rng) | |||
.set_rng(1, &rng) | |||
.set_epsilon(1e-1) | |||
.set_param(arg.param) | |||
.exec(TensorLayoutArray{filter, dst, src}); | |||
checker.reset_before_exec_callback(); | |||
checker.opr()->execution_policy() = {}; | |||
} | |||
} | |||
@@ -382,32 +388,35 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_1) { | |||
#if MEGDNN_WITH_BENCHMARK | |||
TEST_F(CUDA, CONV_FWD_BENCHMARK) { | |||
auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t SH=1, | |||
size_t SW=1, size_t FH=1, size_t FW=1, size_t PH=0, size_t PW=0, bool fp16io_c32=false) { | |||
auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, | |||
size_t SH = 1, size_t SW = 1, size_t FH = 1, size_t FW = 1, | |||
size_t PH = 0, size_t PW = 0, bool fp16io_c32 = false) { | |||
auto benchmarker = Benchmarker<ConvolutionForward>(handle_cuda()); | |||
benchmarker.set_dtype(0, dtype::Float16()) | |||
.set_dtype(1, dtype::Float16()) | |||
.set_dtype(2, dtype::Float16()); | |||
.set_dtype(1, dtype::Float16()) | |||
.set_dtype(2, dtype::Float16()); | |||
ConvolutionForward::Param param; | |||
param.stride_h = SH; | |||
param.stride_w = SW; | |||
param.pad_h = PH; | |||
param.pad_w = PW; | |||
if (fp16io_c32) { | |||
param.compute_mode = ConvolutionForward::Param::ComputeMode::FLOAT32; | |||
param.compute_mode = | |||
ConvolutionForward::Param::ComputeMode::FLOAT32; | |||
} | |||
benchmarker.set_param(param); | |||
std::unique_ptr<OprProxy<ConvolutionForward>> proxy{new OprProxy<ConvolutionForward>{true}}; | |||
std::unique_ptr<OprProxy<ConvolutionForward>> proxy{ | |||
new OprProxy<ConvolutionForward>{true}}; | |||
benchmarker.set_proxy(proxy); | |||
size_t OH = (IH - FH + 2 * PH) / SH + 1; | |||
size_t OW = (IW - FW + 2 * PW) / SW + 1; | |||
auto time = benchmarker.execs({ | |||
{N, IC, IH, IW}, {OC, IC, FH, FW}, {N, OC, OH, OW}}); | |||
auto time = benchmarker.execs( | |||
{{N, IC, IH, IW}, {OC, IC, FH, FW}, {N, OC, OH, OW}}); | |||
time /= 1000.0 * 10.0; | |||
auto flo = (double) N * OC * IC * OH * OW * FH * FW * 2; | |||
auto flo = (double)N * OC * IC * OH * OW * FH * FW * 2; | |||
auto flops = flo / time / 1e12; | |||
printf("comp_type %s: ", fp16io_c32 ? "32" : "16"); | |||
printf("%.3fG FLO, flops %.3fTFLOPS\n", flo/1e9, flops); | |||
printf("%.3fG FLO, flops %.3fTFLOPS\n", flo / 1e9, flops); | |||
}; | |||
run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, false); | |||
run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, true); | |||
@@ -415,7 +424,8 @@ TEST_F(CUDA, CONV_FWD_BENCHMARK) { | |||
TEST_F(CUDA, CONVOLUTION_FWD_BENCHMARK) { | |||
CUBenchmarker<ConvolutionForward> bench{handle_cuda()}; | |||
std::unique_ptr<OprProxy<ConvolutionForward>> proxy{new OprProxy<ConvolutionForward>{true}}; | |||
std::unique_ptr<OprProxy<ConvolutionForward>> proxy{ | |||
new OprProxy<ConvolutionForward>{true}}; | |||
size_t RUNS = 10; | |||
bench.set_proxy(proxy).set_times(RUNS); | |||
@@ -429,7 +439,7 @@ TEST_F(CUDA, CONVOLUTION_FWD_BENCHMARK) { | |||
param.pad_h = param.pad_w = PH; | |||
param.compute_mode = param::Convolution::ComputeMode::DEFAULT; | |||
bench.set_param(param); | |||
bench.proxy()->target_algo_info.reset(); | |||
bench.proxy()->target_execution_policy.algo.reset(); | |||
TensorLayout src{{N, IC, IH, IW}, dtype::Float32()}, | |||
filter{{OC, IC, FH, FH}, dtype::Float32()}; | |||
TensorLayout dst; | |||
@@ -440,13 +450,13 @@ TEST_F(CUDA, CONVOLUTION_FWD_BENCHMARK) { | |||
} | |||
auto time_ms_fp32 = bench.execl({src, filter, dst}) / RUNS; | |||
src.dtype = filter.dtype = dst.dtype = dtype::Float16(); | |||
bench.proxy()->target_algo_info.reset(); | |||
bench.proxy()->target_execution_policy.algo.reset(); | |||
bench.set_dtype(0, dtype::Float16()) | |||
.set_dtype(1, dtype::Float16()) | |||
.set_dtype(2, dtype::Float16()); | |||
auto time_ms_true_fp16 = bench.execl({src, filter, dst}) / RUNS; | |||
param.compute_mode = param::Convolution::ComputeMode::FLOAT32; | |||
bench.proxy()->target_algo_info.reset(); | |||
bench.proxy()->target_execution_policy.algo.reset(); | |||
bench.set_param(param); | |||
auto time_ms_pseudo_fp16 = bench.execl({src, filter, dst}) / RUNS; | |||
float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH; | |||
@@ -500,7 +510,7 @@ TEST_F(CUDA, CONVOLUTION_BWD_DATA_BENCHMARK) { | |||
param.pad_h = param.pad_w = PH; | |||
param.compute_mode = param::Convolution::ComputeMode::DEFAULT; | |||
bench.set_param(param); | |||
bench.proxy()->target_algo_info.reset(); | |||
bench.proxy()->target_execution_policy.algo.reset(); | |||
TensorLayout src{{N, IC, IH, IW}, dtype::Float32()}, | |||
filter{{OC, IC, FH, FH}, dtype::Float32()}; | |||
TensorLayout dst; | |||
@@ -511,13 +521,13 @@ TEST_F(CUDA, CONVOLUTION_BWD_DATA_BENCHMARK) { | |||
} | |||
auto time_ms_fp32 = bench.execl({filter, dst, src}) / RUNS; | |||
src.dtype = filter.dtype = dst.dtype = dtype::Float16(); | |||
bench.proxy()->target_algo_info.reset(); | |||
bench.proxy()->target_execution_policy.algo.reset(); | |||
bench.set_dtype(0, dtype::Float16()) | |||
.set_dtype(1, dtype::Float16()) | |||
.set_dtype(2, dtype::Float16()); | |||
auto time_ms_true_fp16 = bench.execl({filter, dst, src}) / RUNS; | |||
param.compute_mode = param::Convolution::ComputeMode::FLOAT32; | |||
bench.proxy()->target_algo_info.reset(); | |||
bench.proxy()->target_execution_policy.algo.reset(); | |||
bench.set_param(param); | |||
auto time_ms_pseudo_fp16 = bench.execl({filter, dst, src}) / RUNS; | |||
float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH; | |||
@@ -554,6 +564,62 @@ TEST_F(CUDA, CONVOLUTION_BWD_DATA_BENCHMARK) { | |||
run(32, 64, 64, 56, 56, 1, 1, 0); | |||
} | |||
TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_BF16) { | |||
CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()}; | |||
std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{ | |||
new OprProxy<ConvolutionBackwardData>{true}}; | |||
size_t RUNS = 10; | |||
bench.set_proxy(proxy).set_times(RUNS); | |||
auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, | |||
size_t FH, size_t SH, size_t PH) { | |||
bench.set_dtype(0, dtype::BFloat16()) | |||
.set_dtype(1, dtype::BFloat16()) | |||
.set_dtype(2, dtype::BFloat16()); | |||
param::Convolution param; | |||
param.stride_h = param.stride_w = SH; | |||
param.pad_h = param.pad_w = PH; | |||
param.compute_mode = param::Convolution::ComputeMode::DEFAULT; | |||
bench.set_param(param); | |||
bench.proxy()->target_execution_policy = {}; | |||
TensorLayout src{{N, IC, IH, IW}, dtype::BFloat16()}, | |||
filter{{OC, IC, FH, FH}, dtype::BFloat16()}; | |||
TensorLayout dst; | |||
{ | |||
auto&& opr = handle_cuda()->create_operator<Convolution>(); | |||
opr->param() = param; | |||
opr->deduce_layout(src, filter, dst); | |||
} | |||
auto used = bench.execl({filter, dst, src}) / RUNS; | |||
float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH; | |||
printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(), | |||
filter.to_string().c_str(), dst.to_string().c_str()); | |||
printf("time_fp32=%.2fms, flops=%.3fTFLOPS\n", used, | |||
(flo / (used * 1e9))); | |||
}; | |||
run(32, 64, 3, 224, 224, 7, 2, 3); | |||
run(32, 128, 128, 28, 28, 3, 1, 1); | |||
run(32, 256, 256, 14, 14, 3, 1, 1); | |||
run(32, 512, 512, 7, 7, 3, 1, 1); | |||
run(32, 64, 64, 56, 56, 3, 1, 1); | |||
run(32, 512, 256, 56, 56, 1, 2, 0); | |||
run(32, 1024, 512, 28, 28, 1, 2, 0); | |||
run(32, 2048, 1024, 14, 14, 1, 2, 0); | |||
run(32, 512, 128, 28, 28, 1, 1, 0); | |||
run(32, 128, 512, 28, 28, 1, 1, 0); | |||
run(32, 1024, 256, 14, 14, 1, 1, 0); | |||
run(32, 256, 1024, 14, 14, 1, 1, 0); | |||
run(32, 2048, 512, 7, 7, 1, 1, 0); | |||
run(32, 512, 2048, 7, 7, 1, 1, 0); | |||
run(32, 256, 64, 56, 56, 1, 1, 0); | |||
run(32, 64, 256, 56, 56, 1, 1, 0); | |||
run(32, 128, 256, 56, 56, 1, 2, 0); | |||
run(32, 256, 512, 28, 28, 1, 2, 0); | |||
run(32, 512, 1024, 14, 14, 1, 2, 0); | |||
run(32, 64, 64, 56, 56, 1, 1, 0); | |||
} | |||
TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) { | |||
CUBenchmarker<ConvolutionBackwardFilter> bench{handle_cuda()}; | |||
std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{ | |||
@@ -571,7 +637,7 @@ TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) { | |||
param.pad_h = param.pad_w = PH; | |||
param.compute_mode = param::Convolution::ComputeMode::DEFAULT; | |||
bench.set_param(param); | |||
bench.proxy()->target_algo_info.reset(); | |||
bench.proxy()->target_execution_policy.algo.reset(); | |||
TensorLayout src{{N, IC, IH, IW}, dtype::Float32()}, | |||
filter{{OC, IC, FH, FH}, dtype::Float32()}; | |||
TensorLayout dst; | |||
@@ -582,13 +648,13 @@ TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) { | |||
} | |||
auto time_ms_fp32 = bench.execl({src, dst, filter}) / RUNS; | |||
src.dtype = filter.dtype = dst.dtype = dtype::Float16(); | |||
bench.proxy()->target_algo_info.reset(); | |||
bench.proxy()->target_execution_policy.algo.reset(); | |||
bench.set_dtype(0, dtype::Float16()) | |||
.set_dtype(1, dtype::Float16()) | |||
.set_dtype(2, dtype::Float16()); | |||
auto time_ms_true_fp16 = bench.execl({src, dst, filter}) / RUNS; | |||
param.compute_mode = param::Convolution::ComputeMode::FLOAT32; | |||
bench.proxy()->target_algo_info.reset(); | |||
bench.proxy()->target_execution_policy.algo.reset(); | |||
bench.set_param(param); | |||
auto time_ms_pseudo_fp16 = bench.execl({src, dst, filter}) / RUNS; | |||
float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH; | |||
@@ -630,8 +696,7 @@ TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) { | |||
#undef V | |||
#undef V1 | |||
} // namespace test | |||
} // namespace megdnn | |||
} // namespace test | |||
} // namespace megdnn | |||
// vim: syntax=cpp.doxygen |
@@ -778,7 +778,7 @@ TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_BWD_FILTER) { | |||
.set_dtype(2, dtype::Float32()) | |||
.set_rng(0, &rng) | |||
.set_rng(1, &rng); | |||
bencher.proxy()->target_algo_info.reset(); | |||
bencher.proxy()->target_execution_policy.algo.reset(); | |||
auto time_in_ms = bencher.execs({src, diff, grad}) / RUNS; | |||
printf("src=%s, diff=%s, grad=%s, float32: %.2fms " | |||
@@ -856,7 +856,7 @@ TEST_F(CUDA, BENCHMARK_GROUP_LOCAL_SHARE_FORWARD) { | |||
.set_dtype(2, dtype::Float32()) | |||
.set_rng(0, &rng) | |||
.set_rng(1, &rng); | |||
bencher.proxy()->target_algo_info.reset(); | |||
bencher.proxy()->target_execution_policy.algo.reset(); | |||
auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS; | |||
; | |||
@@ -915,7 +915,7 @@ TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_BWD_DATA) { | |||
.set_dtype(2, dtype::Float32()) | |||
.set_rng(0, &rng) | |||
.set_rng(1, &rng); | |||
bencher.proxy()->target_algo_info.reset(); | |||
bencher.proxy()->target_execution_policy.algo.reset(); | |||
auto time_in_ms = bencher.execs({filter, diff, grad}) / RUNS; | |||
printf("filter=%s, diff=%s, grad=%s, float32: %.2fms " | |||
@@ -1002,11 +1002,11 @@ TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_FORWARD_BOTTLENECK) { | |||
.set_dtype(2, dtype::Float32()) | |||
.set_rng(0, &rng) | |||
.set_rng(1, &rng); | |||
bencher.proxy()->target_algo_info.reset(); | |||
bencher.proxy()->target_execution_policy.algo.reset(); | |||
auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS; | |||
bencher_conv.set_param(conv_param); | |||
bencher_conv.proxy()->target_algo_info.reset(); | |||
bencher_conv.proxy()->target_execution_policy.algo.reset(); | |||
auto time_in_ms_conv = | |||
bencher_conv.execs({src, {oc, ic, f, f}, {}}) / RUNS; | |||
@@ -1094,11 +1094,11 @@ TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_FORWARD_FROM_RESEARCH) { | |||
.set_dtype(2, dtype::Float32()) | |||
.set_rng(0, &rng) | |||
.set_rng(1, &rng); | |||
bencher.proxy()->target_algo_info.reset(); | |||
bencher.proxy()->target_execution_policy.algo.reset(); | |||
auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS; | |||
bencher_conv.set_param(conv_param); | |||
bencher_conv.proxy()->target_algo_info.reset(); | |||
bencher_conv.proxy()->target_execution_policy.algo.reset(); | |||
auto time_in_ms_conv = | |||
bencher_conv.execs({src, {oc, ic, f, f}, {}}) / RUNS; | |||
@@ -1177,11 +1177,11 @@ TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_FORWARD) { | |||
.set_dtype(2, dtype::Float32()) | |||
.set_rng(0, &rng) | |||
.set_rng(1, &rng); | |||
bencher.proxy()->target_algo_info.reset(); | |||
bencher.proxy()->target_execution_policy.algo.reset(); | |||
auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS; | |||
bencher_conv.set_param(conv_param); | |||
bencher_conv.proxy()->target_algo_info.reset(); | |||
bencher_conv.proxy()->target_execution_policy.algo.reset(); | |||
auto time_in_ms_conv = | |||
bencher_conv.execs({src, {oc, ic, f, f}, {}}) / RUNS; | |||
@@ -10,6 +10,7 @@ | |||
*/ | |||
#include <gtest/gtest.h> | |||
#include "megdnn/basic_types.h" | |||
#include "src/common/utils.h" | |||
#include "test/common/random_state.h" | |||
@@ -21,9 +22,29 @@ class ResetSeedListener : public ::testing::EmptyTestEventListener { | |||
} | |||
}; | |||
megdnn::LogLevel min_log_level; | |||
void set_log_level() { | |||
megdnn::LogLevel level = megdnn::LogLevel::INFO; | |||
auto setting = std::getenv("MEGDNN_LOG_LEVEL"); | |||
if (setting) { | |||
if (!strcmp(setting, "INFO")) { | |||
level = megdnn::LogLevel::INFO; | |||
} else if (!strcmp(setting, "DEBUG")) { | |||
level = megdnn::LogLevel::DEBUG; | |||
} else if (!strcmp(setting, "WARN")) { | |||
level = megdnn::LogLevel::WARN; | |||
} else { | |||
megdnn_assert(!strcmp(setting, "ERROR")); | |||
level = megdnn::LogLevel::ERROR; | |||
} | |||
} | |||
min_log_level = level; | |||
} | |||
void log_handler(megdnn::LogLevel level, const char* file, const char* func, | |||
int line, const char* fmt, va_list ap) { | |||
if (level < megdnn::LogLevel::ERROR) { | |||
if (level < min_log_level) { | |||
return; | |||
} | |||
char msg[1024]; | |||
@@ -39,6 +60,7 @@ void log_handler(megdnn::LogLevel level, const char* file, const char* func, | |||
extern "C" int gtest_main(int argc, char** argv) { | |||
::megdnn::set_log_handler(log_handler); | |||
set_log_level(); | |||
ResetSeedListener listener; | |||
auto&& listeners = ::testing::UnitTest::GetInstance()->listeners(); | |||
::testing::InitGoogleTest(&argc, argv); | |||
@@ -450,6 +450,7 @@ TEST_F(X86, BENCHMARK_CONVOLUTION_I8x8x32_MKLDNN) { | |||
} | |||
} | |||
#endif | |||
#endif | |||
} // namespace test | |||
@@ -27,6 +27,7 @@ | |||
#include "megbrain/gopt/inference.h" | |||
#include "megbrain/test/helper.h" | |||
#include "megdnn/oprs/base.h" | |||
#include <atomic> | |||
#include <chrono> | |||
@@ -1924,19 +1925,19 @@ TEST(TestGraph, NaiveRecord2NCHW44) { | |||
namespace { | |||
template <typename DnnOp, typename... Args> | |||
typename DnnOp::AlgorithmInfo try_find_any_weight_preprocess_algo( | |||
typename megdnn::ExecutionPolicy try_find_any_weight_preprocess_algo( | |||
DnnOp* dnn_op, const char* mgb_info, Maybe<bool>& found, | |||
Args&& ...args) { | |||
if (found.valid()) { | |||
if (found.val()) { | |||
return dnn_op->execution_policy().algo; | |||
return dnn_op->execution_policy(); | |||
} else { | |||
return {}; | |||
} | |||
} | |||
for (auto&& algo : dnn_op->get_all_algorithms_info( | |||
std::forward<Args>(args)...)) { | |||
dnn_op->execution_policy().algo = algo; | |||
dnn_op->execution_policy().algo = algo.desc; | |||
auto layouts = dnn_op->deduce_preprocessed_filter_layout( | |||
std::forward<Args>(args)...); | |||
if (layouts.empty()) continue; | |||
@@ -1949,7 +1950,7 @@ typename DnnOp::AlgorithmInfo try_find_any_weight_preprocess_algo( | |||
} | |||
if (valid) { | |||
found.emplace(true); | |||
return algo; | |||
return {algo.desc, {}}; | |||
} | |||
} | |||
found.emplace(false); | |||
@@ -1958,19 +1959,19 @@ typename DnnOp::AlgorithmInfo try_find_any_weight_preprocess_algo( | |||
} | |||
template <typename DnnOp, typename... Args> | |||
typename DnnOp::AlgorithmInfo try_find_any_bias_preprocess_algo( | |||
typename megdnn::ExecutionPolicy try_find_any_bias_preprocess_algo( | |||
DnnOp* dnn_op, const char* mgb_info, Maybe<bool>& found, | |||
Args&& ...args) { | |||
if (found.valid()) { | |||
if (found.val()) { | |||
return dnn_op->execution_policy().algo; | |||
return dnn_op->execution_policy(); | |||
} else { | |||
return {}; | |||
} | |||
} | |||
for (auto&& algo : dnn_op->get_all_algorithms_info( | |||
std::forward<Args>(args)...)) { | |||
dnn_op->execution_policy().algo = algo; | |||
dnn_op->execution_policy().algo = algo.desc; | |||
auto layouts = dnn_op->deduce_preprocessed_filter_layout( | |||
std::forward<Args>(args)...); | |||
if (layouts.size() <= 1) | |||
@@ -1981,7 +1982,7 @@ typename DnnOp::AlgorithmInfo try_find_any_bias_preprocess_algo( | |||
} | |||
if (valid) { | |||
found.emplace(true); | |||
return algo; | |||
return {algo.desc, {}}; | |||
} | |||
} | |||
found.emplace(false); | |||
@@ -11,6 +11,7 @@ | |||
*/ | |||
#include "megbrain/opr/search_policy/algo_chooser.h" | |||
#include "megbrain/opr/internal/megdnn_opr_wrapper.h" | |||
#include "megbrain/opr/search_policy/algo_chooser_helper.h" | |||
#include "megbrain/opr/search_policy/profiler.h" | |||
@@ -21,6 +22,7 @@ | |||
//! TODO: here has to be know some megdnn::opr when there is produced midout.h | |||
//! fix it if there is another graceful way. | |||
#include "megdnn/oprs.h" | |||
#include "megdnn/oprs/base.h" | |||
#include "midout.h" | |||
MIDOUT_DECL(megbrain_opr_algo_chooser) | |||
#define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_algo_chooser, __VA_ARGS__) { | |||
@@ -29,6 +31,8 @@ MIDOUT_DECL(megbrain_opr_algo_chooser) | |||
MIDOUT_END(); | |||
using mgb::opr::intl::WorkspaceLimitGetter; | |||
using namespace megdnn; | |||
using namespace mgb; | |||
#define APPLY(statement, ...) \ | |||
mgb::apply([&](const auto&... args) { return statement; }, \ | |||
@@ -37,7 +41,7 @@ using mgb::opr::intl::WorkspaceLimitGetter; | |||
// timeout delta to be added with fastest known algorithm for new algos | |||
constexpr double TIMEOUT_TOLERANCE = 2; | |||
#define CACHE_KEY_VERSION "v3" | |||
#define CACHE_KEY_VERSION "v4" | |||
namespace { | |||
template <typename Opr> | |||
@@ -48,44 +52,191 @@ std::string profile_name(Opr* opr) { | |||
ret.append(opr->get_algorithm_set_name()); | |||
return ret; | |||
} | |||
template <typename Opr> | |||
std::string format_fixlayouts( | |||
const typename opr::AlgoChooser<Opr>::FixedTensorLayouts& layouts, | |||
size_t arity_in, size_t arity_out) { | |||
std::string ret; | |||
ret.append(": tensor layouts("); | |||
for (size_t i = 0; i < arity_in; ++i) { | |||
if (i) { | |||
ret.append(", "); | |||
} | |||
ret.append(layouts[i].to_string() + " "); | |||
ret.append(layouts[i].dtype.name()); | |||
} | |||
ret.append(") -> ("); | |||
for (size_t i = 0; i < arity_out; ++i) { | |||
if (i) { | |||
ret.append(", "); | |||
} | |||
ret.append(layouts[i + arity_in].to_string() + " "); | |||
ret.append(layouts[i + arity_in].dtype.name()); | |||
} | |||
return ret; | |||
} | |||
///////////////// OprTypeTrait ///////////////////////////// | |||
template <megdnn::Algorithm::OprType> | |||
struct OprFromOprTypeTrait; | |||
template <typename Opr> | |||
struct OprTypeFromOprTrait; | |||
#define cb(_opr_type, _opr) \ | |||
template <> \ | |||
struct OprFromOprTypeTrait<megdnn::Algorithm::OprType::_opr_type> { \ | |||
using Opr = megdnn::_opr; \ | |||
}; \ | |||
template <> \ | |||
struct OprTypeFromOprTrait<megdnn::_opr> { \ | |||
constexpr static megdnn::Algorithm::OprType opr_type = \ | |||
megdnn::Algorithm::OprType::_opr_type; \ | |||
} | |||
cb(MATRIX_MUL_FORWARD, MatrixMulForward); | |||
cb(BATCHED_MATRIX_MUL_FORWARD, BatchedMatrixMulForward); | |||
cb(CONVOLUTION_FORWARD, ConvolutionForward); | |||
cb(CONVOLUTION_BACKWARD_DATA, ConvolutionBackwardData); | |||
cb(CONVOLUTION_BACKWARD_FILTER, ConvolutionBackwardFilter); | |||
cb(CONVOLUTION3D_FORWARD, Convolution3DForward); | |||
cb(CONVOLUTION3D_BACKWARD_DATA, Convolution3DBackwardData); | |||
cb(CONVOLUTION3D_BACKWARD_FILTER, Convolution3DBackwardFilter); | |||
cb(LOCAL_SHARE_FORWARD, LocalShareForward); | |||
cb(LOCAL_SHARE_BACKWARD_DATA, LocalShareBackwardData); | |||
cb(LOCAL_SHARE_BACKWARD_FILTER, LocalShareBackwardFilter); | |||
cb(DEFORMABLE_CONV_FORWARD, DeformableConvForward); | |||
cb(DEFORMABLE_CONV_BACKWARD_DATA, DeformableConvBackwardData); | |||
cb(DEFORMABLE_CONV_BACKWARD_FILTER, DeformableConvBackwardFilter); | |||
cb(BATCH_CONV_FORWARD, BatchConvBiasForward); | |||
cb(CONVBIAS_FORWARD, ConvBiasForward); | |||
#undef cb | |||
// clang-format off | |||
#define FOREACH_OPR_TYPE_WITH_STMT(cb, stmt) \ | |||
cb(MATRIX_MUL_FORWARD, stmt) \ | |||
cb(BATCHED_MATRIX_MUL_FORWARD, stmt) \ | |||
cb(CONVOLUTION_FORWARD, stmt) \ | |||
cb(CONVOLUTION_BACKWARD_DATA, stmt) \ | |||
cb(CONVOLUTION_BACKWARD_FILTER, stmt) \ | |||
cb(CONVOLUTION3D_FORWARD, stmt) \ | |||
cb(CONVOLUTION3D_BACKWARD_DATA, stmt) \ | |||
cb(CONVOLUTION3D_BACKWARD_FILTER, stmt) \ | |||
cb(LOCAL_SHARE_FORWARD, stmt) \ | |||
cb(LOCAL_SHARE_BACKWARD_DATA, stmt) \ | |||
cb(LOCAL_SHARE_BACKWARD_FILTER, stmt) \ | |||
cb(DEFORMABLE_CONV_FORWARD, stmt) \ | |||
cb(DEFORMABLE_CONV_BACKWARD_DATA, stmt) \ | |||
cb(DEFORMABLE_CONV_BACKWARD_FILTER, stmt) \ | |||
cb(BATCH_CONV_FORWARD, stmt) \ | |||
cb(CONVBIAS_FORWARD, stmt) | |||
// clang-format on | |||
#define _OPR_TYPE_CASE(_opr_type, _stmt) \ | |||
case Algorithm::OprType::_opr_type: { \ | |||
using _Opr = typename OprFromOprTypeTrait< \ | |||
Algorithm::OprType::_opr_type>::Opr; \ | |||
_stmt; \ | |||
break; \ | |||
} | |||
#define FOREACH_OPR_TYPE_DISPATCH(_search_items, _stmt) \ | |||
for (size_t _item_idx = 0; _item_idx < _search_items.size(); \ | |||
_item_idx++) { \ | |||
auto&& _item = _search_items[_item_idx]; \ | |||
switch (_item.opr_type) { \ | |||
FOREACH_OPR_TYPE_WITH_STMT(_OPR_TYPE_CASE, _stmt) \ | |||
default: \ | |||
mgb_throw(MegBrainError, "unknown opr_type"); \ | |||
} \ | |||
} | |||
template <typename Opr> | |||
TensorLayoutArray to_layout_array( | |||
const typename opr::AlgoChooser<Opr>::FixedTensorLayouts& layouts) { | |||
TensorLayoutArray ret; | |||
for (auto&& layout : layouts) { | |||
ret.push_back(layout); | |||
} | |||
return ret; | |||
} | |||
template <typename Opr> | |||
typename opr::AlgoChooser<Opr>::FixedTensorLayouts to_fixed_layouts( | |||
const TensorLayoutArray& layouts) { | |||
typename opr::AlgoChooser<Opr>::FixedTensorLayouts ret; | |||
mgb_assert(ret.size() == layouts.size()); | |||
size_t idx = 0; | |||
for (auto&& layout : layouts) { | |||
ret[idx++] = layout; | |||
} | |||
return ret; | |||
} | |||
} // namespace | |||
namespace mgb { | |||
namespace opr { | |||
template <typename Opr> | |||
AlgoChooserProfileCache::Result AlgoChooser<Opr>::get_profile_result( | |||
ExeContext& ctx, bool enable_update) { | |||
AlgoChooserProfileCache cache(ctx.mgb_opr()->comp_node(), | |||
profile_name(ctx.megdnn_opr()).c_str()); | |||
TensorLayoutArray origin_layouts = ctx.layouts(); | |||
typename Opr::Param origin_param = ctx.mgb_opr()->param(); | |||
AlgoChooserProfileCache::Key cache_key{origin_layouts.data(), | |||
origin_layouts.size(), &origin_param, | |||
sizeof(origin_param)}; | |||
{ | |||
auto&& rst = cache.get(cache_key); | |||
if (rst.valid()) | |||
return rst.val(); | |||
std::vector<megdnn::Algorithm::SearchItem> | |||
AlgoChooser<Opr>::flatten_search_space(const ExeContext& ctx) { | |||
std::vector<megdnn::Algorithm::SearchItem> ret; | |||
for (auto algo_info : ctx.get_all_candidates()) { | |||
megdnn::Algorithm* algo = ctx.get_algorithm_from_desc(algo_info.desc); | |||
mgb_assert(algo, "Unknown algo description"); | |||
std::vector<megdnn::Algorithm::SearchItem>&& sub_items = | |||
algo->get_subopr_list(to_layout_array<Opr>(ctx.layouts()), | |||
ctx.megdnn_opr()); | |||
FOREACH_OPR_TYPE_DISPATCH(sub_items, { | |||
auto&& megdnn_opr = intl::create_megdnn_opr<_Opr>(ctx.comp_node()); | |||
megdnn_opr->param() = | |||
Algorithm::deserialize_read_pod<typename _Opr::Param>( | |||
_item.param); | |||
typename AlgoChooser<_Opr>::ExeContext sub_ctx( | |||
to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(), | |||
_item.param, ctx.mgb_opr(), ctx.comp_node(), | |||
ctx.execution_policy(), ctx.allow_weight_preprocess()); | |||
auto space = AlgoChooser<_Opr>::flatten_search_space(sub_ctx); | |||
ret.insert(ret.end(), space.begin(), space.end()); | |||
}); | |||
} | |||
ret.push_back({OprTypeFromOprTrait<Opr>::opr_type, ctx.param(), | |||
to_layout_array<Opr>(ctx.layouts())}); | |||
return ret; | |||
} | |||
template <typename Opr> | |||
void AlgoChooser<Opr>::profile(ExeContext& ctx, bool require_reproducible) { | |||
if (ctx.get_profile_result_from_cache(require_reproducible).valid()) | |||
return; | |||
AlgoChooserProfileCache::Result prof_rst; | |||
if (!enable_update) | |||
return prof_rst; | |||
std::string str_on_inp_shape = ssprintf( | |||
"on input layouts (%s, %s)", ctx.layouts()[0].to_string().c_str(), | |||
ctx.layouts()[1].to_string().c_str()); | |||
double cur_timeout = 0; | |||
auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit( | |||
ctx.owner_graph(), ctx.comp_node(), | |||
ctx.execution_policy().workspace_limit); | |||
RealTimer timer; | |||
for (auto algo : ctx.get_all_candidates_with_workspace_limit()) { | |||
for (auto algo : ctx.get_all_candidates()) { | |||
Maybe<AlgoChooserProfileCache::ResultEntry> cur_rst; | |||
std::string msg = ssprintf("profiling %s algorithm %s %s", | |||
ctx.mgb_opr()->dyn_typeinfo()->name, | |||
algo.name.c_str(), str_on_inp_shape.c_str()); | |||
ImplExecutionPolicy policy; | |||
policy.algo = algo.desc; | |||
ctx.construct_execution_policy_from_cache(require_reproducible, policy); | |||
if (ctx.get_workspace_size_bytes(policy) >= workspace_limit) | |||
continue; | |||
timer.reset(); | |||
MGB_TRY { cur_rst = ctx.profile_single_algo(algo, cur_timeout); } | |||
MGB_TRY { cur_rst = ctx.profile_single_algo(policy, cur_timeout); } | |||
MGB_CATCH(std::exception & exc, { | |||
mgb_log_warn("caught exception during %s: %s", msg.c_str(), | |||
exc.what()); | |||
@@ -114,120 +265,100 @@ AlgoChooserProfileCache::Result AlgoChooser<Opr>::get_profile_result( | |||
mgb_assert(!prof_rst.empty(), "no usable convolution algorithm %s", | |||
str_on_inp_shape.c_str()); | |||
FixedTensorLayouts origin_layouts = ctx.layouts(); | |||
typename Opr::Param origin_param = ctx.megdnn_opr()->param(); | |||
AlgoChooserProfileCache::Key cache_key{origin_layouts.data(), | |||
origin_layouts.size(), &origin_param, | |||
sizeof(origin_param)}; | |||
AlgoChooserProfileCache cache(ctx.comp_node(), | |||
profile_name(ctx.megdnn_opr()).c_str()); | |||
cache.put(cache_key, prof_rst); | |||
return prof_rst; | |||
} | |||
template <typename Opr> | |||
typename AlgoChooser<Opr>::ImplAlgo AlgoChooser<Opr>::choose_by_profile( | |||
ExeContext& ctx, bool require_reproducible, bool enable_update) { | |||
typename AlgoChooser<Opr>::ImplExecutionPolicy | |||
AlgoChooser<Opr>::choose_by_profile(ExeContext& ctx, bool require_reproducible, | |||
bool enable_update) { | |||
MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("AlgoChooser::choose_by_profile"))) | |||
auto opr = ctx.mgb_opr(); | |||
if (opr->owner_graph()->options().no_profiling_on_shape_change) { | |||
auto algo = ctx.megdnn_opr()->execution_policy().algo; | |||
if (algo.valid()) | |||
return algo; | |||
if (ctx.owner_graph()->options().no_profiling_on_shape_change) { | |||
auto policy = ctx.megdnn_opr()->execution_policy(); | |||
if (policy.algo.valid()) | |||
return policy; | |||
} | |||
std::unordered_map<std::string, ImplAlgo> algo_map; | |||
for (auto i : ctx.get_all_candidates()) { | |||
auto ins = algo_map.emplace(i.name.c_str(), i); | |||
mgb_assert(ins.second, "duplicated algo name: %s", i.name.c_str()); | |||
if (enable_update) { | |||
auto&& search_items = flatten_search_space(ctx); | |||
FOREACH_OPR_TYPE_DISPATCH(search_items, { | |||
auto&& megdnn_opr = intl::create_megdnn_opr<_Opr>(ctx.comp_node()); | |||
megdnn_opr->param() = | |||
Algorithm::deserialize_read_pod<typename _Opr::Param>( | |||
_item.param); | |||
typename AlgoChooser<_Opr>::ExeContext sub_ctx( | |||
to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(), | |||
_item.param, ctx.mgb_opr(), ctx.comp_node(), | |||
ctx.execution_policy(), ctx.allow_weight_preprocess()); | |||
AlgoChooser<_Opr>::profile(sub_ctx, require_reproducible); | |||
}); | |||
} | |||
auto&& prof = get_profile_result(ctx, enable_update); | |||
if (prof.empty()) | |||
return {}; | |||
for (auto&& i : prof) { | |||
if ((!require_reproducible || i.reproducible)) { | |||
auto iter = algo_map.find(i.algo); | |||
mgb_assert(iter != algo_map.end(), | |||
"algorithm %s exists in " | |||
"profiling result but not in algo_map; please " | |||
"report this " | |||
"bug; opr: %s{%s}, shapes: %s %s %s", | |||
i.algo.c_str(), | |||
ctx.mgb_opr()->cname(), | |||
ctx.mgb_opr()->dyn_typeinfo()->name, | |||
ctx.layouts()[0].TensorShape::to_string().c_str(), | |||
ctx.layouts()[1].TensorShape::to_string().c_str(), | |||
ctx.layouts()[2].TensorShape::to_string().c_str()); | |||
return iter->second; | |||
} | |||
} | |||
mgb_log_error( | |||
"Workspace requirement (%zu) could not be satisfied. Abort now " | |||
"to " | |||
"avoid further problems", | |||
WorkspaceLimitGetter::get_workspace_limit( | |||
opr->owner_graph(), opr->comp_node(), | |||
opr->execution_policy().workspace_limit)); | |||
mgb_trap(); | |||
typename AlgoChooser<Opr>::ImplExecutionPolicy policy; | |||
ctx.construct_execution_policy_from_cache(require_reproducible, policy); | |||
return policy; | |||
MIDOUT_E | |||
} | |||
template <typename Opr> | |||
size_t AlgoChooser<Opr>::setup_algo(const TensorLayoutArray& layouts, | |||
size_t AlgoChooser<Opr>::setup_algo(const FixedTensorLayouts& layouts, | |||
Opr* megdnn_opr, const MGBOpr* mgb_opr, | |||
bool allow_weight_preprocess) { | |||
if (WorkspaceLimitGetter::is_prealloc_run(mgb_opr->owner_graph())) { | |||
return 0; | |||
} | |||
ImplAlgo algo = {}; | |||
ExeContext ctx(layouts, megdnn_opr, mgb_opr, allow_weight_preprocess); | |||
std::string param_str; | |||
Algorithm::serialize_write_pod(megdnn_opr->param(), param_str); | |||
ExeContext ctx(layouts, megdnn_opr, param_str, mgb_opr, | |||
mgb_opr->comp_node(), mgb_opr->execution_policy(), | |||
allow_weight_preprocess); | |||
ImplExecutionPolicy policy; | |||
if (auto algo_choose_hook = mgb_opr->algo_chooser()) { | |||
algo = algo_choose_hook(mgb_opr); | |||
policy = algo_choose_hook(mgb_opr); | |||
} | |||
if (!algo.valid()) { | |||
algo = get_algo(ctx); | |||
if (!policy.algo.valid()) { | |||
policy = get_policy(ctx); | |||
} | |||
size_t workspace = ctx.get_workspace_size_bytes(algo); | |||
size_t workspace = ctx.get_workspace_size_bytes(policy); | |||
std::string ret; | |||
ret.append(mgb_opr->dyn_typeinfo()->name); | |||
ret.append(": tensor layouts("); | |||
for (size_t i = 0; i < arity_in; ++i) { | |||
if (i) { | |||
ret.append(", "); | |||
} | |||
ret.append(layouts[i].to_string() + " "); | |||
ret.append(layouts[i].dtype.name()); | |||
} | |||
ret.append(") -> ("); | |||
for (size_t i = 0; i < arity_out; ++i) { | |||
if (i) { | |||
ret.append(", "); | |||
} | |||
ret.append(layouts[i + arity_in].to_string() + " "); | |||
ret.append(layouts[i + arity_in].dtype.name()); | |||
} | |||
ret.append("): algo=" + algo.name); | |||
ret += format_fixlayouts<Opr>(layouts, arity_in, arity_out); | |||
Algorithm* palgo = megdnn_opr->get_algorithm_from_desc(policy.algo); | |||
mgb_assert(palgo, "Unknown algo description"); | |||
ret.append("): algo=" + std::string(palgo->name())); | |||
ret.append(ssprintf(" workspace=%.2fMiB reproducible=%d", | |||
workspace / (1024 * 1024.0), algo.is_reproducible)); | |||
workspace / (1024 * 1024.0), palgo->is_reproducible())); | |||
mgb_log_debug("%s", ret.c_str()); | |||
megdnn_opr->execution_policy() = {algo}; | |||
megdnn_opr->execution_policy() = policy; | |||
return workspace; | |||
} | |||
template <typename Opr> | |||
typename AlgoChooser<Opr>::ImplAlgo AlgoChooser<Opr>::get_algo( | |||
typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::get_policy( | |||
ExeContext& ctx) { | |||
using S = mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; | |||
MGB_MARK_USED_VAR(TIMEOUT_TOLERANCE); | |||
switch (ctx.mgb_opr()->execution_policy().strategy) { | |||
switch (ctx.execution_policy().strategy) { | |||
case S::HEURISTIC: | |||
return ctx.choose_by_heuristic(); | |||
case S::HEURISTIC_REPRODUCIBLE: | |||
return ctx.choose_by_heuristic(true); | |||
case S::PROFILE_HEURISTIC: { | |||
ImplAlgo algo = choose_by_profile(ctx, false, false); | |||
if (!algo.valid()) | |||
algo = ctx.choose_by_heuristic(); | |||
return algo; | |||
ImplExecutionPolicy policy = choose_by_profile(ctx, false, false); | |||
if (!policy.algo.valid()) | |||
policy = ctx.choose_by_heuristic(); | |||
return policy; | |||
} | |||
#if MGB_ENABLE_FASTRUN | |||
case S::PROFILE: | |||
@@ -241,16 +372,17 @@ typename AlgoChooser<Opr>::ImplAlgo AlgoChooser<Opr>::get_algo( | |||
} | |||
#define INST(Opr) \ | |||
template AlgoChooser<megdnn::Opr>::ImplAlgo \ | |||
AlgoChooser<megdnn::Opr>::get_algo(ExeContext& ctx); \ | |||
template AlgoChooserProfileCache::Result \ | |||
AlgoChooser<megdnn::Opr>::get_profile_result(ExeContext& ctx, \ | |||
bool enable_update); \ | |||
template AlgoChooser<megdnn::Opr>::ImplAlgo \ | |||
template AlgoChooser<megdnn::Opr>::ImplExecutionPolicy \ | |||
AlgoChooser<megdnn::Opr>::get_policy(ExeContext& ctx); \ | |||
template void AlgoChooser<megdnn::Opr>::profile( \ | |||
ExeContext& ctx, bool require_reproducible); \ | |||
template std::vector<megdnn::Algorithm::SearchItem> \ | |||
AlgoChooser<megdnn::Opr>::flatten_search_space(const ExeContext& ctx); \ | |||
template AlgoChooser<megdnn::Opr>::ImplExecutionPolicy \ | |||
AlgoChooser<megdnn::Opr>::choose_by_profile( \ | |||
ExeContext& ctx, bool require_reproducible, bool enable_update); \ | |||
template size_t AlgoChooser<megdnn::Opr>::setup_algo( \ | |||
const TensorLayoutArray& layouts, megdnn::Opr* megdnn_opr, \ | |||
const FixedTensorLayouts& layouts, megdnn::Opr* megdnn_opr, \ | |||
const MGBOpr* mgb_opr, bool allow_weight_preprocess); | |||
MGB_FOREACH_FASTRUN_OPR(INST) | |||
@@ -258,17 +390,109 @@ MGB_FOREACH_FASTRUN_OPR(INST) | |||
#undef INST | |||
//////////////////////////////// ExeContext ///////////////////////////// | |||
template <typename Opr> | |||
AlgoChooser<Opr>::ExeContext::ExeContext( | |||
const FixedTensorLayouts& layouts, Opr* megdnn_opr, | |||
const std::string& param_str, const cg::OperatorNodeBase* mgb_opr, | |||
const CompNode& cn, | |||
const megdnn::param::ExecutionPolicy& execution_policy, | |||
bool allow_weight_preprocess) | |||
: m_layouts{layouts}, | |||
m_megdnn_opr{megdnn_opr}, | |||
m_param{param_str}, | |||
m_base_mgb_opr{mgb_opr}, | |||
m_cn{cn}, | |||
m_execution_policy{execution_policy}, | |||
m_allow_weight_preprocess{allow_weight_preprocess} { | |||
mgb_assert(m_layouts.size() == layouts.size()); | |||
static_assert(std::tuple_size<FixedTensorLayouts>::value == 3 || | |||
std::tuple_size<FixedTensorLayouts>::value == 5 || | |||
std::tuple_size<FixedTensorLayouts>::value == 8, | |||
"Convolution AlgoChooser assumes arity = 3 , 5 or 8 (for " | |||
"deformable conv)"); | |||
} | |||
template <typename Opr> | |||
typename AlgoChooser<Opr>::ImplAlgo | |||
AlgoChooser<Opr>::ExeContext::get_profile_result_from_cache( | |||
bool require_reproducible) const { | |||
MIDOUT_B(Opr, | |||
midout_iv(MGB_HASH_STR( | |||
"AlgoChooser::ExeContext::get_profile_result_from_cache"))) | |||
AlgoChooserProfileCache cache(m_cn, | |||
profile_name(m_megdnn_opr).c_str()); | |||
typename Opr::Param origin_param = m_megdnn_opr->param(); | |||
AlgoChooserProfileCache::Key cache_key{m_layouts.data(), m_layouts.size(), | |||
&origin_param, sizeof(origin_param)}; | |||
auto&& rst = cache.get(cache_key); | |||
if (!rst.valid()) | |||
return {}; | |||
auto&& prof = rst.val(); | |||
std::unordered_map<std::string, ImplAlgo> algo_map; | |||
for (auto i : get_all_candidates()) { | |||
auto ins = algo_map.emplace(i.name.c_str(), i); | |||
mgb_assert(ins.second, "duplicated algo name: %s", i.name.c_str()); | |||
} | |||
if (prof.empty()) | |||
return {}; | |||
for (auto&& i : prof) { | |||
if ((!require_reproducible || i.reproducible)) { | |||
auto iter = algo_map.find(i.algo); | |||
mgb_assert(iter != algo_map.end(), | |||
"algorithm %s exists in " | |||
"profiling result but not in algo_map; please " | |||
"report this " | |||
"bug; opr: %s{%s}, layouts: %s ", | |||
i.algo.c_str(), m_base_mgb_opr->cname(), | |||
m_base_mgb_opr->dyn_typeinfo()->name, | |||
format_fixlayouts<Opr>(m_layouts, arity_in, arity_out) | |||
.c_str()); | |||
return iter->second; | |||
} | |||
} | |||
mgb_log_error( | |||
"Workspace requirement (%zu) could not be satisfied. Abort now " | |||
"to " | |||
"avoid further problems", | |||
WorkspaceLimitGetter::get_workspace_limit( | |||
m_base_mgb_opr->owner_graph(), m_cn, | |||
m_execution_policy.workspace_limit)); | |||
mgb_trap(); | |||
MIDOUT_E | |||
} | |||
template <typename Opr> | |||
typename AlgoChooser<Opr>::ImplExecutionPolicy | |||
AlgoChooser<Opr>::ExeContext::choose_by_heuristic(bool reproducible) const { | |||
auto opr = m_mgb_opr; | |||
auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit( | |||
opr->owner_graph(), opr->comp_node(), | |||
opr->execution_policy().workspace_limit); | |||
return APPLY(m_megdnn_opr->get_algorithm_info_heuristic( | |||
args..., workspace_limit, reproducible), | |||
m_layouts); | |||
owner_graph(), m_cn, m_execution_policy.workspace_limit); | |||
ImplExecutionPolicy policy; | |||
policy.algo = APPLY(m_megdnn_opr->get_algorithm_info_heuristic( | |||
args..., workspace_limit, reproducible), | |||
m_layouts).desc; | |||
Algorithm* algo = m_megdnn_opr->get_algorithm_from_desc(policy.algo); | |||
mgb_assert(algo, "Unknown algo description"); | |||
std::vector<Algorithm::SearchItem>&& sub_items = algo->get_subopr_list( | |||
to_layout_array<Opr>(m_layouts), m_megdnn_opr); | |||
FOREACH_OPR_TYPE_DISPATCH(sub_items, { | |||
auto&& megdnn_opr = intl::create_megdnn_opr<_Opr>(m_cn); | |||
megdnn_opr->param() = | |||
Algorithm::deserialize_read_pod<typename _Opr::Param>( | |||
_item.param); | |||
typename AlgoChooser<_Opr>::ExeContext sub_ctx( | |||
to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(), | |||
_item.param, m_base_mgb_opr, m_cn, m_execution_policy, | |||
m_allow_weight_preprocess); | |||
policy.sub_policy.push_back(sub_ctx.choose_by_heuristic(reproducible)); | |||
}); | |||
return policy; | |||
} | |||
template <typename Opr> | |||
@@ -279,40 +503,58 @@ AlgoChooser<Opr>::ExeContext::get_all_candidates() const { | |||
APPLY(m_megdnn_opr->get_all_algorithms_info(args...), m_layouts); | |||
bool found = false; | |||
for (size_t i = 0; i < ret.size(); ++i) { | |||
if (ret[i] == heu) { | |||
if (ret[i].desc == heu.algo) { | |||
found = true; | |||
std::swap(ret[i], ret[0]); | |||
break; | |||
} | |||
} | |||
Algorithm* palgo = m_megdnn_opr->get_algorithm_from_desc(heu.algo); | |||
mgb_assert(palgo, "Unknown algo description"); | |||
mgb_assert(found, | |||
"algo %s got by heuristic not found in " | |||
"candidate list", | |||
heu.name.c_str()); | |||
palgo->name()); | |||
return std::move(ret); | |||
} | |||
template <typename Opr> | |||
std::vector<typename AlgoChooser<Opr>::ImplAlgo> | |||
AlgoChooser<Opr>::ExeContext::get_all_candidates_with_workspace_limit() const { | |||
auto&& all_algos = get_all_candidates(); | |||
auto opr = m_mgb_opr; | |||
auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit( | |||
opr->owner_graph(), opr->comp_node(), | |||
opr->execution_policy().workspace_limit); | |||
std::vector<ImplAlgo> ret; | |||
for (auto&& algo : all_algos) { | |||
if (get_workspace_size_bytes(algo) <= workspace_limit) { | |||
ret.push_back(algo); | |||
} | |||
void AlgoChooser<Opr>::ExeContext::construct_execution_policy_from_cache( | |||
bool require_reproducible, | |||
typename AlgoChooser<Opr>::ImplExecutionPolicy& policy) const { | |||
if (!policy.algo.valid()) { | |||
policy.algo = get_profile_result_from_cache(require_reproducible).desc; | |||
mgb_assert(policy.algo.valid(), | |||
"No cache found, maybe some error occured"); | |||
} | |||
return ret; | |||
Algorithm* algo = m_megdnn_opr->get_algorithm_from_desc(policy.algo); | |||
mgb_assert(algo, "Unknown algo description"); | |||
std::vector<Algorithm::SearchItem>&& sub_items = algo->get_subopr_list( | |||
to_layout_array<Opr>(m_layouts), m_megdnn_opr); | |||
FOREACH_OPR_TYPE_DISPATCH(sub_items, { | |||
auto&& megdnn_opr = intl::create_megdnn_opr<_Opr>(m_cn); | |||
megdnn_opr->param() = | |||
Algorithm::deserialize_read_pod<typename _Opr::Param>( | |||
_item.param); | |||
typename AlgoChooser<_Opr>::ExeContext sub_ctx( | |||
to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(), | |||
_item.param, m_base_mgb_opr, m_cn, m_execution_policy, | |||
m_allow_weight_preprocess); | |||
policy.sub_policy.push_back({}); | |||
sub_ctx.construct_execution_policy_from_cache(require_reproducible, | |||
policy.sub_policy.back()); | |||
}); | |||
return; | |||
} | |||
template <typename Opr> | |||
size_t AlgoChooser<Opr>::ExeContext::get_workspace_size_bytes( | |||
ImplAlgo algo) const { | |||
m_megdnn_opr->execution_policy() = {algo}; | |||
const ImplExecutionPolicy& policy) const { | |||
m_megdnn_opr->execution_policy() = policy; | |||
size_t result; | |||
if_constexpr<opr_supports_preprocess<Opr>()>( | |||
[&](auto _) { | |||
@@ -336,17 +578,13 @@ size_t AlgoChooser<Opr>::ExeContext::get_workspace_size_bytes( | |||
template <typename Opr> | |||
Maybe<AlgoChooserProfileCache::ResultEntry> | |||
AlgoChooser<Opr>::ExeContext::profile_single_algo(ImplAlgo algo, | |||
double& timeout) const { | |||
AlgoChooser<Opr>::ExeContext::profile_single_algo( | |||
const ImplExecutionPolicy& policy, double& timeout) const { | |||
typename TimedProfiler<Opr>::Param param; | |||
auto name = algo.name.c_str(); | |||
// force check copy size <= dest len-1 from gcc8 for safe | |||
auto len = sizeof(param.algo_name); | |||
strncpy(param.algo_name, name, len - 1); | |||
param.algo_name[len - 1] = '\0'; | |||
mgb_assert(!param.algo_name[sizeof(param.algo_name) - 2], | |||
"algo name too long: %s; len=%zu", name, strlen(name)); | |||
param.workspace = get_workspace_size_bytes(algo); | |||
param.execution_policy = | |||
TimedProfiler<Opr>::Param::ExecutionPolicyBlob::serialize(policy); | |||
param.workspace = get_workspace_size_bytes(policy); | |||
for (int i = 0; i < arity; ++i) { | |||
auto&& src = m_layouts[i]; | |||
mgb_assert(src.format.is_default() && | |||
@@ -357,23 +595,25 @@ AlgoChooser<Opr>::ExeContext::profile_single_algo(ImplAlgo algo, | |||
src.to_string().c_str()); | |||
param.dtypes[i] = src.dtype.enumv(); | |||
} | |||
param.comp_node_loc = m_mgb_opr->output(0)->comp_node().locator(); | |||
param.comp_node_loc = m_cn.locator(); | |||
mgb_assert(param.shapes.size() == m_layouts.size()); | |||
for (size_t i = 0; i < param.shapes.size(); ++i) | |||
param.shapes[i] = m_layouts[i]; | |||
param.opr_param = m_megdnn_opr->param(); | |||
param.allow_weight_preprocess = m_allow_weight_preprocess; | |||
Algorithm* palgo = m_megdnn_opr->get_algorithm_from_desc(policy.algo); | |||
mgb_assert(palgo, "Unknown algo description"); | |||
auto rst = TimedProfiler<Opr>::profile(param, timeout); | |||
// MIOpen conv profiles all available algos when a specfic shape is | |||
// provided for the first time, which probably adds to the result time. | |||
// Therefore, a second profile execution is needed. | |||
if (strncmp(name, "MIOpen", 6) == 0) | |||
if (strncmp(palgo->name(), "MIOpen", 6) == 0) | |||
rst = TimedProfiler<Opr>::profile(param, timeout); | |||
if (!rst.valid()) | |||
return None; | |||
return AlgoChooserProfileCache::ResultEntry{ | |||
algo.name.c_str(), algo.is_reproducible, rst.val().time, | |||
palgo->name(), palgo->is_reproducible(), rst.val().time, | |||
param.workspace}; | |||
} | |||
@@ -414,21 +654,34 @@ AlgoChooser<Opr>::ExeContext::construct_fake_preprocess_filter() const { | |||
} | |||
#define INST(Opr) \ | |||
template typename AlgoChooser<megdnn::Opr>::ImplAlgo \ | |||
template AlgoChooser<megdnn::Opr>::ExeContext::ExeContext( \ | |||
const FixedTensorLayouts& layouts, megdnn::Opr* megdnn_opr, \ | |||
const std::string& param_str, const cg::OperatorNodeBase* mgb_opr, \ | |||
const CompNode& cn, \ | |||
const megdnn::param::ExecutionPolicy& execution_policy, \ | |||
bool allow_weight_preprocess); \ | |||
template typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy \ | |||
AlgoChooser<megdnn::Opr>::ExeContext::choose_by_heuristic( \ | |||
bool reproducible) const; \ | |||
template typename AlgoChooser<megdnn::Opr>::ImplAlgo \ | |||
AlgoChooser<megdnn::Opr>::ExeContext::get_profile_result_from_cache( \ | |||
bool require_reproducible) const; \ | |||
template std::vector<typename AlgoChooser<megdnn::Opr>::ImplAlgo> \ | |||
AlgoChooser<megdnn::Opr>::ExeContext::get_all_candidates() const; \ | |||
template std::vector<typename AlgoChooser<megdnn::Opr>::ImplAlgo> \ | |||
AlgoChooser<megdnn::Opr>::ExeContext:: \ | |||
get_all_candidates_with_workspace_limit() const; \ | |||
template size_t \ | |||
AlgoChooser<megdnn::Opr>::ExeContext::get_workspace_size_bytes( \ | |||
typename AlgoChooser<megdnn::Opr>::ImplAlgo algo) const; \ | |||
const typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy& \ | |||
policy) const; \ | |||
template void AlgoChooser<megdnn::Opr>::ExeContext:: \ | |||
construct_execution_policy_from_cache( \ | |||
bool require_reproducible, \ | |||
typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy& \ | |||
policy) const; \ | |||
template Maybe<AlgoChooserProfileCache::ResultEntry> \ | |||
AlgoChooser<megdnn::Opr>::ExeContext::profile_single_algo( \ | |||
typename AlgoChooser<megdnn::Opr>::ImplAlgo algo, double& timeout) \ | |||
const; \ | |||
const typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy& \ | |||
policy, \ | |||
double& timeout) const; | |||
MGB_FOREACH_FASTRUN_OPR(INST) | |||
@@ -14,6 +14,8 @@ | |||
#include "../internal/invoke.h" | |||
#include "../internal/megdnn_opr_wrapper.inl" | |||
#include "megdnn/handle.h" | |||
#include "megdnn/oprs/base.h" | |||
#if MGB_ROCM | |||
#include "hcc_detail/hcc_defs_prologue.h" | |||
@@ -32,12 +34,96 @@ MIDOUT_DECL(megbrain_opr_profile) | |||
} \ | |||
MIDOUT_END(); | |||
namespace { | |||
std::string serialize_policy(const megdnn::ExecutionPolicy& policy) { | |||
std::string ret; | |||
//! serialize AlgorithmDesc | |||
megdnn::Algorithm::serialize_write_pod(policy.algo.handle_type, ret); | |||
megdnn::Algorithm::serialize_write_pod(policy.algo.type, ret); | |||
uint32_t param_size = policy.algo.param.size(); | |||
megdnn::Algorithm::serialize_write_pod<uint32_t>(param_size, ret); | |||
ret += policy.algo.param; | |||
//! serialize sub_policy | |||
uint32_t size = policy.sub_policy.size(); | |||
megdnn::Algorithm::serialize_write_pod(size, ret); | |||
for (auto&& sub : policy.sub_policy) { | |||
ret += serialize_policy(sub); | |||
} | |||
return ret; | |||
} | |||
megdnn::ExecutionPolicy deserialize_policy(const char* buf, uint32_t size, | |||
uint32_t& offset) { | |||
megdnn::ExecutionPolicy ret; | |||
#define cb(_val, _type) \ | |||
_val = megdnn::Algorithm::deserialize_read_pod<_type>(buf, offset); \ | |||
offset += sizeof(_val) | |||
cb(ret.algo.handle_type, megdnn::Handle::HandleType); | |||
cb(ret.algo.type, uint32_t); | |||
uint32_t param_size = 0; | |||
cb(param_size, uint32_t); | |||
if (param_size > 0) { | |||
ret.algo.param = std::string(buf + offset, param_size); | |||
offset += param_size; | |||
} | |||
uint32_t nr_policy = 0; | |||
cb(nr_policy, uint32_t); | |||
#undef cb | |||
for (uint32_t i = 0; i < nr_policy; i++) { | |||
ret.sub_policy.push_back(deserialize_policy(buf, size, offset)); | |||
} | |||
return ret; | |||
} | |||
} | |||
namespace mgb { | |||
namespace opr { | |||
#define APPLY(statement, ...) \ | |||
mgb::apply([&](const auto&... args) { return statement; }, \ | |||
std::tuple_cat(__VA_ARGS__)) | |||
////////////// TimedProfiler::Param::ExecutionPolicyBlob ////////////////////// | |||
template <typename Opr> | |||
typename TimedProfiler<Opr>::Param::ExecutionPolicyBlob | |||
TimedProfiler<Opr>::Param::ExecutionPolicyBlob::serialize( | |||
const megdnn::ExecutionPolicy& policy) { | |||
ExecutionPolicyBlob ret; | |||
std::string serialize_bin = serialize_policy(policy); | |||
mgb_assert(serialize_bin.size() < MAX_SIZE_IN_BYTES); | |||
memcpy(ret.data, serialize_bin.data(), serialize_bin.size()); | |||
ret.size = serialize_bin.size(); | |||
return ret; | |||
} | |||
template <typename Opr> | |||
megdnn::ExecutionPolicy | |||
TimedProfiler<Opr>::Param::ExecutionPolicyBlob::deserialize() const { | |||
uint32_t offset = 0; | |||
auto&& ret = deserialize_policy(data, size, offset); | |||
mgb_assert(offset == size); | |||
return std::move(ret); | |||
} | |||
#define INST(Opr) \ | |||
template typename TimedProfiler<megdnn::Opr>::Param::ExecutionPolicyBlob \ | |||
TimedProfiler<megdnn::Opr>::Param::ExecutionPolicyBlob::serialize( \ | |||
const megdnn::ExecutionPolicy& policy); \ | |||
template megdnn::ExecutionPolicy \ | |||
TimedProfiler<megdnn::Opr>::Param::ExecutionPolicyBlob::deserialize() \ | |||
const; | |||
MGB_FOREACH_FASTRUN_OPR(INST) | |||
#undef INST | |||
////////////////// TimedProfiler ////////////////////////////// | |||
template <typename Opr> | |||
const double TimedProfiler<Opr>::timeout_setting = | |||
TimedProfiler<Opr>::init_timeout_setting(); | |||
@@ -99,18 +185,7 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl( | |||
} | |||
megdnn_opr->param() = param.opr_param; | |||
{ | |||
typename Opr::AlgorithmInfo algo; | |||
for (auto i : | |||
APPLY(megdnn_opr->get_all_algorithms_info(args...), layouts)) { | |||
if (!strcmp(i.name.c_str(), param.algo_name)) { | |||
algo = i; | |||
break; | |||
} | |||
} | |||
mgb_assert(algo.valid(), "algorithm %s not found", param.algo_name); | |||
megdnn_opr->execution_policy() = {algo}; | |||
} | |||
megdnn_opr->execution_policy() = param.execution_policy.deserialize(); | |||
// Allocate preprocessed weight buffers. | |||
TensorLayoutArray preprocessed_layout; | |||
@@ -222,13 +297,16 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl( | |||
}); | |||
ev_end->record(); | |||
megdnn::Algorithm* algo = megdnn_opr->get_algorithm_from_desc( | |||
megdnn_opr->execution_policy().algo); | |||
mgb_assert(algo); | |||
double next_report_time = 0.5; | |||
while (!ev_end->finished()) { | |||
if (timer.get_secs() >= next_report_time) { | |||
mgb_log_warn( | |||
"profiling conv algo %s already took %.3f/%.3f secs" | |||
" (limit can be set by MGB_CONV_PROFILING_TIMEOUT) ", | |||
param.algo_name, timer.get_secs(), param.actual_timeout); | |||
algo->name(), timer.get_secs(), param.actual_timeout); | |||
next_report_time = timer.get_secs() + 1; | |||
} | |||
using namespace std::literals; | |||
@@ -46,7 +46,7 @@ private: | |||
static bool check_layout(const TensorLayout& layout, int transpose); | |||
//! store the policy of all transpose situations | |||
megdnn::MatrixMul::ExecutionPolicy m_cadidate_execution_policies[4]; | |||
megdnn::ExecutionPolicy m_cadidate_execution_policies[4]; | |||
}; | |||
/*! | |||
@@ -76,7 +76,7 @@ private: | |||
static bool check_layout(const TensorLayout& layout, bool transpose); | |||
//! store the policy of all transpose situations | |||
megdnn::BatchedMatrixMul::ExecutionPolicy m_cadidate_execution_policies[4]; | |||
megdnn::ExecutionPolicy m_cadidate_execution_policies[4]; | |||
}; | |||
/*! | |||
@@ -12,9 +12,14 @@ | |||
#pragma once | |||
#include <memory> | |||
#include "megbrain/graph/cg.h" | |||
#include "megbrain/graph/operator_node.h" | |||
#include "megbrain/opr/search_policy/algo_chooser_helper.h" | |||
#include "megbrain/opr/search_policy/profiler.h" | |||
#include "megbrain/opr/dnn/convolution.h" | |||
#include "megbrain/opr/blas.h" | |||
#include "megdnn/oprs/base.h" | |||
template <class MegDNNOpr> | |||
struct MegDNNOpr2MGBOpr; | |||
@@ -49,52 +54,64 @@ class AlgoChooser { | |||
static constexpr int arity = OprArityTrait<Opr>::arity; | |||
using ImplAlgo = typename Opr::AlgorithmInfo; | |||
using ImplExecutionPolicy = megdnn::ExecutionPolicy; | |||
using MGBOpr = typename MegDNNOpr2MGBOpr<Opr>::MGBOpr; | |||
using TensorLayoutArray = std::array<TensorLayout, arity>; | |||
public: | |||
using FixedTensorLayouts = std::array<TensorLayout, arity>; | |||
class ExeContext { | |||
const TensorLayoutArray& m_layouts; | |||
FixedTensorLayouts m_layouts; | |||
Opr* m_megdnn_opr; | |||
const MGBOpr* m_mgb_opr; | |||
std::string m_param; | |||
const cg::OperatorNodeBase* m_base_mgb_opr; | |||
CompNode m_cn; | |||
megdnn::param::ExecutionPolicy m_execution_policy; | |||
bool m_allow_weight_preprocess; | |||
public: | |||
ExeContext(const TensorLayoutArray& layouts, Opr* megdnn_opr, | |||
const MGBOpr* mgb_opr, bool allow_weight_preprocess) | |||
: m_layouts{layouts}, | |||
m_megdnn_opr{megdnn_opr}, | |||
m_mgb_opr{mgb_opr}, | |||
m_allow_weight_preprocess{allow_weight_preprocess} { | |||
mgb_assert(m_layouts.size() == layouts.size()); | |||
static_assert( | |||
std::tuple_size<TensorLayoutArray>::value == 3 || | |||
std::tuple_size<TensorLayoutArray>::value == 5 || | |||
std::tuple_size<TensorLayoutArray>::value == 8, | |||
"Convolution AlgoChooser assumes arity = 3 , 5 or 8 (for " | |||
"deformable conv)"); | |||
} | |||
ExeContext(const FixedTensorLayouts& layouts, Opr* megdnn_opr, | |||
const std::string& param_str, | |||
const cg::OperatorNodeBase* mgb_opr, const CompNode& cn, | |||
const megdnn::param::ExecutionPolicy& execution_policy, | |||
bool allow_weight_preprocess); | |||
Opr* megdnn_opr() const { return m_megdnn_opr; } | |||
const MGBOpr* mgb_opr() const { return m_mgb_opr; } | |||
const TensorLayout& inp_layout(size_t idx) const { | |||
return m_layouts[idx]; | |||
} | |||
const TensorLayoutArray& layouts() const { return m_layouts; } | |||
cg::ComputingGraph* owner_graph() const { | |||
return m_base_mgb_opr->owner_graph(); | |||
} | |||
const cg::OperatorNodeBase* mgb_opr() const { return m_base_mgb_opr; } | |||
const megdnn::param::ExecutionPolicy& execution_policy() const { | |||
return m_execution_policy; | |||
} | |||
CompNode comp_node() const { return m_cn; } | |||
const std::string& param() const { return m_param; } | |||
bool allow_weight_preprocess() const { | |||
return m_allow_weight_preprocess; | |||
} | |||
megdnn::Algorithm* get_algorithm_from_desc( | |||
const megdnn::Algorithm::Info::Desc& desc) const { | |||
return m_megdnn_opr->get_algorithm_from_desc(desc); | |||
} | |||
const FixedTensorLayouts& layouts() const { return m_layouts; } | |||
ImplAlgo choose_by_heuristic(bool reproducible = false) const; | |||
ImplExecutionPolicy choose_by_heuristic( | |||
bool reproducible = false) const; | |||
//! get all candidate algos, and the one choose_by_heuristic() is | |||
//! put first | |||
std::vector<ImplAlgo> get_all_candidates() const; | |||
//! get candidate algos with workspace limit. | |||
std::vector<ImplAlgo> get_all_candidates_with_workspace_limit() const; | |||
//! get workspace size required for specific algo | |||
size_t get_workspace_size_bytes(ImplAlgo algo) const; | |||
//! get workspace size required for specific execution policy | |||
size_t get_workspace_size_bytes( | |||
const ImplExecutionPolicy& policy) const; | |||
/*! | |||
* \brief profile a single algorithm | |||
@@ -106,28 +123,59 @@ class AlgoChooser { | |||
* timeout used during profiling | |||
*/ | |||
Maybe<AlgoChooserProfileCache::ResultEntry> profile_single_algo( | |||
ImplAlgo algo, double& timeout) const; | |||
const ImplExecutionPolicy& policy, double& timeout) const; | |||
//! get all profile algorithm from cache, return invalid if not exists | |||
ImplAlgo get_profile_result_from_cache(bool require_reproducible) const; | |||
/** | |||
* \brief construct execution policy from cache. | |||
* | |||
* \param require_reproducible select algo which is reproducible | |||
* \param policy execution policy | |||
*/ | |||
void construct_execution_policy_from_cache( | |||
bool require_reproducible, ImplExecutionPolicy& policy) const; | |||
private: | |||
Maybe<PreprocessFilter<Opr>> construct_fake_preprocess_filter() const; | |||
}; | |||
//! entrance for getting algorithm according to execution strategy | |||
static ImplAlgo get_algo(ExeContext& ctx); | |||
//! get all profile result, either by retrieving cache or profiling | |||
static AlgoChooserProfileCache::Result get_profile_result( | |||
ExeContext& ctx, bool enable_update); | |||
template<typename U> | |||
friend class AlgoChooser; | |||
static ImplAlgo choose_by_profile(ExeContext& ctx, | |||
bool require_reproducible, | |||
bool enable_update = true); | |||
private: | |||
//! entrance for getting algorithm according to execution strategy | |||
static ImplExecutionPolicy get_policy(ExeContext& ctx); | |||
//! profile and save to cache | |||
static void profile(ExeContext& ctx, bool require_reproducible); | |||
static ImplExecutionPolicy choose_by_profile(ExeContext& ctx, | |||
bool require_reproducible, | |||
bool enable_update = true); | |||
/** | |||
* flatten search space in postorder traversal | |||
* The subopr search construct a search tree | |||
* | |||
* A | |||
* / \ | |||
* B1B2 C | |||
* / \ | |||
* D1D2D3 E | |||
* We use postorder traverse the search tree. | |||
* D1 -> D2 -> D3 -> E -> B1 -> B2 -> C -> A | |||
*/ | |||
static std::vector<megdnn::Algorithm::SearchItem> flatten_search_space( | |||
const ExeContext& ctx); | |||
public: | |||
/*! | |||
* \brief setup algorithm and return workspace size | |||
*/ | |||
static size_t setup_algo(const TensorLayoutArray& layouts, Opr* megdnn_opr, | |||
static size_t setup_algo(const FixedTensorLayouts& layouts, Opr* megdnn_opr, | |||
const MGBOpr* mgb_opr, | |||
bool allow_weight_preprocess = false); | |||
}; | |||
@@ -28,9 +28,9 @@ namespace mixin { | |||
class AlgoChooserHelper : cg::OperatorNodeMixinBase { | |||
public: | |||
using ExecutionPolicy = megdnn::param::ExecutionPolicy; | |||
using AlgorithmInfo = megdnn::detail::Algorithm::Info; | |||
using AlgorithmPolicy = megdnn::ExecutionPolicy; | |||
using AlgoChooserHook = | |||
std::function<AlgorithmInfo(const cg::OperatorNodeBase*)>; | |||
std::function<AlgorithmPolicy(const cg::OperatorNodeBase*)>; | |||
const ExecutionPolicy& execution_policy() const { | |||
if (!m_policy_accessed) { | |||
@@ -18,6 +18,7 @@ | |||
#include "megbrain/comp_node.h" | |||
#include "megdnn/basic_types.h" | |||
#include "megdnn/oprs/base.h" | |||
#include "megdnn/oprs/linalg.h" | |||
#include "megdnn/oprs/nn.h" | |||
@@ -139,7 +140,17 @@ class TimedProfiler { | |||
public: | |||
struct Param { | |||
char algo_name[128]; | |||
struct ExecutionPolicyBlob { | |||
//! enlarge the max size if needed | |||
constexpr static size_t MAX_SIZE_IN_BYTES = 10240; | |||
char data[MAX_SIZE_IN_BYTES]; | |||
uint32_t size; | |||
static ExecutionPolicyBlob serialize( | |||
const megdnn::ExecutionPolicy& policy); | |||
megdnn::ExecutionPolicy deserialize() const; | |||
}; | |||
ExecutionPolicyBlob execution_policy; | |||
size_t workspace; | |||
megdnn::DTypeEnum dtypes[arity]; | |||
CompNode::Locator comp_node_loc; | |||
@@ -20,11 +20,13 @@ | |||
#include "megbrain/opr/basic_arith.h" | |||
#include "megbrain/gopt/inference.h" | |||
#include "megbrain/opr/tensor_manip.h" | |||
#include "megdnn/dtype.h" | |||
#include "megdnn/oprs/base.h" | |||
#include <gmock/gmock.h> | |||
#include <cmath> | |||
#include <memory> | |||
#include <random> | |||
using namespace mgb; | |||
@@ -37,6 +39,73 @@ using Mode = Param::Mode; | |||
Mode modes_to_check[] = {Mode::CONVOLUTION, Mode::CROSS_CORRELATION}; | |||
void conv_bwd_data_brute(const std::vector<std::shared_ptr<HostTensorND>>& inps, | |||
std::shared_ptr<HostTensorND>& dest, | |||
const opr::ConvolutionBackwardData::Param& param) { | |||
mgb_assert(param.format == Param::Format::NCHW); | |||
auto &&data = *inps[0], &&filter = *inps[1]; | |||
size_t N = data.shape(0), IH = data.shape(2), IW = data.shape(3); | |||
size_t GROUP, ICPG, OCPG, FH, FW; | |||
if (param.sparse == Param::Sparse::DENSE) { | |||
GROUP = 1, ICPG = filter.shape(0), OCPG = filter.shape(1), | |||
FH = filter.shape(2), FW = filter.shape(3); | |||
} else { | |||
mgb_assert(param.sparse == Param::Sparse::GROUP); | |||
GROUP = filter.shape(0), ICPG = filter.shape(1), OCPG = filter.shape(2), | |||
FH = filter.shape(3), FW = filter.shape(4); | |||
} | |||
auto get_shp = [](size_t inp, size_t filter, size_t stride, size_t pad, | |||
size_t dilate) { | |||
return (inp - 1) * stride + (filter - 1) * dilate + 1 - pad * 2; | |||
}; | |||
size_t OH = get_shp(IH, FH, param.stride_h, param.pad_h, param.dilate_h), | |||
OW = get_shp(IW, FW, param.stride_w, param.pad_w, param.dilate_w); | |||
dest = std::make_shared<HostTensorND>(CompNode::load("xpu0"), | |||
TensorShape{N, OCPG * GROUP, OH, OW}); | |||
auto&& out = *dest; | |||
auto fptr = filter.ptr<float>(), dptr = data.ptr<float>(), | |||
optr = out.ptr<float>(); | |||
memset(optr, 0, sizeof(float) * out.shape().total_nr_elems()); | |||
auto ol = out.layout(), fl = filter.layout(); | |||
#define FOR2(a, A, b, B) \ | |||
for (size_t a = 0; a < A; ++a) \ | |||
for (size_t b = 0; b < B; ++b) | |||
#define FOR3(a, A, b, B, c, C) \ | |||
FOR2(a, A, b, B) \ | |||
for (size_t c = 0; c < C; ++c) | |||
FOR3(n, N, group, GROUP, icg, ICPG) | |||
FOR2(ih, IH, iw, IW) { | |||
float scale = *(dptr++); | |||
FOR3(ocg, OCPG, fh, FH, fw, FW) { | |||
auto oc_tot = group * OCPG + ocg; | |||
int oh = int(ih * param.stride_h + fh * param.dilate_h) - | |||
int(param.pad_h), | |||
ow = int(iw * param.stride_w + fw * param.dilate_w) - | |||
int(param.pad_w); | |||
if (oh >= 0 && ow >= 0 && oh < static_cast<int>(OH) && | |||
ow < static_cast<int>(OW)) { | |||
auto out_off = n * ol.stride[0] + oc_tot * ol.stride[1] + | |||
oh * ol.stride[2] + ow; | |||
size_t flt_off = 0; | |||
if (param.sparse == Param::Convolution::Sparse::DENSE) { | |||
flt_off = icg * fl.stride[0] + | |||
ocg * fl.stride[1] + fh * fl.stride[2] + fw; | |||
} else { | |||
flt_off = group * fl.stride[0] + icg * fl.stride[1] + | |||
ocg * fl.stride[2] + fh * fl.stride[3] + fw; | |||
} | |||
optr[out_off] += scale * fptr[flt_off]; | |||
} | |||
} | |||
} | |||
#undef FOR3 | |||
#undef FOR2 | |||
} | |||
void conv_bwd_flt_brute(const std::vector<std::shared_ptr<HostTensorND>>& inps, | |||
std::shared_ptr<HostTensorND>& out, | |||
const opr::ConvolutionBackwardFilter::Param& param) { | |||
@@ -370,7 +439,8 @@ TEST(TestOprDNN, ConvolutionExePolicy) { | |||
PersistentCacheHook cache_hook{on_get}; | |||
#if MGB_ENABLE_FASTRUN | |||
for (auto strategy: {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE, S::PROFILE_HEURISTIC}) { | |||
for (auto strategy : {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE, | |||
S::PROFILE_HEURISTIC}) { | |||
#else | |||
for (auto strategy: {S:HEURISTIC, S::PROFILE_HEURISTIC}) { | |||
#endif | |||
@@ -406,6 +476,95 @@ TEST(TestOprDNN, ConvolutionExePolicy) { | |||
} | |||
} | |||
TEST(TestOprDNN, ConvolutionBackwardDataBfloat16ExePolicy) { | |||
REQUIRE_GPU(1); | |||
Param param{Mode::CROSS_CORRELATION, 1, 1, 1, 1}; | |||
param.compute_mode = Param::ComputeMode::FLOAT32; | |||
using Policy = opr::Convolution::ExecutionPolicy; | |||
using S = Policy::Strategy; | |||
auto gen_bfp16 = [](HostTensorND& dest) { | |||
RNGxorshf rng{next_rand_seed()}; | |||
auto rand_real = [&rng]() { | |||
std::uniform_real_distribution<float> dist(-1, 1); | |||
return dist(rng); | |||
}; | |||
auto ptr = dest.ptr<dt_bfloat16>(); | |||
size_t elems = dest.shape().total_nr_elems(); | |||
for (size_t i = 0; i < elems; i++) { | |||
ptr[i] = dt_bfloat16(rand_real()); | |||
} | |||
}; | |||
auto f32_to_bf16 = [](const std::shared_ptr<HostTensorND>& src) | |||
-> std::shared_ptr<HostTensorND> { | |||
auto ret = std::make_shared<HostTensorND>( | |||
src->comp_node(), src->shape(), dtype::BFloat16{}); | |||
for (size_t i = 0; i < src->layout().total_nr_elems(); i++) { | |||
ret->ptr<dt_bfloat16>()[i] = src->ptr<dt_float32>()[i]; | |||
} | |||
return ret; | |||
}; | |||
auto bf16_to_f32 = [](const std::shared_ptr<HostTensorND>& src) | |||
-> std::shared_ptr<HostTensorND> { | |||
auto ret = std::make_shared<HostTensorND>( | |||
src->comp_node(), src->shape(), dtype::Float32{}); | |||
for (size_t i = 0; i < src->layout().total_nr_elems(); i++) { | |||
ret->ptr<dt_float32>()[i] = src->ptr<dt_bfloat16>()[i]; | |||
} | |||
return ret; | |||
}; | |||
int nr_get = 0; | |||
auto on_get = [&nr_get](const std::string&, const void*, size_t, | |||
const void*, size_t) { ++nr_get; }; | |||
PersistentCacheHook cache_hook{on_get}; | |||
#if MGB_ENABLE_FASTRUN | |||
for (auto strategy : {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE, | |||
S::PROFILE_HEURISTIC}) { | |||
#else | |||
for (auto strategy: {S:HEURISTIC, S::PROFILE_HEURISTIC}) { | |||
#endif | |||
using Checker = AutoOprChecker<2, 1>; | |||
auto make_graph = [&](const Checker::SymInpArray& inputs) | |||
-> Checker::SymOutArray { | |||
Policy policy; | |||
policy.strategy = strategy; | |||
return {opr::ConvolutionBackwardData::make_deconv( | |||
inputs[0], inputs[1], param, policy)}; | |||
}; | |||
auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) { | |||
std::shared_ptr<HostTensorND> out; | |||
conv_bwd_data_brute( | |||
{bf16_to_f32(inp[0]), bf16_to_f32(inp[1])}, out, | |||
param); | |||
dest[0] = *f32_to_bf16(out); | |||
}; | |||
Checker::RunOptions opt; | |||
opt.outputs_max_err = 1e-3; | |||
nr_get = 0; | |||
Checker(make_graph, fwd) | |||
.disable_grad_check() | |||
.set_input_dtype(0, dtype::BFloat16{}) | |||
.set_input_dtype(1, dtype::BFloat16{}) | |||
.set_input_generator(0, gen_bfp16) | |||
.set_input_generator(1, gen_bfp16) | |||
.run({TensorShape{3, 4, 10, 6}, {4, 2, 3, 3}}, opt) | |||
.run({TensorShape{2, 2, 4, 3}, {2, 2, 3, 3}}, opt) | |||
.run({TensorShape{1, 3, 10, 6}, {3, 2, 3, 3}}, opt); | |||
if (strategy == S::HEURISTIC) { | |||
ASSERT_EQ(0, nr_get); | |||
} else { | |||
ASSERT_LT(0, nr_get); | |||
} | |||
} | |||
} | |||
TEST(TestOprDNN, Deconvolution) { | |||
// dilated grouped deconv | |||
using Checker = AutoOprChecker<2, 1>; | |||
@@ -420,55 +579,9 @@ TEST(TestOprDNN, Deconvolution) { | |||
}; | |||
auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) { | |||
auto &&data = *inp[0], &&filter = *inp[1]; | |||
size_t N = data.shape(0), IH = data.shape(2), IW = data.shape(3); | |||
size_t GROUP = filter.shape(0), ICPG = filter.shape(1), | |||
OCPG = filter.shape(2), FH = filter.shape(3), | |||
FW = filter.shape(4); | |||
auto get_shp = [](size_t inp, size_t filter, size_t stride, size_t pad, | |||
size_t dilate) { | |||
return (inp - 1) * stride + (filter - 1) * dilate + 1 - pad * 2; | |||
}; | |||
auto &&out = dest[0]; | |||
size_t OH = get_shp(IH, FH, param.stride_h, param.pad_h, | |||
param.dilate_h), | |||
OW = get_shp(IW, FW, param.stride_w, param.pad_w, | |||
param.dilate_w); | |||
out.resize({N, OCPG * GROUP, OH, OW}); | |||
auto fptr = filter.ptr<float>(), dptr = data.ptr<float>(), | |||
optr = out.ptr<float>(); | |||
memset(optr, 0, sizeof(float) * out.shape().total_nr_elems()); | |||
auto ol = out.layout(), fl = filter.layout(); | |||
#define FOR2(a, A, b, B) \ | |||
for (size_t a = 0; a < A; ++a) \ | |||
for (size_t b = 0; b < B; ++b) | |||
#define FOR3(a, A, b, B, c, C) \ | |||
FOR2(a, A, b, B) \ | |||
for (size_t c = 0; c < C; ++c) | |||
FOR3(n, N, group, GROUP, icg, ICPG) | |||
FOR2(ih, IH, iw, IW) { | |||
float scale = *(dptr++); | |||
FOR3(ocg, OCPG, fh, FH, fw, FW) { | |||
auto oc_tot = group * OCPG + ocg; | |||
int oh = int(ih * param.stride_h + fh * param.dilate_h) - | |||
int(param.pad_h), | |||
ow = int(iw * param.stride_w + fw * param.dilate_w) - | |||
int(param.pad_w); | |||
if (oh >= 0 && ow >= 0 && oh < static_cast<int>(OH) && | |||
ow < static_cast<int>(OW)) { | |||
auto out_off = n * ol.stride[0] + oc_tot * ol.stride[1] + | |||
oh * ol.stride[2] + ow, | |||
flt_off = group * fl.stride[0] + icg * fl.stride[1] + | |||
ocg * fl.stride[2] + fh * fl.stride[3] + fw; | |||
optr[out_off] += scale * fptr[flt_off]; | |||
} | |||
} | |||
} | |||
#undef FOR3 | |||
#undef FOR2 | |||
std::shared_ptr<HostTensorND> out; | |||
conv_bwd_data_brute({inp[0], inp[1]}, out, param); | |||
dest[0] = *out; | |||
}; | |||
Checker::RunOptions opt; | |||
@@ -1547,7 +1660,8 @@ TEST(TestOprDNN, LocalShareForwardExecPolicy) { | |||
PersistentCacheHook cache_hook{on_get}; | |||
#if MGB_ENABLE_FASTRUN | |||
for (auto strategy: {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE, S::PROFILE_HEURISTIC}) { | |||
for (auto strategy : {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE, | |||
S::PROFILE_HEURISTIC}) { | |||
#else | |||
for (auto strategy: {S:HEURISTIC, S::PROFILE_HEURISTIC}) { | |||
#endif | |||
@@ -2004,29 +2118,34 @@ TEST(TestOprDNN, HeuristicReproducible) { | |||
.run(inp_tensor(1, 5, 3, 7, 9, 3, 3), opt) | |||
.run(inp_tensor(3, 4, 4, 9, 9, 3, 3), opt); | |||
auto algo = static_cast<megdnn::ConvolutionBackwardFilter*>( | |||
auto&& megdnn_opr = static_cast<megdnn::ConvolutionBackwardFilter*>( | |||
static_cast<opr::ConvolutionBackwardFilter*>( | |||
bwd_flt->owner_opr()) | |||
->megdnn_opr()) | |||
->execution_policy() | |||
.algo; | |||
->megdnn_opr()); | |||
auto&& algo = megdnn_opr->execution_policy().algo; | |||
megdnn::Algorithm* palgo = | |||
megdnn_opr->get_algorithm_from_desc(algo); | |||
mgb_assert(palgo, "Unknown algo description"); | |||
if (strategy == S::HEURISTIC_REPRODUCIBLE) { | |||
EXPECT_TRUE(algo.is_reproducible); | |||
EXPECT_TRUE(palgo->is_reproducible()); | |||
} | |||
algo_name0 = algo.name.c_str(); | |||
algo_name0 = palgo->name(); | |||
} | |||
{ | |||
Checker checker(make_graph, fwd); | |||
checker.run(inp_tensor(2, 3, 4, 9, 8, 3, 3), opt) | |||
.run(inp_tensor(1, 5, 3, 7, 9, 3, 3), opt) | |||
.run(inp_tensor(3, 4, 4, 9, 9, 3, 3), opt); | |||
auto algo = static_cast<megdnn::ConvolutionBackwardFilter*>( | |||
static_cast<opr::ConvolutionBackwardFilter*>( | |||
bwd_flt->owner_opr()) | |||
->megdnn_opr()) | |||
->execution_policy() | |||
.algo; | |||
algo_name1 = algo.name.c_str(); | |||
auto&& megdnn_opr = static_cast<megdnn::ConvolutionBackwardFilter*>( | |||
static_cast<opr::ConvolutionBackwardFilter*>( | |||
bwd_flt->owner_opr()) | |||
->megdnn_opr()); | |||
auto&& algo = megdnn_opr->execution_policy().algo; | |||
megdnn::Algorithm* palgo = | |||
megdnn_opr->get_algorithm_from_desc(algo); | |||
mgb_assert(palgo, "Unknown algo description"); | |||
algo_name1 = palgo->name(); | |||
} | |||
EXPECT_TRUE(algo_name0 == algo_name1); | |||
} | |||
@@ -2286,6 +2405,8 @@ TEST_F(TestWeightPreprocess, NoPreprocessNeeded) { | |||
MockAlgorithm algo; | |||
EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _)) | |||
.WillRepeatedly(Return(&algo)); | |||
EXPECT_CALL(mock, get_algorithm_from_desc(_)) | |||
.WillRepeatedly(Return(&algo)); | |||
EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _)) | |||
.WillRepeatedly(Return(0)); | |||
EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _)) | |||
@@ -2318,6 +2439,9 @@ TEST_F(TestWeightPreprocess, PreprocessCalledOnlyOnce) { | |||
EXPECT_CALL(mock, deduce_preprocessed_filter_layout(_, _, _)) | |||
.WillRepeatedly(Return(filter_layout)); | |||
EXPECT_CALL(mock, get_algorithm_from_desc(_)) | |||
.WillRepeatedly(Return(&algo)); | |||
Expectation algo_call = | |||
EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _)) | |||
.WillOnce(Return(&algo)); | |||
@@ -2349,7 +2473,6 @@ TEST_F(TestWeightPreprocess, PreprocessCalledOnlyOnce) { | |||
pf->tensors[0].ptr<float>()[0] = 114.514f; | |||
pf->tensors[1].ptr<float>()[0] = 1926.0817f; | |||
})); | |||
// Run the graph multiple times. | |||
for (int i = 0; i < 3; i++) { | |||
if (i > 0) { | |||
@@ -2381,6 +2504,8 @@ TEST_F(TestNoWeightPreprocess, NoPreprocess) { | |||
MockAlgorithm algo; | |||
EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _)) | |||
.WillRepeatedly(Return(&algo)); | |||
EXPECT_CALL(mock, get_algorithm_from_desc(_)) | |||
.WillRepeatedly(Return(&algo)); | |||
EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _)) | |||
.WillRepeatedly(Return(0)); | |||
EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _)) | |||
@@ -16,157 +16,157 @@ namespace { | |||
const char* EXPECTED_TEXT_OUT_REC[3] = { | |||
// rec level 0 | |||
R"OUTPUT( | |||
var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2 | |||
var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2 | |||
deps: | |||
val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s | |||
var1 produced: name=var1 layout={2(3),3(1)} owner_opr=opr0{Host2DeviceCopy} opr0 | |||
var1 produced: name=var1 layout={2(3),3(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0 | |||
deps: | |||
val: [2.352, 0.1114, -0.2721, 0.7569, -0.2438, ...]min=-0.272 max=2.35 mean=0.471 l2=1.02 sd=0.994 s | |||
var17 produced: name=var17 layout={2(3),3(1)} owner_opr=opr16{Elemwise} opr16 | |||
var17 produced: name=var17 layout={2(3),3(1) Float32} owner_opr=opr16{Elemwise} opr16 | |||
deps: | |||
[i0]var1: [2.352, 0.1114, -0.2721, 0.7569, -0.2438, ...] s | |||
val: [2.352, 0.1114, 0, 0.7569, 0, ...]min=0 max=2.35 mean=0.557 l2=1.01 sd=0.924 s | |||
var11 produced: name=var11 layout={1(3),3(1)} owner_opr=opr10{Subtensor} opr10 | |||
var11 produced: name=var11 layout={1(3),3(1) Float32} owner_opr=opr10{Subtensor} opr10 | |||
deps: | |||
[i0]var1: [2.352, 0.1114, -0.2721, 0.7569, -0.2438, ...] s | |||
[i1]var5: <host value[s]> [0] s | |||
[i2]var7: <host value[s]> [1] s | |||
val: [2.352, 0.1114, -0.2721]min=-0.272 max=2.35 mean=0.731 l2=1.37 sd=1.42 s | |||
var13 produced: name=var13 layout={2(0),3(1)} owner_opr=opr12{Broadcast} opr12 | |||
var13 produced: name=var13 layout={2(0),3(1) Float32} owner_opr=opr12{Broadcast} opr12 | |||
deps: | |||
[i0]var11: [2.352, 0.1114, -0.2721] s | |||
[i1]var9: <host value[s]> [2, 3] s | |||
val: [2.352, 0.1114, -0.2721, 2.352, 0.1114, ...]min=-0.272 max=2.35 mean=0.731 l2=1.37 sd=1.27 s | |||
var15 produced: name=var15 layout={2(3),3(1)} owner_opr=opr14{Elemwise} opr14 | |||
var15 produced: name=var15 layout={2(3),3(1) Float32} owner_opr=opr14{Elemwise} opr14 | |||
deps: | |||
[i0]var3: [2] s | |||
[i1]var13: [2.352, 0.1114, -0.2721, 2.352, 0.1114, ...] s | |||
val: [4.352, 2.111, 1.728, 4.352, 2.111, ...]min=1.73 max=4.35 mean=2.73 l2=2.97 sd=1.27 s | |||
var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18 | |||
var19 produced: name=var19 layout={2(3),3(1) Float32} owner_opr=opr18{Elemwise} opr18 | |||
deps: | |||
[i0]var15: [10.24, 0.2352, 0, 3.294, 0, ...] s | |||
[i1]var17: [2.352, 0.1114, 0, 0.7569, 0, ...] s | |||
val: [10.24, 0.2352, 0, 3.294, 0, ...]min=0 max=10.2 mean=2.33 l2=4.39 sd=4.08 s | |||
var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2 | |||
var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2 | |||
deps: | |||
val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s | |||
var1 produced: name=var1 layout={2(3),3(1)} owner_opr=opr0{Host2DeviceCopy} opr0 | |||
var1 produced: name=var1 layout={2(3),3(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0 | |||
deps: | |||
val: [0.05521, 0.724, 1.134, -0.2697, -1.545, ...]min=-1.54 max=1.13 mean=-0.105 l2=0.895 sd=0.974 s | |||
var17 produced: name=var17 layout={2(3),3(1)} owner_opr=opr16{Elemwise} opr16 | |||
var17 produced: name=var17 layout={2(3),3(1) Float32} owner_opr=opr16{Elemwise} opr16 | |||
deps: | |||
[i0]var1: [0.05521, 0.724, 1.134, -0.2697, -1.545, ...] s | |||
val: [0.05521, 0.724, 1.134, 0, 0, ...]min=0 max=1.13 mean=0.319 l2=0.55 sd=0.491 s | |||
var11 produced: name=var11 layout={1(3),3(1)} owner_opr=opr10{Subtensor} opr10 | |||
var11 produced: name=var11 layout={1(3),3(1) Float32} owner_opr=opr10{Subtensor} opr10 | |||
deps: | |||
[i0]var1: [0.05521, 0.724, 1.134, -0.2697, -1.545, ...] s | |||
[i1]var5: <host value[s]> [0] s | |||
[i2]var7: <host value[s]> [1] s | |||
val: [0.05521, 0.724, 1.134]min=0.0552 max=1.13 mean=0.638 l2=0.778 sd=0.545 s | |||
var13 produced: name=var13 layout={2(0),3(1)} owner_opr=opr12{Broadcast} opr12 | |||
var13 produced: name=var13 layout={2(0),3(1) Float32} owner_opr=opr12{Broadcast} opr12 | |||
deps: | |||
[i0]var11: [0.05521, 0.724, 1.134] s | |||
[i1]var9: <host value[s]> [2, 3] s | |||
val: [0.05521, 0.724, 1.134, 0.05521, 0.724, ...]min=0.0552 max=1.13 mean=0.638 l2=0.778 sd=0.487 s | |||
var15 produced: name=var15 layout={2(3),3(1)} owner_opr=opr14{Elemwise} opr14 | |||
var15 produced: name=var15 layout={2(3),3(1) Float32} owner_opr=opr14{Elemwise} opr14 | |||
deps: | |||
[i0]var3: [2] s | |||
[i1]var13: [0.05521, 0.724, 1.134, 0.05521, 0.724, ...] s | |||
val: [2.055, 2.724, 3.134, 2.055, 2.724, ...]min=2.06 max=3.13 mean=2.64 l2=2.68 sd=0.487 s | |||
var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18 | |||
var19 produced: name=var19 layout={2(3),3(1) Float32} owner_opr=opr18{Elemwise} opr18 | |||
deps: | |||
[i0]var15: [0.1135, 1.972, 3.556, 0, 0, ...] s | |||
[i1]var17: [0.05521, 0.724, 1.134, 0, 0, ...] s | |||
val: [0.1135, 1.972, 3.556, 0, 0, ...]min=0 max=3.56 mean=0.94 l2=1.66 sd=1.5 s | |||
var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2 | |||
var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2 | |||
deps: | |||
val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s | |||
var1 produced: name=var1 layout={2(3),3(1)} owner_opr=opr0{Host2DeviceCopy} opr0 | |||
var1 produced: name=var1 layout={2(3),3(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0 | |||
deps: | |||
val: [-0.5069, 0.4525, 0.1695, -0.02793, -0.1907, ...]min=-0.507 max=1.32 mean=0.203 l2=0.616 sd=0.637 s | |||
var17 produced: name=var17 layout={2(3),3(1)} owner_opr=opr16{Elemwise} opr16 | |||
var17 produced: name=var17 layout={2(3),3(1) Float32} owner_opr=opr16{Elemwise} opr16 | |||
deps: | |||
[i0]var1: [-0.5069, 0.4525, 0.1695, -0.02793, -0.1907, ...] s | |||
val: [0, 0.4525, 0.1695, 0, 0, ...]min=0 max=1.32 mean=0.324 l2=0.574 sd=0.52 s | |||
var11 produced: name=var11 layout={1(3),3(1)} owner_opr=opr10{Subtensor} opr10 | |||
var11 produced: name=var11 layout={1(3),3(1) Float32} owner_opr=opr10{Subtensor} opr10 | |||
deps: | |||
[i0]var1: [-0.5069, 0.4525, 0.1695, -0.02793, -0.1907, ...] s | |||
[i1]var5: <host value[s]> [0] s | |||
[i2]var7: <host value[s]> [1] s | |||
val: [-0.5069, 0.4525, 0.1695]min=-0.507 max=0.453 mean=0.0384 l2=0.404 sd=0.493 s | |||
var13 produced: name=var13 layout={2(0),3(1)} owner_opr=opr12{Broadcast} opr12 | |||
var13 produced: name=var13 layout={2(0),3(1) Float32} owner_opr=opr12{Broadcast} opr12 | |||
deps: | |||
[i0]var11: [-0.5069, 0.4525, 0.1695] s | |||
[i1]var9: <host value[s]> [2, 3] s | |||
val: [-0.5069, 0.4525, 0.1695, -0.5069, 0.4525, ...]min=-0.507 max=0.453 mean=0.0384 l2=0.404 sd=0.441 s | |||
var15 produced: name=var15 layout={2(3),3(1)} owner_opr=opr14{Elemwise} opr14 | |||
var15 produced: name=var15 layout={2(3),3(1) Float32} owner_opr=opr14{Elemwise} opr14 | |||
deps: | |||
[i0]var3: [2] s | |||
[i1]var13: [-0.5069, 0.4525, 0.1695, -0.5069, 0.4525, ...] s | |||
val: [1.493, 2.453, 2.17, 1.493, 2.453, ...]min=1.49 max=2.45 mean=2.04 l2=2.08 sd=0.441 s | |||
var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18 | |||
var19 produced: name=var19 layout={2(3),3(1) Float32} owner_opr=opr18{Elemwise} opr18 | |||
deps: | |||
[i0]var15: [0, 1.11, 0.3678, 0, 0, ...] s | |||
[i1]var17: [0, 0.4525, 0.1695, 0, 0, ...] s | |||
val: [0, 1.11, 0.3678, 0, 0, ...]min=0 max=2.87 mean=0.724 l2=1.26 sd=1.13 s | |||
var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2 | |||
var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2 | |||
deps: | |||
val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s | |||
var1 produced: name=var1 layout={2(3),3(1)} owner_opr=opr0{Host2DeviceCopy} opr0 | |||
var1 produced: name=var1 layout={2(3),3(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0 | |||
deps: | |||
val: [-0.03637, 2.111, 0.3236, -0.4861, -2.071, ...]min=-2.07 max=2.11 mean=0.0589 l2=1.25 sd=1.37 s | |||
var17 produced: name=var17 layout={2(3),3(1)} owner_opr=opr16{Elemwise} opr16 | |||
var17 produced: name=var17 layout={2(3),3(1) Float32} owner_opr=opr16{Elemwise} opr16 | |||
deps: | |||
[i0]var1: [-0.03637, 2.111, 0.3236, -0.4861, -2.071, ...] s | |||
val: [0, 2.111, 0.3236, 0, 0, ...]min=0 max=2.11 mean=0.491 l2=0.897 sd=0.822 s | |||
var11 produced: name=var11 layout={1(3),3(1)} owner_opr=opr10{Subtensor} opr10 | |||
var11 produced: name=var11 layout={1(3),3(1) Float32} owner_opr=opr10{Subtensor} opr10 | |||
deps: | |||
[i0]var1: [-0.03637, 2.111, 0.3236, -0.4861, -2.071, ...] s | |||
[i1]var5: <host value[s]> [0] s | |||
[i2]var7: <host value[s]> [1] s | |||
val: [-0.03637, 2.111, 0.3236]min=-0.0364 max=2.11 mean=0.799 l2=1.23 sd=1.15 s | |||
var13 produced: name=var13 layout={2(0),3(1)} owner_opr=opr12{Broadcast} opr12 | |||
var13 produced: name=var13 layout={2(0),3(1) Float32} owner_opr=opr12{Broadcast} opr12 | |||
deps: | |||
[i0]var11: [-0.03637, 2.111, 0.3236] s | |||
[i1]var9: <host value[s]> [2, 3] s | |||
val: [-0.03637, 2.111, 0.3236, -0.03637, 2.111, ...]min=-0.0364 max=2.11 mean=0.799 l2=1.23 sd=1.03 s | |||
var15 produced: name=var15 layout={2(3),3(1)} owner_opr=opr14{Elemwise} opr14 | |||
var15 produced: name=var15 layout={2(3),3(1) Float32} owner_opr=opr14{Elemwise} opr14 | |||
deps: | |||
[i0]var3: [2] s | |||
[i1]var13: [-0.03637, 2.111, 0.3236, -0.03637, 2.111, ...] s | |||
val: [1.964, 4.111, 2.324, 1.964, 4.111, ...]min=1.96 max=4.11 mean=2.8 l2=2.95 sd=1.03 s | |||
var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18 | |||
var19 produced: name=var19 layout={2(3),3(1) Float32} owner_opr=opr18{Elemwise} opr18 | |||
deps: | |||
[i0]var15: [0, 8.675, 0.7518, 0, 0, ...] s | |||
[i1]var17: [0, 2.111, 0.3236, 0, 0, ...] s | |||
val: [0, 8.675, 0.7518, 0, 0, ...]min=0 max=8.68 mean=1.77 l2=3.59 sd=3.42 s | |||
var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2 | |||
var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2 | |||
deps: | |||
val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s | |||
var1 produced: name=var1 layout={5(4),4(1)} owner_opr=opr0{Host2DeviceCopy} opr0 | |||
var1 produced: name=var1 layout={5(4),4(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0 | |||
deps: | |||
val: [-1.199, -1.02, 1.098, -1.472, -0.3848, ...]min=-2.24 max=1.25 mean=-0.347 l2=1.04 sd=1.01 s | |||
var17 produced: name=var17 layout={5(4),4(1)} owner_opr=opr16{Elemwise} opr16 | |||
var17 produced: name=var17 layout={5(4),4(1) Float32} owner_opr=opr16{Elemwise} opr16 | |||
deps: | |||
[i0]var1: [-1.199, -1.02, 1.098, -1.472, -0.3848, ...] s | |||
val: [0, 0, 1.098, 0, 0, ...]min=0 max=1.25 mean=0.262 l2=0.471 sd=0.402 s | |||
var11 produced: name=var11 layout={1(4),4(1)} owner_opr=opr10{Subtensor} opr10 | |||
var11 produced: name=var11 layout={1(4),4(1) Float32} owner_opr=opr10{Subtensor} opr10 | |||
deps: | |||
[i0]var1: [-1.199, -1.02, 1.098, -1.472, -0.3848, ...] s | |||
[i1]var5: <host value[s]> [0] s | |||
[i2]var7: <host value[s]> [1] s | |||
val: [-1.199, -1.02, 1.098, -1.472]min=-1.47 max=1.1 mean=-0.648 l2=1.21 sd=1.18 s | |||
var13 produced: name=var13 layout={5(0),4(1)} owner_opr=opr12{Broadcast} opr12 | |||
var13 produced: name=var13 layout={5(0),4(1) Float32} owner_opr=opr12{Broadcast} opr12 | |||
deps: | |||
[i0]var11: [-1.199, -1.02, 1.098, -1.472] s | |||
[i1]var9: <host value[s]> [5, 4] s | |||
val: [-1.199, -1.02, 1.098, -1.472, -1.199, ...]min=-1.47 max=1.1 mean=-0.648 l2=1.21 sd=1.05 s | |||
var15 produced: name=var15 layout={5(4),4(1)} owner_opr=opr14{Elemwise} opr14 | |||
var15 produced: name=var15 layout={5(4),4(1) Float32} owner_opr=opr14{Elemwise} opr14 | |||
deps: | |||
[i0]var3: [2] s | |||
[i1]var13: [-1.199, -1.02, 1.098, -1.472, -1.199, ...] s | |||
val: [0.8006, 0.9802, 3.098, 0.5279, 0.8006, ...]min=0.528 max=3.1 mean=1.35 l2=1.69 sd=1.05 s | |||
var19 produced: name=var19 layout={5(4),4(1)} owner_opr=opr18{Elemwise} opr18 | |||
var19 produced: name=var19 layout={5(4),4(1) Float32} owner_opr=opr18{Elemwise} opr18 | |||
deps: | |||
[i0]var15: [0, 0, 3.401, 0, 0, ...] s | |||
[i1]var17: [0, 0, 1.098, 0, 0, ...] s | |||
@@ -176,33 +176,33 @@ var19 produced: name=var19 layout={5(4),4(1)} owner_opr=opr18{Elemwise} opr18 | |||
// rec level 1 | |||
R"OUTPUT( | |||
==== begin lazy value recording | |||
var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2 | |||
var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2 | |||
deps: | |||
val: <see lazy value below> s | |||
var1 produced: name=var1 layout={2(3),3(1)} owner_opr=opr0{Host2DeviceCopy} opr0 | |||
var1 produced: name=var1 layout={2(3),3(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0 | |||
deps: | |||
val: <see lazy value below> s | |||
var17 produced: name=var17 layout={2(3),3(1)} owner_opr=opr16{Elemwise} opr16 | |||
var17 produced: name=var17 layout={2(3),3(1) Float32} owner_opr=opr16{Elemwise} opr16 | |||
deps: | |||
[i0]var1: <see lazy value below> s | |||
val: <see lazy value below> s | |||
var11 produced: name=var11 layout={1(3),3(1)} owner_opr=opr10{Subtensor} opr10 | |||
var11 produced: name=var11 layout={1(3),3(1) Float32} owner_opr=opr10{Subtensor} opr10 | |||
deps: | |||
[i0]var1: <see lazy value below> s | |||
[i1]var5: <host value[s]> [0] s | |||
[i2]var7: <host value[s]> [1] s | |||
val: <see lazy value below> s | |||
var13 produced: name=var13 layout={2(0),3(1)} owner_opr=opr12{Broadcast} opr12 | |||
var13 produced: name=var13 layout={2(0),3(1) Float32} owner_opr=opr12{Broadcast} opr12 | |||
deps: | |||
[i0]var11: <see lazy value below> s | |||
[i1]var9: <host value[s]> [2, 3] s | |||
val: <see lazy value below> s | |||
var15 produced: name=var15 layout={2(3),3(1)} owner_opr=opr14{Elemwise} opr14 | |||
var15 produced: name=var15 layout={2(3),3(1) Float32} owner_opr=opr14{Elemwise} opr14 | |||
deps: | |||
[i0]var3: <see lazy value below> s | |||
[i1]var13: <see lazy value below> s | |||
val: <see lazy value below> s | |||
var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18 | |||
var19 produced: name=var19 layout={2(3),3(1) Float32} owner_opr=opr18{Elemwise} opr18 | |||
deps: | |||
[i0]var15: <see lazy value below> s | |||
[i1]var17: <see lazy value below> s | |||
@@ -242,33 +242,33 @@ var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18 | |||
// rec level 2 | |||
R"OUTPUT( | |||
==== begin lazy value recording | |||
var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2 | |||
var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2 | |||
deps: | |||
val: <see lazy value below> s | |||
var1 produced: name=var1 layout={2(3),3(1)} owner_opr=opr0{Host2DeviceCopy} opr0 | |||
var1 produced: name=var1 layout={2(3),3(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0 | |||
deps: | |||
val: <see lazy value below> s | |||
var17 produced: name=var17 layout={2(3),3(1)} owner_opr=opr16{Elemwise} opr16 | |||
var17 produced: name=var17 layout={2(3),3(1) Float32} owner_opr=opr16{Elemwise} opr16 | |||
deps: | |||
[i0]var1: <see lazy value below> s | |||
val: <see lazy value below> s | |||
var11 produced: name=var11 layout={1(3),3(1)} owner_opr=opr10{Subtensor} opr10 | |||
var11 produced: name=var11 layout={1(3),3(1) Float32} owner_opr=opr10{Subtensor} opr10 | |||
deps: | |||
[i0]var1: <see lazy value below> s | |||
[i1]var5: <host value[s]> [0] s | |||
[i2]var7: <host value[s]> [1] s | |||
val: <see lazy value below> s | |||
var13 produced: name=var13 layout={2(0),3(1)} owner_opr=opr12{Broadcast} opr12 | |||
var13 produced: name=var13 layout={2(0),3(1) Float32} owner_opr=opr12{Broadcast} opr12 | |||
deps: | |||
[i0]var11: <see lazy value below> s | |||
[i1]var9: <host value[s]> [2, 3] s | |||
val: <see lazy value below> s | |||
var15 produced: name=var15 layout={2(3),3(1)} owner_opr=opr14{Elemwise} opr14 | |||
var15 produced: name=var15 layout={2(3),3(1) Float32} owner_opr=opr14{Elemwise} opr14 | |||
deps: | |||
[i0]var3: <see lazy value below> s | |||
[i1]var13: <see lazy value below> s | |||
val: <see lazy value below> s | |||
var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18 | |||
var19 produced: name=var19 layout={2(3),3(1) Float32} owner_opr=opr18{Elemwise} opr18 | |||
deps: | |||
[i0]var15: <see lazy value below> s | |||
[i1]var17: <see lazy value below> s | |||