GitOrigin-RevId: 743846f645
tags/v1.3.0
@@ -330,6 +330,8 @@ struct TensorLayout : public TensorShape { | |||||
/* =================== properties =================== */ | /* =================== properties =================== */ | ||||
std::string to_string() const; | std::string to_string() const; | ||||
std::string serialize() const; | |||||
#endif // MEGDNN_CC_HOST | #endif // MEGDNN_CC_HOST | ||||
/*! | /*! | ||||
@@ -11,6 +11,7 @@ | |||||
*/ | */ | ||||
#pragma once | #pragma once | ||||
#include <type_traits> | |||||
#include "megdnn/basic_types.h" | #include "megdnn/basic_types.h" | ||||
#include "megdnn/handle.h" | #include "megdnn/handle.h" | ||||
@@ -144,8 +145,11 @@ public: | |||||
return {{handle_type(), type(), param()}, name(), is_reproducible()}; | return {{handle_type(), type(), param()}, name(), is_reproducible()}; | ||||
} | } | ||||
//! descriptor of this algorithm, built from handle_type(), type() and param()
Info::Desc desc() const {
    return Info::Desc{handle_type(), type(), param()};
}
template <typename T> | template <typename T> | ||||
static void serialize_write_pod(const T& val, std::string& result) { | static void serialize_write_pod(const T& val, std::string& result) { | ||||
static_assert(std::is_standard_layout<T>::value, "invalid type"); | |||||
result.append(reinterpret_cast<const char*>(&val), sizeof(T)); | result.append(reinterpret_cast<const char*>(&val), sizeof(T)); | ||||
} | } | ||||
@@ -155,6 +159,7 @@ public: | |||||
template <typename T> | template <typename T> | ||||
static T deserialize_read_pod(const std::string& data, size_t offset = 0) { | static T deserialize_read_pod(const std::string& data, size_t offset = 0) { | ||||
static_assert(std::is_standard_layout<T>::value, "invalid type"); | |||||
T ret; | T ret; | ||||
//! A pointer to an object or incomplete type may be converted to a | //! A pointer to an object or incomplete type may be converted to a | ||||
//! pointer to a different object or incomplete type. If the resulting | //! pointer to a different object or incomplete type. If the resulting | ||||
@@ -167,10 +172,69 @@ public: | |||||
return ret; | return ret; | ||||
} | } | ||||
/*!
 * \brief read one value of type \p T out of a raw byte buffer
 *
 * The bytes are copied with memcpy rather than read through a
 * reinterpret_cast'ed pointer: \p data + \p offset is not guaranteed to be
 * suitably aligned for \p T, and dereferencing a misaligned pointer is
 * undefined behavior.
 *
 * \param data start of the buffer; at least \p offset + sizeof(T) bytes must
 *      be readable (not checked here)
 * \param offset byte offset of the value inside \p data
 * \return the decoded value
 */
template <typename T>
static T deserialize_read_pod(const char* data, size_t offset = 0) {
    static_assert(std::is_standard_layout<T>::value, "invalid type");
    const char* const src = data + offset;
    T value;
    memcpy(&value, src, sizeof(T));
    return value;
}
//! operator type tag stored in SearchItem::opr_type; enumerator values are
//! consecutive and start at 0, so the declaration order must not be changed
enum class OprType : uint32_t {
    //! matrix multiplication
    MATRIX_MUL_FORWARD = 0,
    BATCHED_MATRIX_MUL_FORWARD,

    //! convolution
    CONVOLUTION_FORWARD,
    CONVOLUTION_BACKWARD_DATA,
    CONVOLUTION_BACKWARD_FILTER,

    //! 3D convolution
    CONVOLUTION3D_FORWARD,
    CONVOLUTION3D_BACKWARD_DATA,
    CONVOLUTION3D_BACKWARD_FILTER,

    //! local share
    LOCAL_SHARE_FORWARD,
    LOCAL_SHARE_BACKWARD_DATA,
    LOCAL_SHARE_BACKWARD_FILTER,

    //! deformable convolution
    DEFORMABLE_CONV_FORWARD,
    DEFORMABLE_CONV_BACKWARD_DATA,
    DEFORMABLE_CONV_BACKWARD_FILTER,

    //! conv-bias and batch conv
    CONVBIAS_FORWARD,
    BATCH_CONV_FORWARD,
};
struct SearchItem { | |||||
OprType opr_type; | |||||
//! serialized param | |||||
std::string param; | |||||
TensorLayoutArray layouts; | |||||
}; | |||||
/** | |||||
* \brief get subopr list of the algo | |||||
* | |||||
* \param layouts origin layouts of the parent opr | |||||
* \param opr parent opr | |||||
*/ | |||||
virtual std::vector<SearchItem> get_subopr_list(const TensorLayoutArray&, | |||||
const OperatorBase*) const { | |||||
return {}; | |||||
} | |||||
protected: | protected: | ||||
Handle::HandleType m_handle_type = Handle::HandleType::NAIVE; | Handle::HandleType m_handle_type = Handle::HandleType::NAIVE; | ||||
}; | }; | ||||
//! policy for executing the operator | |||||
struct ExecutionPolicy { | |||||
//! INVALID_ALGO_TYPE algo_type means using heuristic | |||||
Algorithm::Info::Desc algo; | |||||
std::vector<ExecutionPolicy> sub_policy; | |||||
}; | |||||
/*! | /*! | ||||
* \brief define Algorithm and ExecutionPolicy for oprs that have | * \brief define Algorithm and ExecutionPolicy for oprs that have | ||||
* multiple impl algos | * multiple impl algos | ||||
@@ -198,12 +262,6 @@ public: | |||||
*/ | */ | ||||
virtual const char* get_algorithm_set_name() const = 0; | virtual const char* get_algorithm_set_name() const = 0; | ||||
//! policy for executing the operator | |||||
struct ExecutionPolicy { | |||||
//! INVALID_ALGO_TYPE algo_type means using heuristic | |||||
AlgorithmInfo algo; | |||||
}; | |||||
ExecutionPolicy& execution_policy() { return m_execution_policy; } | ExecutionPolicy& execution_policy() { return m_execution_policy; } | ||||
const ExecutionPolicy& execution_policy() const { | const ExecutionPolicy& execution_policy() const { | ||||
@@ -464,6 +522,9 @@ protected: | |||||
bool reproducible = false) = 0; | bool reproducible = false) = 0; | ||||
}; | }; | ||||
} // namespace detail | } // namespace detail | ||||
using Algorithm = detail::Algorithm; | |||||
using ExecutionPolicy = detail::ExecutionPolicy; | |||||
} // namespace megdnn | } // namespace megdnn | ||||
#include "megdnn/internal/visibility_epilogue.h" | #include "megdnn/internal/visibility_epilogue.h" | ||||
@@ -25,17 +25,17 @@ namespace megdnn { | |||||
*/ | */ | ||||
template <class Opr, typename... Args> | template <class Opr, typename... Args> | ||||
typename Opr::AlgoBase* get_algorithm(Opr* opr, Args&&... args) { | typename Opr::AlgoBase* get_algorithm(Opr* opr, Args&&... args) { | ||||
typename Opr::AlgorithmInfo ret; | |||||
typename Opr::AlgorithmDesc ret; | |||||
auto set = opr->execution_policy().algo; | auto set = opr->execution_policy().algo; | ||||
if (set.valid()) { | if (set.valid()) { | ||||
ret = set; | ret = set; | ||||
} else { | } else { | ||||
ret = opr->get_algorithm_info_heuristic( | ret = opr->get_algorithm_info_heuristic( | ||||
std::forward<Args>(args)..., std::numeric_limits<size_t>::max(), | std::forward<Args>(args)..., std::numeric_limits<size_t>::max(), | ||||
false); | |||||
false).desc; | |||||
} | } | ||||
return static_cast<typename Opr::AlgoBase*>( | return static_cast<typename Opr::AlgoBase*>( | ||||
opr->get_algorithm_from_desc(ret.desc)); | |||||
opr->get_algorithm_from_desc(ret)); | |||||
} | } | ||||
/*! | /*! | ||||
@@ -46,7 +46,7 @@ template <class Opr, typename... Args> | |||||
typename Opr::AlgoBase* get_algorithm_or_construct(Opr* opr, Args&&... args) { | typename Opr::AlgoBase* get_algorithm_or_construct(Opr* opr, Args&&... args) { | ||||
auto set = opr->execution_policy().algo; | auto set = opr->execution_policy().algo; | ||||
if (set.valid()) { | if (set.valid()) { | ||||
return opr->algo_pack().construct_and_get_algo(set.desc); | |||||
return opr->algo_pack().construct_and_get_algo(set); | |||||
} else { | } else { | ||||
return static_cast<typename Opr::AlgoBase*>( | return static_cast<typename Opr::AlgoBase*>( | ||||
opr->get_algorithm_heuristic(std::forward<Args>(args)..., | opr->get_algorithm_heuristic(std::forward<Args>(args)..., | ||||
@@ -20,6 +20,7 @@ | |||||
#include <mutex> | #include <mutex> | ||||
#include <numeric> | #include <numeric> | ||||
#include <tuple> | #include <tuple> | ||||
#include <type_traits> | |||||
using namespace megdnn; | using namespace megdnn; | ||||
@@ -35,6 +36,26 @@ class DefaultErrorHandler final : public ErrorHandler { | |||||
#endif | #endif | ||||
} | } | ||||
}; | }; | ||||
//! append the object representation (sizeof(T) raw bytes) of \p val to
//! \p result; existing content of \p result is kept
template <typename T>
void serialize_pod(const T& val, std::string& result) {
    static_assert(std::is_standard_layout<T>::value, "invalid type");
    const char* const bytes = reinterpret_cast<const char*>(&val);
    result.append(bytes, sizeof(T));
}
/*!
 * \brief append the raw bytes of \p size consecutive elements to \p result
 *
 * \param val pointer to the first element; may be null only if \p size is 0
 * \param size number of elements (not bytes) to serialize
 * \param result output buffer; existing content is kept
 */
template <typename T>
void serialize_vec(const T* val, size_t size, std::string& result) {
    // same type restriction as serialize_pod: the elements are dumped as-is
    static_assert(std::is_standard_layout<T>::value, "invalid type");
    if (size == 0) {
        // nothing to write; also avoids touching a possibly-null pointer
        return;
    }
    result.append(reinterpret_cast<const char*>(val), sizeof(T) * size);
}
/*!
 * \brief read one value of type \p T from \p data and advance \p offset
 *
 * The bytes are copied with memcpy, so the source position does not need to
 * be aligned for \p T.
 *
 * \param data buffer produced by serialize_pod / serialize_vec
 * \param offset [in,out] byte position to read from; increased by sizeof(T)
 *      on return. The caller must guarantee
 *      offset + sizeof(T) <= data.size(); this is not checked here.
 * \return the decoded value
 */
template <typename T>
T deserialize_pod(const std::string& data, size_t& offset) {
    // same type restriction as serialize_pod, which writes these bytes
    static_assert(std::is_standard_layout<T>::value, "invalid type");
    T ret;
    memcpy(&ret, data.data() + offset, sizeof(T));
    offset += sizeof(T);
    return ret;
}
} // namespace | } // namespace | ||||
ErrorHandler* ErrorHandler::sm_inst; | ErrorHandler* ErrorHandler::sm_inst; | ||||
@@ -126,17 +147,23 @@ bool TensorShape::eq_shape(const TensorShape& rhs) const { | |||||
size_t eq = 0; | size_t eq = 0; | ||||
switch (ndim) { | switch (ndim) { | ||||
case 7: | case 7: | ||||
eq += shape[6] == rhs.shape[6]; MEGDNN_FALLTHRU | |||||
eq += shape[6] == rhs.shape[6]; | |||||
MEGDNN_FALLTHRU | |||||
case 6: | case 6: | ||||
eq += shape[5] == rhs.shape[5]; MEGDNN_FALLTHRU | |||||
eq += shape[5] == rhs.shape[5]; | |||||
MEGDNN_FALLTHRU | |||||
case 5: | case 5: | ||||
eq += shape[4] == rhs.shape[4]; MEGDNN_FALLTHRU | |||||
eq += shape[4] == rhs.shape[4]; | |||||
MEGDNN_FALLTHRU | |||||
case 4: | case 4: | ||||
eq += shape[3] == rhs.shape[3]; MEGDNN_FALLTHRU | |||||
eq += shape[3] == rhs.shape[3]; | |||||
MEGDNN_FALLTHRU | |||||
case 3: | case 3: | ||||
eq += shape[2] == rhs.shape[2]; MEGDNN_FALLTHRU | |||||
eq += shape[2] == rhs.shape[2]; | |||||
MEGDNN_FALLTHRU | |||||
case 2: | case 2: | ||||
eq += shape[1] == rhs.shape[1]; MEGDNN_FALLTHRU | |||||
eq += shape[1] == rhs.shape[1]; | |||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
eq += shape[0] == rhs.shape[0]; | eq += shape[0] == rhs.shape[0]; | ||||
} | } | ||||
@@ -435,8 +462,8 @@ bool TensorLayout::try_reshape(TensorLayout& result, | |||||
for (size_t i = 0; i < tshp.ndim; ++i) { | for (size_t i = 0; i < tshp.ndim; ++i) { | ||||
if (!tshp.shape[i]) { | if (!tshp.shape[i]) { | ||||
megdnn_throw_if(!format.is_default(), tensor_reshape_error, | megdnn_throw_if(!format.is_default(), tensor_reshape_error, | ||||
megdnn_mangle(ssprintf("bad target tshp: %s", | |||||
tshp.to_string().c_str()))); | |||||
megdnn_mangle(ssprintf("bad target tshp: %s", | |||||
tshp.to_string().c_str()))); | |||||
is_empty_shape = true; | is_empty_shape = true; | ||||
break; | break; | ||||
} | } | ||||
@@ -510,8 +537,36 @@ std::string TensorLayout::to_string() const { | |||||
rst.append(" @ "); | rst.append(" @ "); | ||||
rst.append(format.impl()->to_string()); | rst.append(format.impl()->to_string()); | ||||
} | } | ||||
rst.append(std::string(" ") + dtype.name()); | |||||
rst.append("}"); | rst.append("}"); | ||||
return rst; | return rst; | ||||
} | } | ||||
std::string TensorLayout::serialize() const { | |||||
std::string rst; | |||||
serialize_pod<size_t>(ndim, rst); | |||||
serialize_vec<size_t>(shape, ndim, rst); | |||||
serialize_vec<ptrdiff_t>(stride, ndim, rst); | |||||
rst.append(format.impl()->to_string()); | |||||
//! serialize dtype | |||||
serialize_pod(dtype.enumv(), rst); | |||||
if (dtype.has_param()) { | |||||
switch (dtype.enumv()) { | |||||
#define cb(_dt) \ | |||||
case DTypeTrait<dtype::_dt>::enumv: \ | |||||
serialize_pod(dtype::_dt::downcast_from(dtype).param(), rst); \ | |||||
break; | |||||
MEGDNN_FOREACH_PARAMETERIZED_DTYPE(cb) | |||||
#undef cb | |||||
default: | |||||
megdnn_assert(false, | |||||
"cannot serialize unknown parameterized DType"); | |||||
break; | |||||
} | |||||
} | |||||
return rst; | |||||
} | |||||
// vim: syntax=cpp.doxygen | // vim: syntax=cpp.doxygen |
@@ -24,7 +24,7 @@ bool BatchedMatrixMulForwardImpl::AlgoBruteForce::is_available( | |||||
const SizeArgs& args) const { | const SizeArgs& args) const { | ||||
MatrixMulForwardImpl mm{args.opr->handle()}; | MatrixMulForwardImpl mm{args.opr->handle()}; | ||||
mm.param() = {args.opr->param().transposeA, args.opr->param().transposeB}; | mm.param() = {args.opr->param().transposeA, args.opr->param().transposeB}; | ||||
mm.execution_policy() = {m_algorithm->info()}; | |||||
mm.execution_policy() = {m_algorithm->desc(), {}}; | |||||
auto mm_layout_a = args.layout_a.remove_axis(0); | auto mm_layout_a = args.layout_a.remove_axis(0); | ||||
auto mm_layout_b = args.layout_b.remove_axis(0); | auto mm_layout_b = args.layout_b.remove_axis(0); | ||||
@@ -39,7 +39,7 @@ size_t BatchedMatrixMulForwardImpl::AlgoBruteForce::get_workspace_in_bytes( | |||||
auto mm_opr = args.opr->handle()->create_operator<MatrixMulForward>(); | auto mm_opr = args.opr->handle()->create_operator<MatrixMulForward>(); | ||||
mm_opr->param() = {args.opr->param().transposeA, | mm_opr->param() = {args.opr->param().transposeA, | ||||
args.opr->param().transposeB}; | args.opr->param().transposeB}; | ||||
mm_opr->execution_policy() = {m_algorithm->info()}; | |||||
mm_opr->execution_policy() = {m_algorithm->desc(), {}}; | |||||
return mm_opr->get_workspace_in_bytes(args.layout_a, args.layout_b, | return mm_opr->get_workspace_in_bytes(args.layout_a, args.layout_b, | ||||
args.layout_c); | args.layout_c); | ||||
@@ -50,7 +50,7 @@ void BatchedMatrixMulForwardImpl::AlgoBruteForce::exec( | |||||
auto&& mm_opr = args.opr->handle()->create_operator<MatrixMulForward>(); | auto&& mm_opr = args.opr->handle()->create_operator<MatrixMulForward>(); | ||||
mm_opr->param() = {args.opr->param().transposeA, | mm_opr->param() = {args.opr->param().transposeA, | ||||
args.opr->param().transposeB}; | args.opr->param().transposeB}; | ||||
mm_opr->execution_policy() = {m_algorithm->info()}; | |||||
mm_opr->execution_policy() = {m_algorithm->desc(), {}}; | |||||
rep(n, N) { | rep(n, N) { | ||||
TensorND A_, B_, C_; | TensorND A_, B_, C_; | ||||
auto tensor_n_from_batch = [n](const TensorND& in, TensorND& out) { | auto tensor_n_from_batch = [n](const TensorND& in, TensorND& out) { | ||||
@@ -47,7 +47,7 @@ ConvBiasForwardImpl::AlgoBFloat16::float_args( | |||||
change_dtype(fdst); | change_dtype(fdst); | ||||
opr->param() = args.opr->param(); | opr->param() = args.opr->param(); | ||||
opr->param().compute_mode = Param::ComputeMode::DEFAULT; | opr->param().compute_mode = Param::ComputeMode::DEFAULT; | ||||
opr->execution_policy() = {m_impl->info()}; | |||||
opr->execution_policy() = {m_impl->desc(), {}}; | |||||
return SizeArgs(opr, fsrc, ffilter, fbias, fz, fdst); | return SizeArgs(opr, fsrc, ffilter, fbias, fz, fdst); | ||||
} | } | ||||
@@ -110,7 +110,7 @@ void ConvBiasForwardImpl::AlgoBFloat16::exec(const ExecArgs& args) const { | |||||
auto convbias_opr = args.handle->create_operator<ConvBias>(); | auto convbias_opr = args.handle->create_operator<ConvBias>(); | ||||
convbias_opr->param() = args.opr->param(); | convbias_opr->param() = args.opr->param(); | ||||
convbias_opr->param().compute_mode = Param::ComputeMode::DEFAULT; | convbias_opr->param().compute_mode = Param::ComputeMode::DEFAULT; | ||||
convbias_opr->execution_policy() = {m_impl->info()}; | |||||
convbias_opr->execution_policy() = {m_impl->desc(), {}}; | |||||
convbias_opr->exec(fsrc_tensor, ffilter_tensor, fbias_tensor, fz_tensor, | convbias_opr->exec(fsrc_tensor, ffilter_tensor, fbias_tensor, fz_tensor, | ||||
fdst_tensor, nullptr, cvter.workspace()); | fdst_tensor, nullptr, cvter.workspace()); | ||||
} | } | ||||
@@ -46,12 +46,8 @@ ConvolutionBackwardDataImpl::AlgoPack::AlgoPack() { | |||||
megdnn_assert(all_algos_data == all_algos.data()); | megdnn_assert(all_algos_data == all_algos.data()); | ||||
non_cudnn_algos.push_back(all_algos.rbegin()[0]); // group matmul | non_cudnn_algos.push_back(all_algos.rbegin()[0]); // group matmul | ||||
size_t algo_size = all_algos.size(); | |||||
for (size_t i=0; i<algo_size; ++i) { | |||||
bfloat16_refhold.emplace_back(new AlgoBFloat16(all_algos[i])); | |||||
all_algos.push_back(bfloat16_refhold.back().get()); | |||||
bfloat16_algos.push_back(bfloat16_refhold.back().get()); | |||||
} | |||||
all_algos.push_back(&bfloat16); | |||||
bfloat16_algos.push_back(&bfloat16); | |||||
for (auto&& algo : all_algos) { | for (auto&& algo : all_algos) { | ||||
m_all_algos_map.emplace(algo->info().desc, algo); | m_all_algos_map.emplace(algo->info().desc, algo); | ||||
@@ -170,28 +170,22 @@ public: | |||||
class ConvolutionBackwardDataImpl::AlgoBFloat16 final : public AlgoBase { | class ConvolutionBackwardDataImpl::AlgoBFloat16 final : public AlgoBase { | ||||
public: | public: | ||||
AlgoBFloat16(ConvolutionBackwardDataImpl::AlgoBase*); | |||||
bool is_available(const SizeArgs& args) const override; | bool is_available(const SizeArgs& args) const override; | ||||
size_t get_workspace_in_bytes(const SizeArgs& args) const override; | size_t get_workspace_in_bytes(const SizeArgs& args) const override; | ||||
void exec(const ExecArgs& args) const override; | void exec(const ExecArgs& args) const override; | ||||
const char* name() const override { return m_name.c_str(); } | |||||
std::vector<SearchItem> get_subopr_list( | |||||
const TensorLayoutArray& layouts, | |||||
const OperatorBase* opr) const override; | |||||
const char* name() const override { | |||||
return "CONVOLUTION_BACKWARD_DATD_BFLOAT16"; | |||||
} | |||||
bool is_reproducible() const override { return true; } | bool is_reproducible() const override { return true; } | ||||
private: | private: | ||||
std::string m_name; | |||||
ConvolutionBackwardDataImpl::AlgoBase* m_algorithm = nullptr; | |||||
SizeArgs float_args(const SizeArgs& args, ConvolutionBackwardDataImpl* opr, | |||||
TensorLayout& fsrc, TensorLayout& ffilter, | |||||
TensorLayout& fdst) const; | |||||
WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const; | WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const; | ||||
MEGDNN_DECL_ALGO_TYPE(CUDA_BFLOAT16) | MEGDNN_DECL_ALGO_TYPE(CUDA_BFLOAT16) | ||||
std::string param() const override { | |||||
std::string ret; | |||||
serialize_write_pod(m_algorithm, ret); | |||||
return ret; | |||||
} | |||||
}; | }; | ||||
//! implement group conv by another algo | //! implement group conv by another algo | ||||
@@ -237,7 +231,7 @@ public: | |||||
AlgoChanwiseSmall chanwise_small; | AlgoChanwiseSmall chanwise_small; | ||||
std::vector<AlgoGroupConvGeneral> gconv; | std::vector<AlgoGroupConvGeneral> gconv; | ||||
std::unordered_map<AlgoBase*, AlgoGroupConvGeneral*> algo2gconv; | std::unordered_map<AlgoBase*, AlgoGroupConvGeneral*> algo2gconv; | ||||
std::vector<std::unique_ptr<AlgoBFloat16>> bfloat16_refhold; | |||||
AlgoBFloat16 bfloat16; | |||||
std::vector<AlgoBase*> | std::vector<AlgoBase*> | ||||
//! all algorithms | //! all algorithms | ||||
@@ -17,33 +17,39 @@ using namespace megdnn; | |||||
using namespace cuda; | using namespace cuda; | ||||
using namespace convolution; | using namespace convolution; | ||||
ConvolutionBackwardDataImpl::AlgoBFloat16::AlgoBFloat16( | |||||
ConvolutionBackwardDataImpl::AlgoBase* algorithm) | |||||
: m_algorithm(algorithm) { | |||||
megdnn_assert_internal(algorithm); | |||||
m_name = ssprintf("CONVOLUTION_BACKWARD_DATD_BFLOAT16:%s", | |||||
m_algorithm->name()); | |||||
} | |||||
ConvolutionBackwardDataImpl::AlgoBase::SizeArgs | |||||
ConvolutionBackwardDataImpl::AlgoBFloat16::float_args( | |||||
const SizeArgs& args, ConvolutionBackwardDataImpl* opr, | |||||
TensorLayout& ffilter, TensorLayout& fdiff, TensorLayout& fgrad) const { | |||||
ffilter = *args.filter_layout; | |||||
fdiff = *args.diff_layout; | |||||
fgrad = *args.grad_layout; | |||||
namespace { | |||||
std::pair<TensorLayoutArray, ConvolutionBackwardDataImpl::Param> sub_opr_config( | |||||
const TensorLayoutArray& layouts, | |||||
const ConvolutionBackwardDataImpl* opr) { | |||||
megdnn_assert(layouts.size() >= 3); | |||||
std::pair<TensorLayoutArray, ConvolutionBackwardDataImpl::Param> ret; | |||||
ret.first = layouts; | |||||
auto change_dtype = [](TensorLayout& layout) { | auto change_dtype = [](TensorLayout& layout) { | ||||
if (layout.dtype == dtype::BFloat16()) { | if (layout.dtype == dtype::BFloat16()) { | ||||
layout.dtype = dtype::Float32(); | layout.dtype = dtype::Float32(); | ||||
} | } | ||||
}; | }; | ||||
change_dtype(ffilter); | |||||
change_dtype(fdiff); | |||||
change_dtype(fgrad); | |||||
opr->param() = args.opr->param(); | |||||
opr->param().compute_mode = Param::ComputeMode::DEFAULT; | |||||
opr->execution_policy() = {m_algorithm->info()}; | |||||
return SizeArgs(opr, ffilter, fdiff, fgrad); | |||||
change_dtype(ret.first[0]); | |||||
change_dtype(ret.first[1]); | |||||
change_dtype(ret.first[2]); | |||||
ret.second = opr->param(); | |||||
ret.second.compute_mode = | |||||
ConvolutionBackwardData::Param::ComputeMode::DEFAULT; | |||||
return ret; | |||||
} | |||||
} | |||||
std::vector<Algorithm::SearchItem> | |||||
ConvolutionBackwardDataImpl::AlgoBFloat16::get_subopr_list( | |||||
const TensorLayoutArray& layouts, const OperatorBase* opr) const { | |||||
auto&& config = sub_opr_config( | |||||
layouts, static_cast<const ConvolutionBackwardDataImpl*>(opr)); | |||||
std::string param_str; | |||||
Algorithm::serialize_write_pod(config.second, param_str); | |||||
return {{Algorithm::OprType::CONVOLUTION_BACKWARD_DATA, param_str, | |||||
config.first}}; | |||||
} | } | ||||
bool ConvolutionBackwardDataImpl::AlgoBFloat16::is_available( | bool ConvolutionBackwardDataImpl::AlgoBFloat16::is_available( | ||||
@@ -51,24 +57,30 @@ bool ConvolutionBackwardDataImpl::AlgoBFloat16::is_available( | |||||
TensorLayout ffilter, fdiff, fgrad; | TensorLayout ffilter, fdiff, fgrad; | ||||
auto conv_back_data_opr = | auto conv_back_data_opr = | ||||
args.handle->create_operator<ConvolutionBackwardData>(); | args.handle->create_operator<ConvolutionBackwardData>(); | ||||
SizeArgs fargs = float_args( | |||||
args, | |||||
static_cast<ConvolutionBackwardDataImpl*>(conv_back_data_opr.get()), | |||||
ffilter, fdiff, fgrad); | |||||
auto&& config = sub_opr_config( | |||||
{*args.filter_layout, *args.diff_layout, *args.grad_layout}, | |||||
args.opr); | |||||
conv_back_data_opr->param() = config.second; | |||||
return args.diff_layout->dtype == args.filter_layout->dtype && | return args.diff_layout->dtype == args.filter_layout->dtype && | ||||
args.diff_layout->dtype == dtype::BFloat16() && | args.diff_layout->dtype == dtype::BFloat16() && | ||||
m_algorithm->is_available(fargs); | |||||
get_algorithm(static_cast<ConvolutionBackwardDataImpl*>( | |||||
conv_back_data_opr.get()), | |||||
config.first[0], config.first[1], config.first[2]); | |||||
} | } | ||||
WorkspaceBundle ConvolutionBackwardDataImpl::AlgoBFloat16::get_workspace_bundle( | WorkspaceBundle ConvolutionBackwardDataImpl::AlgoBFloat16::get_workspace_bundle( | ||||
void* ptr, const SizeArgs& args) const { | void* ptr, const SizeArgs& args) const { | ||||
TensorLayout ffilter, fdiff, fgrad; | |||||
auto conv_back_data_opr = | auto conv_back_data_opr = | ||||
args.handle->create_operator<ConvolutionBackwardData>(); | args.handle->create_operator<ConvolutionBackwardData>(); | ||||
SizeArgs fargs = float_args( | |||||
args, | |||||
static_cast<ConvolutionBackwardDataImpl*>(conv_back_data_opr.get()), | |||||
ffilter, fdiff, fgrad); | |||||
if (args.opr->execution_policy().algo.valid()) { | |||||
megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1); | |||||
conv_back_data_opr->execution_policy() = | |||||
args.opr->execution_policy().sub_policy[0]; | |||||
} | |||||
auto&& config = sub_opr_config( | |||||
{*args.filter_layout, *args.diff_layout, *args.grad_layout}, | |||||
args.opr); | |||||
conv_back_data_opr->param() = config.second; | |||||
SmallVector<size_t> sizes; | SmallVector<size_t> sizes; | ||||
auto get_workspace = [&sizes](const TensorLayout& src, | auto get_workspace = [&sizes](const TensorLayout& src, | ||||
const TensorLayout& dst) { | const TensorLayout& dst) { | ||||
@@ -76,10 +88,12 @@ WorkspaceBundle ConvolutionBackwardDataImpl::AlgoBFloat16::get_workspace_bundle( | |||||
sizes.push_back(dst.span().dist_byte()); | sizes.push_back(dst.span().dist_byte()); | ||||
} | } | ||||
}; | }; | ||||
get_workspace(*args.filter_layout, ffilter); | |||||
get_workspace(*args.diff_layout, fdiff); | |||||
get_workspace(*args.grad_layout, fgrad); | |||||
sizes.push_back(m_algorithm->get_workspace_in_bytes(fargs)); | |||||
get_workspace(*args.filter_layout, config.first[0]); | |||||
get_workspace(*args.diff_layout, config.first[1]); | |||||
get_workspace(*args.grad_layout, config.first[2]); | |||||
sizes.push_back(conv_back_data_opr->get_workspace_in_bytes( | |||||
config.first[0], config.first[1], config.first[2])); | |||||
return {ptr, std::move(sizes)}; | return {ptr, std::move(sizes)}; | ||||
} | } | ||||
@@ -103,9 +117,13 @@ void ConvolutionBackwardDataImpl::AlgoBFloat16::exec( | |||||
{ | { | ||||
auto conv_back_data_opr = | auto conv_back_data_opr = | ||||
args.handle->create_operator<ConvolutionBackwardData>(); | args.handle->create_operator<ConvolutionBackwardData>(); | ||||
if (args.opr->execution_policy().algo.valid()) { | |||||
megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1); | |||||
conv_back_data_opr->execution_policy() = | |||||
args.opr->execution_policy().sub_policy[0]; | |||||
} | |||||
conv_back_data_opr->param() = args.opr->param(); | conv_back_data_opr->param() = args.opr->param(); | ||||
conv_back_data_opr->param().compute_mode = Param::ComputeMode::DEFAULT; | conv_back_data_opr->param().compute_mode = Param::ComputeMode::DEFAULT; | ||||
conv_back_data_opr->execution_policy() = {m_algorithm->info()}; | |||||
conv_back_data_opr->exec(ffilter_tensor, fdiff_tensor, fgrad_tensor, | conv_back_data_opr->exec(ffilter_tensor, fdiff_tensor, fgrad_tensor, | ||||
cvter.workspace()); | cvter.workspace()); | ||||
} | } | ||||
@@ -42,7 +42,7 @@ ConvolutionBackwardFilterImpl::AlgoBFloat16::float_args( | |||||
change_dtype(fgrad); | change_dtype(fgrad); | ||||
opr->param() = args.opr->param(); | opr->param() = args.opr->param(); | ||||
opr->param().compute_mode = Param::ComputeMode::DEFAULT; | opr->param().compute_mode = Param::ComputeMode::DEFAULT; | ||||
opr->execution_policy() = {m_algorithm->info()}; | |||||
opr->execution_policy() = {m_algorithm->desc(), {}}; | |||||
return SizeArgs(opr, fsrc, fdiff, fgrad); | return SizeArgs(opr, fsrc, fdiff, fgrad); | ||||
} | } | ||||
@@ -107,7 +107,7 @@ void ConvolutionBackwardFilterImpl::AlgoBFloat16::exec( | |||||
conv_back_filter_opr->param() = args.opr->param(); | conv_back_filter_opr->param() = args.opr->param(); | ||||
conv_back_filter_opr->param().compute_mode = | conv_back_filter_opr->param().compute_mode = | ||||
Param::ComputeMode::DEFAULT; | Param::ComputeMode::DEFAULT; | ||||
conv_back_filter_opr->execution_policy() = {m_algorithm->info()}; | |||||
conv_back_filter_opr->execution_policy() = {m_algorithm->desc(), {}}; | |||||
conv_back_filter_opr->exec(fsrc_tensor, fdiff_tensor, fgrad_tensor, | conv_back_filter_opr->exec(fsrc_tensor, fdiff_tensor, fgrad_tensor, | ||||
cvter.workspace()); | cvter.workspace()); | ||||
} | } | ||||
@@ -69,7 +69,7 @@ ConvolutionForwardImpl::conv_bias_extra_data(const TensorLayout& src, | |||||
conv_param.dilate_h, | conv_param.dilate_h, | ||||
conv_param.dilate_w, | conv_param.dilate_w, | ||||
conv_param.compute_mode}; | conv_param.compute_mode}; | ||||
ret.convbias_opr->execution_policy() = {this->execution_policy().algo}; | |||||
ret.convbias_opr->execution_policy() = {this->execution_policy().algo, {}}; | |||||
return ret; | return ret; | ||||
} | } | ||||
@@ -102,7 +102,7 @@ ConvolutionForwardImpl::get_algorithm_from_desc( | |||||
conv_param.dilate_h, | conv_param.dilate_h, | ||||
conv_param.dilate_w, | conv_param.dilate_w, | ||||
conv_param.compute_mode}; | conv_param.compute_mode}; | ||||
convbias_opr->execution_policy() = {this->execution_policy().algo}; | |||||
convbias_opr->execution_policy() = {this->execution_policy().algo, {}}; | |||||
return static_cast<ConvBiasForwardImpl*>(convbias_opr.get()) | return static_cast<ConvBiasForwardImpl*>(convbias_opr.get()) | ||||
->get_algorithm_from_desc(desc); | ->get_algorithm_from_desc(desc); | ||||
@@ -160,7 +160,7 @@ void ConvolutionBackwardDataImpl::exec(_megdnn_tensor_in filter, | |||||
_megdnn_tensor_out grad, | _megdnn_tensor_out grad, | ||||
_megdnn_workspace workspace) { | _megdnn_workspace workspace) { | ||||
AlgoBase::ExecArgs args(this, filter, diff, grad, workspace); | AlgoBase::ExecArgs args(this, filter, diff, grad, workspace); | ||||
auto algo = get_algorithm(this, filter.layout, args.filter_meta, | |||||
auto algo = get_algorithm(this, filter.layout, | |||||
diff.layout, grad.layout); | diff.layout, grad.layout); | ||||
algo->check_workspace(args, workspace).exec(args); | algo->check_workspace(args, workspace).exec(args); | ||||
} | } | ||||
@@ -83,6 +83,17 @@ public: | |||||
workspace_limit_in_bytes, reproducible) | workspace_limit_in_bytes, reproducible) | ||||
->info(); | ->info(); | ||||
} | } | ||||
//! info of the algorithm that get_algorithm_heuristic() picks for the given
//! layouts; all arguments are forwarded unchanged
AlgorithmInfo get_algorithm_info_heuristic(const TensorLayout& filter,
                                           const TensorLayout& diff,
                                           const TensorLayout& grad,
                                           size_t workspace_limit_in_bytes,
                                           bool reproducible) {
    auto chosen = get_algorithm_heuristic(
            filter, diff, grad, workspace_limit_in_bytes, reproducible);
    return chosen->info();
}
size_t get_workspace_in_bytes(const TensorLayout& filter, | size_t get_workspace_in_bytes(const TensorLayout& filter, | ||||
const TensorLayout& diff, | const TensorLayout& diff, | ||||
const TensorLayout& grad) override; | const TensorLayout& grad) override; | ||||
@@ -82,7 +82,7 @@ void MatrixMulForwardImpl::AlgoBFloat16::exec(const ExecArgs& args) const { | |||||
args.opr->handle()->create_operator<MatrixMulForward>(); | args.opr->handle()->create_operator<MatrixMulForward>(); | ||||
matmul_opr->param() = args.opr->param(); | matmul_opr->param() = args.opr->param(); | ||||
matmul_opr->param().compute_mode = Param::ComputeMode::DEFAULT; | matmul_opr->param().compute_mode = Param::ComputeMode::DEFAULT; | ||||
matmul_opr->execution_policy() = {m_algorithm->info()}; | |||||
matmul_opr->execution_policy() = {m_algorithm->desc(), {}}; | |||||
matmul_opr->exec(a, b, c, ctypecvt.workspace()); | matmul_opr->exec(a, b, c, ctypecvt.workspace()); | ||||
} | } | ||||
ctypecvt.comp_to_dst_type(c, args.tensor_c); | ctypecvt.comp_to_dst_type(c, args.tensor_c); | ||||
@@ -1,6 +1,5 @@ | |||||
/** | /** | ||||
* \file dnn/src/fallback/conv_bias/opr_impl.cpp | |||||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
* | * | ||||
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | ||||
* | * | ||||
@@ -367,7 +366,7 @@ ConvBiasImpl::NCBKernSizeParam ConvBiasImpl::make_ncb_kern_size_param( | |||||
"should be equal"); | "should be equal"); | ||||
auto&& fm = check_layout_fwd(src, filter, dst); | auto&& fm = check_layout_fwd(src, filter, dst); | ||||
auto& conv_fm = reinterpret_cast<ConvolutionImpl::CanonizedFilterMeta&>(fm); | auto& conv_fm = reinterpret_cast<ConvolutionImpl::CanonizedFilterMeta&>(fm); | ||||
size_t nr_threads = static_cast<naive::HandleImpl*>(handle()) | size_t nr_threads = static_cast<naive::HandleImpl*>(handle()) | ||||
->megcore_dispatcher() | ->megcore_dispatcher() | ||||
->nr_threads(); | ->nr_threads(); | ||||
@@ -495,7 +494,7 @@ ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm_from_desc( | |||||
ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm( | ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm( | ||||
const NCBKernSizeParam& param, size_t workspace_size) { | const NCBKernSizeParam& param, size_t workspace_size) { | ||||
if (auto algo = get_algorithm_from_desc(execution_policy().algo.desc)) { | |||||
if (auto algo = get_algorithm_from_desc(execution_policy().algo)) { | |||||
return algo; | return algo; | ||||
} | } | ||||
if (!m_prev_selected_algo || | if (!m_prev_selected_algo || | ||||
@@ -387,7 +387,7 @@ ConvolutionImpl::Algorithm* ConvolutionImpl::get_algorithm_from_desc( | |||||
ConvolutionImpl::Algorithm* ConvolutionImpl::get_algorithm( | ConvolutionImpl::Algorithm* ConvolutionImpl::get_algorithm( | ||||
const NCBKernSizeParam& param, size_t workspace_size) { | const NCBKernSizeParam& param, size_t workspace_size) { | ||||
if (auto algo = get_algorithm_from_desc(execution_policy().algo.desc)) { | |||||
if (auto algo = get_algorithm_from_desc(execution_policy().algo)) { | |||||
return algo; | return algo; | ||||
} | } | ||||
if (!m_prev_selected_algo || | if (!m_prev_selected_algo || | ||||
@@ -783,7 +783,7 @@ ConvolutionBackwardDataImpl::get_algorithm_from_desc( | |||||
ConvolutionBackwardDataImpl::Algorithm* | ConvolutionBackwardDataImpl::Algorithm* | ||||
ConvolutionBackwardDataImpl::get_algorithm(const NCBKernSizeParam& param) { | ConvolutionBackwardDataImpl::get_algorithm(const NCBKernSizeParam& param) { | ||||
if (auto algo = get_algorithm_from_desc(execution_policy().algo.desc)) { | |||||
if (auto algo = get_algorithm_from_desc(execution_policy().algo)) { | |||||
return algo; | return algo; | ||||
} | } | ||||
if (!m_prev_selected_algo || | if (!m_prev_selected_algo || | ||||
@@ -134,7 +134,7 @@ MatrixMul::Algorithm* MatrixMulImpl::get_algorithm_heuristic( | |||||
size_t workspace_limit_in_bytes, bool reproducible) { | size_t workspace_limit_in_bytes, bool reproducible) { | ||||
auto kern_size_param = make_kern_size_param(A, B, C); | auto kern_size_param = make_kern_size_param(A, B, C); | ||||
if (auto algo = static_cast<AlgoBase*>( | if (auto algo = static_cast<AlgoBase*>( | ||||
get_algorithm_from_desc(execution_policy().algo.desc))) { | |||||
get_algorithm_from_desc(execution_policy().algo))) { | |||||
megdnn_assert(algo->get_workspace(kern_size_param) < | megdnn_assert(algo->get_workspace(kern_size_param) < | ||||
workspace_limit_in_bytes); | workspace_limit_in_bytes); | ||||
auto cur = megdnn::get_reproducible_algo<MatrixMulImpl>(algo, | auto cur = megdnn::get_reproducible_algo<MatrixMulImpl>(algo, | ||||
@@ -382,7 +382,7 @@ float algo_benchmark(Benchmarker<Opr, T>& benchmark, TensorLayoutArray layouts, | |||||
for (auto i : algos) { | for (auto i : algos) { | ||||
if (std::regex_match(i.name, | if (std::regex_match(i.name, | ||||
std::regex("(" + algo_base + ")(.*)"))) { | std::regex("(" + algo_base + ")(.*)"))) { | ||||
opr->execution_policy().algo = i; | |||||
opr->execution_policy().algo = i.desc; | |||||
auto used = benchmark.exec(layouts); | auto used = benchmark.exec(layouts); | ||||
min_used = std::min(min_used, used); | min_used = std::min(min_used, used); | ||||
printf("run algo: %s used: %f ms min_used: %f ms\n", i.name.c_str(), | printf("run algo: %s used: %f ms min_used: %f ms\n", i.name.c_str(), | ||||
@@ -242,6 +242,11 @@ public: | |||||
return *this; | return *this; | ||||
} | } | ||||
//! drop the currently installed before-exec callback by resetting it to | |||||
//! nullptr; returns *this so that calls can be chained | |||||
Checker& reset_before_exec_callback() { | |||||
m_before_exec_callback = nullptr; | |||||
return *this; | |||||
} | |||||
//! set a tensors constraints function, for the purpose of manipulating | //! set a tensors constraints function, for the purpose of manipulating | ||||
//! tensors when testing. | //! tensors when testing. | ||||
Checker& set_tensors_constraint( | Checker& set_tensors_constraint( | ||||
@@ -435,6 +440,17 @@ public: | |||||
Testcase operator=(const Testcase&) = delete; | Testcase operator=(const Testcase&) = delete; | ||||
}; | }; | ||||
//! name-based description of an execution policy: the name of an algorithm | |||||
//! plus, recursively, the names to use for its sub-policies. | |||||
//! AlgoChecker::construct_execution_policy_from_name matches \p name as a | |||||
//! regex prefix ("(" + name + ")(.*)") against algorithm names and indexes | |||||
//! \p sub_policy_names by the position of each sub-operator item. | |||||
struct ExecutionPolicyAlgoName { | |||||
std::string name; | |||||
std::vector<ExecutionPolicyAlgoName> sub_policy_names; | |||||
//! policy name without any sub-policy names | |||||
ExecutionPolicyAlgoName(const char* name) : name{name} {} | |||||
//! policy name with an explicit list of sub-policy names (copied) | |||||
ExecutionPolicyAlgoName( | |||||
const char* name, | |||||
const std::vector<ExecutionPolicyAlgoName>& sub_policy) | |||||
: name{name}, sub_policy_names{sub_policy} {} | |||||
}; | |||||
/*! | /*! | ||||
* \brief a callable to check that given algorithm is used for heuristic | * \brief a callable to check that given algorithm is used for heuristic | ||||
* \param require_algo if its value is true, then requires | * \param require_algo if its value is true, then requires | ||||
@@ -444,48 +460,76 @@ public: | |||||
*/ | */ | ||||
template <class Opr, typename OprAlgoProxy = OprAlgoProxy<Opr>> | template <class Opr, typename OprAlgoProxy = OprAlgoProxy<Opr>> | ||||
class AlgoChecker { | class AlgoChecker { | ||||
std::string m_name; | |||||
typename Opr::Algorithm* m_algo = nullptr; | |||||
bool* m_require_algo; | |||||
public: | public: | ||||
AlgoChecker(const char* name, bool* require_algo = nullptr) | |||||
: m_name{name}, m_require_algo{require_algo} {} | |||||
AlgoChecker(typename Opr::Algorithm* algo, bool* require_algo = nullptr) | |||||
: m_algo{algo}, m_require_algo{require_algo} {} | |||||
AlgoChecker(ExecutionPolicyAlgoName name, bool* require_algo = nullptr) | |||||
: m_policy_name{name}, m_require_algo{require_algo} {} | |||||
AlgoChecker(ExecutionPolicy policy, bool* require_algo = nullptr) | |||||
: m_policy{policy}, m_require_algo{require_algo} {} | |||||
static ExecutionPolicy construct_execution_policy_from_name( | |||||
const ExecutionPolicyAlgoName& policy_name, | |||||
const TensorLayoutArray& layouts, const std::string& param, | |||||
Handle* handle) { | |||||
ExecutionPolicy ret; | |||||
megdnn_assert(layouts.size() == OprTrait<Opr>::arity); | |||||
auto opr = handle->create_operator<Opr>(); | |||||
opr->param() = | |||||
Algorithm::deserialize_read_pod<typename Opr::Param>(param); | |||||
for (auto algo_info : | |||||
AlgoProxy<Opr, OprTrait<Opr>::arity>::get_all_algorithms_info( | |||||
opr.get(), layouts)) { | |||||
if (std::regex_match( | |||||
algo_info.name, | |||||
std::regex("(" + policy_name.name + ")(.*)"))) { | |||||
ret.algo = algo_info.desc; | |||||
} else { | |||||
continue; | |||||
} | |||||
Algorithm* algo = opr->get_algorithm_from_desc(algo_info.desc); | |||||
std::vector<Algorithm::SearchItem>&& sub_items = | |||||
algo->get_subopr_list(layouts, opr.get()); | |||||
FOREACH_OPR_TYPE_DISPATCH(sub_items, { | |||||
ExecutionPolicy policy = | |||||
AlgoChecker<_Opr>::construct_execution_policy_from_name( | |||||
policy_name.sub_policy_names[_item_idx], | |||||
_item.layouts, _item.param, handle); | |||||
ret.sub_policy.push_back(policy); | |||||
}); | |||||
return ret; | |||||
} | |||||
return ret; | |||||
} | |||||
void operator()(Opr* opr, const CheckerHelper::TensorValueArray& arr) { | void operator()(Opr* opr, const CheckerHelper::TensorValueArray& arr) { | ||||
TensorLayoutArray layouts; | TensorLayoutArray layouts; | ||||
for (auto&& val : arr) { | for (auto&& val : arr) { | ||||
layouts.push_back(val.layout); | layouts.push_back(val.layout); | ||||
} | } | ||||
if (!m_policy_name.name.empty()) { | |||||
std::string param_str; | |||||
Algorithm::serialize_write_pod(opr->param(), param_str); | |||||
m_policy = construct_execution_policy_from_name( | |||||
m_policy_name, layouts, param_str, opr->handle()); | |||||
ASSERT_TRUE(m_policy.algo.valid()) | |||||
<< "algorithm " << m_policy_name.name << " not found"; | |||||
} | |||||
if (m_require_algo && *m_require_algo) { | if (m_require_algo && *m_require_algo) { | ||||
auto algo = | auto algo = | ||||
OprAlgoProxy::get_algorithm_info_heuristic(opr, layouts); | OprAlgoProxy::get_algorithm_info_heuristic(opr, layouts); | ||||
if (m_name.empty()) { | |||||
ASSERT_EQ(m_algo->name(), algo.name.c_str()); | |||||
} else { | |||||
ASSERT_TRUE(std::regex_match( | |||||
algo.name.c_str(), std::regex("(" + m_name + ")(.*)"))); | |||||
} | |||||
ASSERT_STREQ(opr->get_algorithm_from_desc(m_policy.algo)->name(), | |||||
algo.name.c_str()); | |||||
} else { | } else { | ||||
if (m_name.empty()) { | |||||
opr->execution_policy().algo = m_algo->info(); | |||||
return; | |||||
} else { | |||||
for (auto i : | |||||
OprAlgoProxy::get_all_algorithms_info(opr, layouts)) { | |||||
if (std::regex_match(i.name, | |||||
std::regex("(" + m_name + ")(.*)"))) { | |||||
opr->execution_policy().algo = i; | |||||
return; | |||||
} | |||||
} | |||||
} | |||||
ASSERT_TRUE(false) << "algorithm " << m_name << " not found"; | |||||
opr->execution_policy() = m_policy; | |||||
} | } | ||||
} | } | ||||
private: | |||||
ExecutionPolicyAlgoName m_policy_name; | |||||
ExecutionPolicy m_policy; | |||||
bool* m_require_algo; | |||||
}; | }; | ||||
} // namespace test | } // namespace test | ||||
@@ -580,7 +580,7 @@ void convolution::test_conv_config_combinations(int k_size, | |||||
checker.set_rng(0, &rng).set_rng(1, &rng); | checker.set_rng(0, &rng).set_rng(1, &rng); | ||||
for (auto algo : opr->get_all_algorithms_info(ily, fly, oly)) { | for (auto algo : opr->get_all_algorithms_info(ily, fly, oly)) { | ||||
used_algos.insert(algo.desc); | used_algos.insert(algo.desc); | ||||
opr->execution_policy().algo = algo; | |||||
opr->execution_policy().algo = algo.desc; | |||||
checker | checker | ||||
.set_epsilon(eps_getter(dtype == 1, 0, algo.name.c_str())) | .set_epsilon(eps_getter(dtype == 1, 0, algo.name.c_str())) | ||||
.execs({ishp, fshp, {}}); | .execs({ishp, fshp, {}}); | ||||
@@ -599,7 +599,7 @@ void convolution::test_conv_config_combinations(int k_size, | |||||
opr->param() = param; | opr->param() = param; | ||||
for (auto algo: opr->get_all_algorithms_info(fly, oly, ily)) { | for (auto algo: opr->get_all_algorithms_info(fly, oly, ily)) { | ||||
used_algos_bwd_data.insert(algo.desc); | used_algos_bwd_data.insert(algo.desc); | ||||
opr->execution_policy().algo = algo; | |||||
opr->execution_policy().algo = algo.desc; | |||||
checker_bwd_data | checker_bwd_data | ||||
.set_epsilon(eps_getter(dtype == 1, 1, algo.name.c_str())) | .set_epsilon(eps_getter(dtype == 1, 1, algo.name.c_str())) | ||||
.execl({fly, oly, ily}); | .execl({fly, oly, ily}); | ||||
@@ -620,7 +620,7 @@ void convolution::test_conv_config_combinations(int k_size, | |||||
opr->param() = param; | opr->param() = param; | ||||
for (auto algo: opr->get_all_algorithms_info(ily, oly, fly)) { | for (auto algo: opr->get_all_algorithms_info(ily, oly, fly)) { | ||||
used_algos_bwd_flt.insert(algo.desc); | used_algos_bwd_flt.insert(algo.desc); | ||||
opr->execution_policy().algo = algo; | |||||
opr->execution_policy().algo = algo.desc; | |||||
checker_bwd_filter | checker_bwd_filter | ||||
.set_epsilon(eps_getter(dtype == 1, 2, algo.name.c_str())) | .set_epsilon(eps_getter(dtype == 1, 2, algo.name.c_str())) | ||||
.execl({ily, oly, fly}); | .execl({ily, oly, fly}); | ||||
@@ -0,0 +1,47 @@ | |||||
/** | |||||
* \file dnn/test/common/fast_run_cache.cpp | |||||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
* | |||||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
* | |||||
* Unless required by applicable law or agreed to in writing, | |||||
* software distributed under the License is distributed on an | |||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
* implied. | |||||
*/ | |||||
#include "test/common/fast_run_cache.h" | |||||
#include "src/common/utils.h" | |||||
using namespace megdnn; | |||||
using namespace test; | |||||
//! build the byte string used as cache key from a search item: the raw bytes | |||||
//! of item.opr_type, followed by the serialize() output of every layout in | |||||
//! order, followed by item.param. The hash member is not computed here; see | |||||
//! init_hash(). | |||||
FastRunCache::SearchItemStorage::SearchItemStorage( | |||||
const Algorithm::SearchItem& item) { | |||||
Algorithm::serialize_write_pod(item.opr_type, data_hold); | |||||
for (auto&& layout : item.layouts) { | |||||
data_hold += layout.serialize(); | |||||
} | |||||
data_hold += item.param; | |||||
} | |||||
//! look up the algorithm desc stored for \p key. | |||||
//! \return the cached desc, or a default-constructed desc ({}) when the key | |||||
//! is absent; callers in opr_proxy.h check the result with valid() | |||||
Algorithm::Info::Desc FastRunCache::get(const Algorithm::SearchItem& key) { | |||||
SearchItemStorage key_storage(key); | |||||
// the map's hasher only returns the stored hash, so compute it first | |||||
key_storage.init_hash(); | |||||
auto iter = m_cache.find(key_storage); | |||||
if (iter == m_cache.end()) { | |||||
return {}; | |||||
} | |||||
return iter->second; | |||||
} | |||||
//! store \p val as the algorithm desc for \p key. | |||||
//! The key must not be present yet: an existing entry trips megdnn_assert | |||||
//! instead of being overwritten. | |||||
void FastRunCache::put(const Algorithm::SearchItem& key, | |||||
const Algorithm::Info::Desc& val) { | |||||
SearchItemStorage key_storage(key); | |||||
// the map's hasher only returns the stored hash, so compute it first | |||||
key_storage.init_hash(); | |||||
megdnn_assert(m_cache.find(key_storage) == m_cache.end()); | |||||
m_cache[std::move(key_storage)] = val; | |||||
} | |||||
// vim: syntax=cpp.doxygen |
@@ -0,0 +1,58 @@ | |||||
/** | |||||
* \file dnn/test/common/fast_run_cache.h | |||||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
* | |||||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
* | |||||
* Unless required by applicable law or agreed to in writing, | |||||
* software distributed under the License is distributed on an | |||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
* implied. | |||||
*/ | |||||
#pragma once | |||||
#include "megdnn/oprs.h" | |||||
#include "src/common/hash_ct.h" | |||||
#include <unordered_map> | |||||
namespace megdnn { | |||||
namespace test { | |||||
//! in-memory cache mapping a search item (operator type, layouts, param) to | |||||
//! the algorithm desc selected for it | |||||
class FastRunCache { | |||||
//! map key: the serialized search item together with a precomputed hash | |||||
struct SearchItemStorage { | |||||
//! serialized bytes of the search item; the only field compared by == | |||||
std::string data_hold; | |||||
//! hash of data_hold; stays 0 until init_hash() is called | |||||
size_t hash = 0; | |||||
SearchItemStorage(const Algorithm::SearchItem& item); | |||||
//! compute and store the hash of data_hold; must be called before the | |||||
//! object is used as a map key because Hash only returns the stored value. | |||||
//! NOTE(review): 20201225 is presumably the hash seed -- confirm against | |||||
//! XXHash64CT::hash | |||||
SearchItemStorage& init_hash() { | |||||
hash = XXHash64CT::hash(data_hold.data(), data_hold.size(), | |||||
20201225); | |||||
return *this; | |||||
} | |||||
bool operator==(const SearchItemStorage& rhs) const { | |||||
return data_hold == rhs.data_hold; | |||||
} | |||||
//! hasher for std::unordered_map: returns the precomputed hash as is | |||||
struct Hash { | |||||
size_t operator()(const SearchItemStorage& s) const { | |||||
return s.hash; | |||||
} | |||||
}; | |||||
}; | |||||
std::unordered_map<SearchItemStorage, Algorithm::Info::Desc, | |||||
SearchItemStorage::Hash> | |||||
m_cache; | |||||
public: | |||||
//! return the desc cached for \p key, or a default-constructed desc | |||||
Algorithm::Info::Desc get(const Algorithm::SearchItem& key); | |||||
//! insert \p val for \p key; asserts that the key is not cached yet | |||||
void put(const Algorithm::SearchItem& key, | |||||
const Algorithm::Info::Desc& val); | |||||
}; | |||||
} // namespace test | |||||
} // namespace megdnn | |||||
// vim: syntax=cpp.doxygen |
@@ -13,6 +13,7 @@ | |||||
#include "test/common/deduce_layout_proxy.h" | #include "test/common/deduce_layout_proxy.h" | ||||
#include "test/common/exec_proxy.h" | #include "test/common/exec_proxy.h" | ||||
#include "test/common/fast_run_cache.h" | |||||
#include "test/common/inspect_type.h" | #include "test/common/inspect_type.h" | ||||
#include "test/common/opr_algo_proxy.h" | #include "test/common/opr_algo_proxy.h" | ||||
#include "test/common/opr_trait.h" | #include "test/common/opr_trait.h" | ||||
@@ -20,11 +21,104 @@ | |||||
#include "test/common/workspace_wrapper.h" | #include "test/common/workspace_wrapper.h" | ||||
#include <algorithm> | #include <algorithm> | ||||
#include <limits> | |||||
#include <memory> | #include <memory> | ||||
#include <unordered_map> | |||||
namespace megdnn { | namespace megdnn { | ||||
namespace test { | namespace test { | ||||
//! maps an Algorithm::OprType enumerator to the operator class (member | |||||
//! alias Opr); specialized through the cb() list below | |||||
template <Algorithm::OprType> | |||||
struct OprFromOprTypeTrait; | |||||
//! reverse mapping: operator class to its Algorithm::OprType enumerator | |||||
//! (static member opr_type); specialized through the cb() list below | |||||
template <typename Opr> | |||||
struct OprTypeFromOprTrait; | |||||
//! cb(_opr_type, _opr) emits both specializations for one enumerator and | |||||
//! operator class pair; the trailing ';' is supplied at each use | |||||
#define cb(_opr_type, _opr) \ | |||||
template <> \ | |||||
struct OprFromOprTypeTrait<Algorithm::OprType::_opr_type> { \ | |||||
using Opr = megdnn::_opr; \ | |||||
}; \ | |||||
template <> \ | |||||
struct OprTypeFromOprTrait<megdnn::_opr> { \ | |||||
constexpr static Algorithm::OprType opr_type = \ | |||||
Algorithm::OprType::_opr_type; \ | |||||
} | |||||
cb(MATRIX_MUL_FORWARD, MatrixMulForward); | |||||
cb(CONVOLUTION_FORWARD, ConvolutionForward); | |||||
cb(CONVOLUTION_BACKWARD_DATA, ConvolutionBackwardData); | |||||
cb(CONVOLUTION_BACKWARD_FILTER, ConvolutionBackwardFilter); | |||||
cb(CONVOLUTION3D_FORWARD, Convolution3DForward); | |||||
cb(CONVOLUTION3D_BACKWARD_DATA, Convolution3DBackwardData); | |||||
cb(CONVOLUTION3D_BACKWARD_FILTER, Convolution3DBackwardFilter); | |||||
cb(LOCAL_SHARE_FORWARD, LocalShareForward); | |||||
cb(LOCAL_SHARE_BACKWARD_DATA, LocalShareBackwardData); | |||||
cb(LOCAL_SHARE_BACKWARD_FILTER, LocalShareBackwardFilter); | |||||
cb(DEFORMABLE_CONV_FORWARD, DeformableConvForward); | |||||
cb(DEFORMABLE_CONV_BACKWARD_DATA, DeformableConvBackwardData); | |||||
cb(DEFORMABLE_CONV_BACKWARD_FILTER, DeformableConvBackwardFilter); | |||||
cb(BATCH_CONV_FORWARD, BatchConvBiasForward); | |||||
cb(CONVBIAS_FORWARD, ConvBiasForward); | |||||
#undef cb | |||||
// clang-format off | |||||
//! apply cb to every Algorithm::OprType enumerator handled by the profiling | |||||
//! proxy; the list holds the same 15 enumerators as the trait | |||||
//! specializations above and has to be kept in sync with them | |||||
#define FOREACH_OPR_TYPE(cb) \ | |||||
cb(MATRIX_MUL_FORWARD) \ | |||||
cb(CONVOLUTION_FORWARD) \ | |||||
cb(CONVOLUTION_BACKWARD_DATA) \ | |||||
cb(CONVOLUTION_BACKWARD_FILTER) \ | |||||
cb(CONVOLUTION3D_FORWARD) \ | |||||
cb(CONVOLUTION3D_BACKWARD_DATA) \ | |||||
cb(CONVOLUTION3D_BACKWARD_FILTER) \ | |||||
cb(LOCAL_SHARE_FORWARD) \ | |||||
cb(LOCAL_SHARE_BACKWARD_DATA) \ | |||||
cb(LOCAL_SHARE_BACKWARD_FILTER) \ | |||||
cb(DEFORMABLE_CONV_FORWARD) \ | |||||
cb(DEFORMABLE_CONV_BACKWARD_DATA) \ | |||||
cb(DEFORMABLE_CONV_BACKWARD_FILTER) \ | |||||
cb(BATCH_CONV_FORWARD) \ | |||||
cb(CONVBIAS_FORWARD) | |||||
//! same enumerator list, but forwards an extra stmt argument to cb; used by | |||||
//! FOREACH_OPR_TYPE_DISPATCH to generate one switch case per enumerator | |||||
#define FOREACH_OPR_TYPE_WITH_STMT(cb, stmt) \ | |||||
cb(MATRIX_MUL_FORWARD, stmt) \ | |||||
cb(CONVOLUTION_FORWARD, stmt) \ | |||||
cb(CONVOLUTION_BACKWARD_DATA, stmt) \ | |||||
cb(CONVOLUTION_BACKWARD_FILTER, stmt) \ | |||||
cb(CONVOLUTION3D_FORWARD, stmt) \ | |||||
cb(CONVOLUTION3D_BACKWARD_DATA, stmt) \ | |||||
cb(CONVOLUTION3D_BACKWARD_FILTER, stmt) \ | |||||
cb(LOCAL_SHARE_FORWARD, stmt) \ | |||||
cb(LOCAL_SHARE_BACKWARD_DATA, stmt) \ | |||||
cb(LOCAL_SHARE_BACKWARD_FILTER, stmt) \ | |||||
cb(DEFORMABLE_CONV_FORWARD, stmt) \ | |||||
cb(DEFORMABLE_CONV_BACKWARD_DATA, stmt) \ | |||||
cb(DEFORMABLE_CONV_BACKWARD_FILTER, stmt) \ | |||||
cb(BATCH_CONV_FORWARD, stmt) \ | |||||
cb(CONVBIAS_FORWARD, stmt) | |||||
// clang-format on | |||||
//! one switch case for enumerator _opr_type: introduces the alias _Opr for | |||||
//! the matching operator class, runs _stmt and then breaks | |||||
#define _OPR_TYPE_CASE(_opr_type, _stmt) \ | |||||
case Algorithm::OprType::_opr_type: { \ | |||||
using _Opr = typename OprFromOprTypeTrait< \ | |||||
Algorithm::OprType::_opr_type>::Opr; \ | |||||
_stmt; \ | |||||
break; \ | |||||
} | |||||
//! run _stmt once per element of _search_items, dispatching on the | |||||
//! element's opr_type. Inside _stmt the names _item (current element), | |||||
//! _item_idx (its index) and _Opr (operator class for its opr_type) are | |||||
//! available. An opr_type outside the list above ends in megdnn_throw. | |||||
#define FOREACH_OPR_TYPE_DISPATCH(_search_items, _stmt) \ | |||||
for (size_t _item_idx = 0; _item_idx < _search_items.size(); \ | |||||
_item_idx++) { \ | |||||
auto&& _item = _search_items[_item_idx]; \ | |||||
switch (_item.opr_type) { \ | |||||
FOREACH_OPR_TYPE_WITH_STMT(_OPR_TYPE_CASE, _stmt) \ | |||||
default: \ | |||||
megdnn_throw("unknown opr_type"); \ | |||||
} \ | |||||
} | |||||
template <typename Opr, size_t arity = OprTrait<Opr>::arity, | template <typename Opr, size_t arity = OprTrait<Opr>::arity, | ||||
bool has_workspace = OprTrait<Opr>::has_workspace, | bool has_workspace = OprTrait<Opr>::has_workspace, | ||||
bool can_deduce_layout = OprTrait<Opr>::can_deduce_layout> | bool can_deduce_layout = OprTrait<Opr>::can_deduce_layout> | ||||
@@ -130,10 +224,11 @@ struct OprProxy<SplitForward> : DeduceLayoutProxy<SplitForward, 0, false> { | |||||
}; | }; | ||||
//! OprProxy impl for ternary oprs with profiling support | //! OprProxy impl for ternary oprs with profiling support | ||||
template <class Opr, int arity> | |||||
template <class Opr> | |||||
struct OprProxyProfilingBase | struct OprProxyProfilingBase | ||||
: public DeduceLayoutProxy<Opr, arity, | |||||
: public DeduceLayoutProxy<Opr, OprTrait<Opr>::arity, | |||||
OprTrait<Opr>::can_deduce_layout> { | OprTrait<Opr>::can_deduce_layout> { | ||||
static constexpr int arity = OprTrait<Opr>::arity; | |||||
size_t warmup_times = 10, exec_times = 100; | size_t warmup_times = 10, exec_times = 100; | ||||
//! whether to enable profiling | //! whether to enable profiling | ||||
@@ -142,7 +237,7 @@ struct OprProxyProfilingBase | |||||
//! target algo setup by profiler; it can also be directly specified by the | //! target algo setup by profiler; it can also be directly specified by the | ||||
//! caller | //! caller | ||||
typename Opr::AlgorithmInfo target_algo_info; | |||||
ExecutionPolicy target_execution_policy; | |||||
OprProxyProfilingBase(bool profile = false) { m_profiling = profile; } | OprProxyProfilingBase(bool profile = false) { m_profiling = profile; } | ||||
@@ -168,6 +263,154 @@ struct OprProxyProfilingBase | |||||
return ret; | return ret; | ||||
} | } | ||||
/** | |||||
* \brief flatten the search space of Opr in postorder | |||||
* | |||||
* The sub-operator searches form a search tree: | |||||
* | |||||
*            A | |||||
*         /     \ | |||||
*       B1B2     C | |||||
*      /    \ | |||||
*   D1D2D3   E | |||||
* | |||||
* The tree is traversed in postorder, so every sub-operator item comes | |||||
* before the item of the operator that uses it: | |||||
* D1 -> D2 -> D3 -> E -> B1 -> B2 -> C -> A | |||||
* | |||||
* \param layouts layouts of the operator; its size must equal arity | |||||
* \param param serialized Opr::Param, restored with deserialize_read_pod | |||||
* \param handle handle used to create the temporary operator | |||||
* \return the items of all sub-operators of every algorithm, followed by | |||||
* the item of this operator itself as the last element. The sub-items | |||||
* of each algorithm are appended without de-duplication. | |||||
*/ | |||||
static std::vector<Algorithm::SearchItem> flatten_search_space( | |||||
const TensorLayoutArray layouts, const std::string& param, | |||||
Handle* handle) { | |||||
megdnn_assert(layouts.size() == arity); | |||||
auto opr = handle->create_operator<Opr>(); | |||||
opr->param() = | |||||
Algorithm::deserialize_read_pod<typename Opr::Param>(param); | |||||
std::vector<Algorithm::SearchItem> ret; | |||||
for (auto algo_info : AlgoProxy<Opr, arity>::get_all_algorithms_info( | |||||
opr.get(), layouts)) { | |||||
Algorithm* algo = opr->get_algorithm_from_desc(algo_info.desc); | |||||
std::vector<Algorithm::SearchItem>&& sub_items = | |||||
algo->get_subopr_list(layouts, opr.get()); | |||||
// recurse into each sub-operator with the proxy of its own type (_Opr) | |||||
FOREACH_OPR_TYPE_DISPATCH(sub_items, { | |||||
auto space = OprProxyProfilingBase<_Opr>::flatten_search_space( | |||||
_item.layouts, _item.param, handle); | |||||
ret.insert(ret.end(), space.begin(), space.end()); | |||||
}); | |||||
} | |||||
// postorder: the operator's own item goes after all of its sub-items | |||||
ret.push_back({OprTypeFromOprTrait<Opr>::opr_type, param, layouts}); | |||||
return ret; | |||||
} | |||||
//! fill \p policy (algo and, recursively, sub_policy) for Opr with the | |||||
//! given layouts and serialized param. | |||||
//! If policy.algo is already valid it is kept; otherwise it is read from | |||||
//! \p cache and must be found there (asserted). policy.sub_policy is always | |||||
//! cleared and rebuilt: one entry per sub-operator item reported by the | |||||
//! chosen algorithm, each filled by the same routine for the sub-operator | |||||
//! type. Those entries start out empty, so they are always taken from | |||||
//! the cache. | |||||
static void construct_execution_policy( | |||||
const TensorLayoutArray& layouts, const std::string& param, | |||||
Handle* handle, FastRunCache& cache, | |||||
ExecutionPolicy& policy) { | |||||
megdnn_assert(layouts.size() == arity); | |||||
auto opr = handle->create_operator<Opr>(); | |||||
opr->param() = | |||||
Algorithm::deserialize_read_pod<typename Opr::Param>(param); | |||||
if (!policy.algo.valid()) { | |||||
policy.algo = cache.get(Algorithm::SearchItem{ | |||||
OprTypeFromOprTrait<Opr>::opr_type, param, layouts}); | |||||
megdnn_assert(policy.algo.valid(), | |||||
"No cache found, maybe some error occured in " | |||||
"flatten_search_space or get_subopr_list"); | |||||
} | |||||
policy.sub_policy.clear(); | |||||
Algorithm* algo = opr->get_algorithm_from_desc(policy.algo); | |||||
std::vector<Algorithm::SearchItem>&& sub_items = | |||||
algo->get_subopr_list(layouts, opr.get()); | |||||
FOREACH_OPR_TYPE_DISPATCH(sub_items, { | |||||
policy.sub_policy.push_back({}); | |||||
OprProxyProfilingBase<_Opr>::construct_execution_policy( | |||||
_item.layouts, _item.param, handle, cache, | |||||
policy.sub_policy.back()); | |||||
}); | |||||
return; | |||||
} | |||||
/** | |||||
* \brief search and get the best execution_policy | |||||
* | |||||
* Times every algorithm reported for (layouts, param) and stores the desc | |||||
* of the one with the smallest measured time in \p cache. Returns early | |||||
* without profiling when the cache already holds a valid entry for the | |||||
* same search item. | |||||
* | |||||
* \param layouts layouts of the operator; its size must equal arity | |||||
* \param param serialized Opr::Param, restored with deserialize_read_pod | |||||
* \param workspace_wrapper buffer that is resized to hold all tensors plus | |||||
* the algorithm workspace | |||||
* \param handle handle used to create the temporary operator | |||||
* \param warmup_times number of untimed runs per algorithm | |||||
* \param exec_times number of timed runs per algorithm | |||||
* \param cache in: results of the sub-operators (construct_execution_policy | |||||
* asserts that they are present); out: result for this item | |||||
* | |||||
* NOTE(review): when get_all_algorithms_info returns no algorithm, | |||||
* best_algo stays default-constructed and is still passed to | |||||
* get_algorithm_from_desc and cache.put -- confirm that this cannot happen | |||||
*/ | |||||
static void search(const TensorLayoutArray& layouts, | |||||
const std::string& param, | |||||
WorkspaceWrapper& workspace_wrapper, Handle* handle, | |||||
size_t warmup_times, size_t exec_times, | |||||
FastRunCache& cache) { | |||||
megdnn_assert(layouts.size() == arity); | |||||
auto opr = handle->create_operator<Opr>(); | |||||
opr->param() = | |||||
Algorithm::deserialize_read_pod<typename Opr::Param>(param); | |||||
// byte size of every tensor; the workspace size is appended per algorithm | |||||
SmallVector<size_t> sizes_in_bytes; | |||||
for (const auto& layout : layouts) { | |||||
sizes_in_bytes.push_back(layout.span().dist_byte()); | |||||
} | |||||
float min_time = std::numeric_limits<float>::max(); | |||||
Algorithm::Info::Desc best_algo; | |||||
std::string log_info = "Profiling start: "; | |||||
for (auto&& layout : layouts) { | |||||
log_info += layout.to_string() + " "; | |||||
} | |||||
megdnn_log("%s", log_info.c_str()); | |||||
best_algo = cache.get(Algorithm::SearchItem{ | |||||
OprTypeFromOprTrait<Opr>::opr_type, param, layouts}); | |||||
// cache hit: this search item has already been profiled | |||||
if (best_algo.valid()) { | |||||
auto&& algo = opr->get_algorithm_from_desc(best_algo); | |||||
MEGDNN_MARK_USED_VAR(algo); | |||||
megdnn_log("Find best algo %s in cache", algo->name()); | |||||
return; | |||||
} | |||||
for (auto algo : AlgoProxy<Opr, arity>::get_all_algorithms_info( | |||||
opr.get(), layouts)) { | |||||
//! construct execution_policy | |||||
opr->execution_policy().algo = algo.desc; | |||||
construct_execution_policy(layouts, param, handle, cache, | |||||
opr->execution_policy()); | |||||
auto workspace_size = AlgoProxy<Opr, arity>::get_workspace_in_bytes( | |||||
opr.get(), layouts); | |||||
// bundle entries 0..arity-1 are the tensors, entry arity is the workspace | |||||
sizes_in_bytes.push_back(workspace_size); | |||||
WorkspaceBundle wb(nullptr, sizes_in_bytes); | |||||
workspace_wrapper.update(wb.total_size_in_bytes()); | |||||
wb.set(workspace_wrapper.workspace().raw_ptr); | |||||
TensorNDArray tensors; | |||||
for (size_t i = 0; i < arity; i++) { | |||||
tensors.push_back({wb.get(i), layouts[i]}); | |||||
} | |||||
for (size_t times = 0; times < warmup_times; ++times) { | |||||
AlgoProxy<Opr, arity>::exec(opr.get(), tensors, | |||||
wb.get_workspace(arity)); | |||||
} | |||||
// synchronize so the warmup runs are not included in the timing | |||||
megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||||
Timer timer; | |||||
timer.start(); | |||||
for (size_t times = 0; times < exec_times; ++times) { | |||||
AlgoProxy<Opr, arity>::exec(opr.get(), tensors, | |||||
wb.get_workspace(arity)); | |||||
} | |||||
megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||||
timer.stop(); | |||||
megdnn_log("%.3fms %s", timer.get_time_in_us() / 1e3, | |||||
algo.name.c_str()); | |||||
if (min_time > timer.get_time_in_us()) { | |||||
min_time = timer.get_time_in_us(); | |||||
best_algo = algo.desc; | |||||
} | |||||
// remove this algorithm's workspace size before trying the next one | |||||
sizes_in_bytes.pop_back(); | |||||
} | |||||
auto&& algo = opr->get_algorithm_from_desc(best_algo); | |||||
MEGDNN_MARK_USED_VAR(algo); | |||||
megdnn_log("Profiling end, got best algo: %s", algo->name()); | |||||
cache.put(Algorithm::SearchItem{OprTypeFromOprTrait<Opr>::opr_type, | |||||
param, layouts}, | |||||
best_algo); | |||||
} | |||||
void exec(Opr* opr, const TensorNDArray& tensors) { | void exec(Opr* opr, const TensorNDArray& tensors) { | ||||
megdnn_assert(tensors.size() == arity); | megdnn_assert(tensors.size() == arity); | ||||
if (!W.valid()) { | if (!W.valid()) { | ||||
@@ -177,39 +420,26 @@ struct OprProxyProfilingBase | |||||
for (auto&& tensor : tensors) { | for (auto&& tensor : tensors) { | ||||
layouts.push_back(tensor.layout); | layouts.push_back(tensor.layout); | ||||
} | } | ||||
if (m_profiling && !target_algo_info.valid()) { | |||||
size_t min_time = std::numeric_limits<size_t>::max(); | |||||
for (auto algo : | |||||
AlgoProxy<Opr, arity>::get_all_algorithms_info(opr, layouts)) { | |||||
opr->execution_policy().algo = algo; | |||||
auto workspace_size = | |||||
AlgoProxy<Opr, arity>::get_workspace_in_bytes(opr, | |||||
layouts); | |||||
W.update(workspace_size); | |||||
for (size_t times = 0; times < warmup_times; ++times) | |||||
AlgoProxy<Opr, arity>::exec(opr, tensors, W.workspace()); | |||||
megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||||
Timer timer; | |||||
timer.start(); | |||||
for (size_t times = 0; times < exec_times; ++times) { | |||||
AlgoProxy<Opr, arity>::exec(opr, tensors, W.workspace()); | |||||
} | |||||
megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||||
timer.stop(); | |||||
printf("%.3fms %s\n", timer.get_time_in_us() / 1e3, | |||||
algo.name.c_str()); | |||||
if (min_time > timer.get_time_in_us()) { | |||||
min_time = timer.get_time_in_us(); | |||||
target_algo_info = algo; | |||||
} | |||||
} | |||||
opr->execution_policy().algo = target_algo_info; | |||||
if (m_profiling && !target_execution_policy.algo.valid()) { | |||||
FastRunCache cache; | |||||
std::string param_str; | |||||
Algorithm::serialize_write_pod(opr->param(), param_str); | |||||
auto&& search_items = | |||||
flatten_search_space(layouts, param_str, opr->handle()); | |||||
FOREACH_OPR_TYPE_DISPATCH(search_items, { | |||||
OprProxyProfilingBase<_Opr>::search(_item.layouts, param_str, W, | |||||
opr->handle(), warmup_times, | |||||
exec_times, cache); | |||||
}); | |||||
construct_execution_policy(layouts, param_str, opr->handle(), cache, | |||||
opr->execution_policy()); | |||||
target_execution_policy = opr->execution_policy(); | |||||
auto workspace_size = | auto workspace_size = | ||||
AlgoProxy<Opr, arity>::get_workspace_in_bytes(opr, layouts); | AlgoProxy<Opr, arity>::get_workspace_in_bytes(opr, layouts); | ||||
W.update(workspace_size); | W.update(workspace_size); | ||||
} | } | ||||
if (!target_algo_info.valid()) { | |||||
if (!target_execution_policy.algo.valid()) { | |||||
auto workspace_size = | auto workspace_size = | ||||
AlgoProxy<Opr, arity>::get_workspace_in_bytes(opr, layouts); | AlgoProxy<Opr, arity>::get_workspace_in_bytes(opr, layouts); | ||||
W.update(workspace_size); | W.update(workspace_size); | ||||
@@ -218,30 +448,32 @@ struct OprProxyProfilingBase | |||||
} | } | ||||
}; | }; | ||||
#define DEF_PROF(c, arity) \ | |||||
template <> \ | |||||
struct OprProxy<c> : public OprProxyProfilingBase<c, arity> { \ | |||||
using OprProxyProfilingBase<c, arity>::OprProxyProfilingBase; \ | |||||
#define DEF_PROF(c) \ | |||||
template <> \ | |||||
struct OprProxy<c> : public OprProxyProfilingBase<c> { \ | |||||
using OprProxyProfilingBase<c>::OprProxyProfilingBase; \ | |||||
} | } | ||||
DEF_PROF(ConvolutionForward, 3); | |||||
DEF_PROF(ConvolutionBackwardData, 3); | |||||
DEF_PROF(ConvolutionBackwardFilter, 3); | |||||
DEF_PROF(LocalShareForward, 3); | |||||
DEF_PROF(LocalShareBackwardData, 3); | |||||
DEF_PROF(LocalShareBackwardFilter, 3); | |||||
DEF_PROF(MatrixMulForward); | |||||
DEF_PROF(ConvolutionForward); | |||||
DEF_PROF(ConvolutionBackwardData); | |||||
DEF_PROF(ConvolutionBackwardFilter); | |||||
DEF_PROF(LocalShareForward); | |||||
DEF_PROF(LocalShareBackwardData); | |||||
DEF_PROF(LocalShareBackwardFilter); | |||||
DEF_PROF(DeformableConvForward, 5); | |||||
DEF_PROF(DeformableConvBackwardFilter, 5); | |||||
DEF_PROF(BatchConvBiasForward, 5); | |||||
DEF_PROF(ConvBiasForward, 5); | |||||
DEF_PROF(DeformableConvForward); | |||||
DEF_PROF(DeformableConvBackwardFilter); | |||||
DEF_PROF(BatchConvBiasForward); | |||||
DEF_PROF(ConvBiasForward); | |||||
DEF_PROF(DeformableConvBackwardData, 8); | |||||
DEF_PROF(DeformableConvBackwardData); | |||||
#undef DEF_PROF | #undef DEF_PROF | ||||
template <class Opr, int arity> | |||||
struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr, arity> { | |||||
using Base = OprProxyProfilingBase<Opr, arity>; | |||||
template <class Opr> | |||||
struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr> { | |||||
using Base = OprProxyProfilingBase<Opr>; | |||||
static constexpr int arity = OprTrait<Opr>::arity; | |||||
void exec(Opr* opr, const TensorNDArray& tensors) { | void exec(Opr* opr, const TensorNDArray& tensors) { | ||||
megdnn_assert(tensors.size() == arity); | megdnn_assert(tensors.size() == arity); | ||||
if (!Base::W.valid()) { | if (!Base::W.valid()) { | ||||
@@ -252,11 +484,11 @@ struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr, arity> { | |||||
for (auto&& tensor : tensors) { | for (auto&& tensor : tensors) { | ||||
layouts.push_back(tensor.layout); | layouts.push_back(tensor.layout); | ||||
} | } | ||||
if (Base::m_profiling && !Base::target_algo_info.desc.valid()) { | |||||
if (Base::m_profiling && !Base::target_execution_policy.algo.valid()) { | |||||
size_t min_time = std::numeric_limits<size_t>::max(); | size_t min_time = std::numeric_limits<size_t>::max(); | ||||
for (auto algo : | for (auto algo : | ||||
AlgoProxy<Opr, arity>::get_all_algorithms_info(opr, layouts)) { | AlgoProxy<Opr, arity>::get_all_algorithms_info(opr, layouts)) { | ||||
opr->execution_policy().algo = algo; | |||||
opr->execution_policy().algo = algo.desc; | |||||
auto preprocess_tensors = | auto preprocess_tensors = | ||||
weight_prerocess(opr, tensors, algo.desc); | weight_prerocess(opr, tensors, algo.desc); | ||||
@@ -288,12 +520,12 @@ struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr, arity> { | |||||
algo.name.c_str()); | algo.name.c_str()); | ||||
if (min_time > timer.get_time_in_us()) { | if (min_time > timer.get_time_in_us()) { | ||||
min_time = timer.get_time_in_us(); | min_time = timer.get_time_in_us(); | ||||
Base::target_algo_info = algo; | |||||
Base::target_execution_policy.algo = algo.desc; | |||||
} | } | ||||
} | } | ||||
opr->execution_policy().algo = Base::target_algo_info; | |||||
auto preprocess_tensors = | |||||
weight_prerocess(opr, tensors, Base::target_algo_info.desc); | |||||
opr->execution_policy() = Base::target_execution_policy; | |||||
auto preprocess_tensors = weight_prerocess( | |||||
opr, tensors, Base::target_execution_policy.algo); | |||||
megcoreSynchronize(opr->handle()->megcore_computing_handle()); | megcoreSynchronize(opr->handle()->megcore_computing_handle()); | ||||
typename Opr::PreprocessedFilter preprocessed_filter{ | typename Opr::PreprocessedFilter preprocessed_filter{ | ||||
nullptr, *preprocess_tensors}; | nullptr, *preprocess_tensors}; | ||||
@@ -301,12 +533,12 @@ struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr, arity> { | |||||
opr, layouts, &preprocessed_filter); | opr, layouts, &preprocessed_filter); | ||||
Base::W.update(workspace_size); | Base::W.update(workspace_size); | ||||
} | } | ||||
auto preprocess_tensors = | |||||
weight_prerocess(opr, tensors, Base::target_algo_info.desc); | |||||
auto preprocess_tensors = weight_prerocess( | |||||
opr, tensors, Base::target_execution_policy.algo); | |||||
megcoreSynchronize(opr->handle()->megcore_computing_handle()); | megcoreSynchronize(opr->handle()->megcore_computing_handle()); | ||||
typename Opr::PreprocessedFilter preprocessed_filter{ | typename Opr::PreprocessedFilter preprocessed_filter{ | ||||
nullptr, *preprocess_tensors}; | nullptr, *preprocess_tensors}; | ||||
if (!Base::target_algo_info.valid()) { | |||||
if (!Base::target_execution_policy.algo.valid()) { | |||||
auto workspace_size = AlgoProxy<Opr, arity>::get_workspace_in_bytes( | auto workspace_size = AlgoProxy<Opr, arity>::get_workspace_in_bytes( | ||||
opr, layouts, &preprocessed_filter); | opr, layouts, &preprocessed_filter); | ||||
Base::W.update(workspace_size); | Base::W.update(workspace_size); | ||||
@@ -342,16 +574,15 @@ struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr, arity> { | |||||
} | } | ||||
}; | }; | ||||
#define DEF_PROF(c, arity) \ | |||||
template <> \ | |||||
struct OprWeightPreprocessProxy<c> \ | |||||
: public OprWeightPreprocessProxyImpl<c, arity> { \ | |||||
using OprWeightPreprocessProxyImpl< \ | |||||
c, arity>::OprWeightPreprocessProxyImpl; \ | |||||
#define DEF_PROF(c) \ | |||||
template <> \ | |||||
struct OprWeightPreprocessProxy<c> \ | |||||
: public OprWeightPreprocessProxyImpl<c> { \ | |||||
using OprWeightPreprocessProxyImpl<c>::OprWeightPreprocessProxyImpl; \ | |||||
} | } | ||||
DEF_PROF(ConvolutionForward, 3); | |||||
DEF_PROF(ConvBias, 5); | |||||
DEF_PROF(ConvolutionForward); | |||||
DEF_PROF(ConvBias); | |||||
#undef DEF_PROF | #undef DEF_PROF | ||||
} // namespace test | } // namespace test | ||||
@@ -279,7 +279,7 @@ void benchmark_target_algo(Handle* handle, const std::vector<BenchArgs>& args, | |||||
benchmarker.set_param(bparam); | benchmarker.set_param(bparam); | ||||
if (!algo) { | if (!algo) { | ||||
benchmarker.proxy()->target_algo_info.reset(); | |||||
benchmarker.proxy()->target_execution_policy.algo.reset(); | |||||
} | } | ||||
auto time_in_ms = | auto time_in_ms = | ||||
benchmarker.execs( | benchmarker.execs( | ||||
@@ -514,7 +514,7 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_FWD) { | |||||
auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, | auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, | ||||
size_t FW) { | size_t FW) { | ||||
checker.proxy()->target_algo_info.reset(); | |||||
checker.proxy()->target_execution_policy.algo.reset(); | |||||
checker.execs({{N, C, IH, IW}, {C, 1, 1, FH, FW}, {}}); | checker.execs({{N, C, IH, IW}, {C, 1, 1, FH, FW}, {}}); | ||||
}; | }; | ||||
@@ -538,7 +538,7 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_BWD_DATA) { | |||||
auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, | auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, | ||||
size_t FW) { | size_t FW) { | ||||
checker.proxy()->target_algo_info.reset(); | |||||
checker.proxy()->target_execution_policy.algo.reset(); | |||||
checker.execs({{C, 1, 1, FH, FW}, | checker.execs({{C, 1, 1, FH, FW}, | ||||
{N, C, IH - FH + 1, IW - FW + 1}, | {N, C, IH - FH + 1, IW - FW + 1}, | ||||
{N, C, IH, IW}}); | {N, C, IH, IW}}); | ||||
@@ -564,7 +564,7 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_BWD_FILTER) { | |||||
auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, | auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, | ||||
size_t FW) { | size_t FW) { | ||||
checker.proxy()->target_algo_info.reset(); | |||||
checker.proxy()->target_execution_policy.algo.reset(); | |||||
checker.execs({{N, C, IH, IW}, | checker.execs({{N, C, IH, IW}, | ||||
{N, C, IH - FH + 1, IW - FW + 1}, | {N, C, IH - FH + 1, IW - FW + 1}, | ||||
{C, 1, 1, FH, FW}}); | {C, 1, 1, FH, FW}}); | ||||
@@ -614,7 +614,7 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_ALL_ALGO_FORWARD) { | |||||
.set_dtype(2, dtype::Float32()) | .set_dtype(2, dtype::Float32()) | ||||
.set_rng(0, &rng) | .set_rng(0, &rng) | ||||
.set_rng(1, &rng); | .set_rng(1, &rng); | ||||
bencher.proxy()->target_algo_info.reset(); | |||||
bencher.proxy()->target_execution_policy.algo.reset(); | |||||
auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS; | auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS; | ||||
bencher.set_param(param) | bencher.set_param(param) | ||||
@@ -623,10 +623,10 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_ALL_ALGO_FORWARD) { | |||||
.set_dtype(2, dtype::Float16()) | .set_dtype(2, dtype::Float16()) | ||||
.set_rng(0, &rng) | .set_rng(0, &rng) | ||||
.set_rng(1, &rng); | .set_rng(1, &rng); | ||||
bencher.proxy()->target_algo_info.reset(); | |||||
bencher.proxy()->target_execution_policy.algo.reset(); | |||||
auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS; | auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS; | ||||
bencher.proxy()->target_algo_info.reset(); | |||||
bencher.proxy()->target_execution_policy.algo.reset(); | |||||
param.compute_mode = param::Convolution::ComputeMode::FLOAT32; | param.compute_mode = param::Convolution::ComputeMode::FLOAT32; | ||||
bencher.set_param(param); | bencher.set_param(param); | ||||
auto time_in_ms_pseudo_fp16 = bencher.execs({src, filter, {}}) / RUNS; | auto time_in_ms_pseudo_fp16 = bencher.execs({src, filter, {}}) / RUNS; | ||||
@@ -168,7 +168,7 @@ void benchmark_target_algo( | |||||
benchmarker.set_param(param); | benchmarker.set_param(param); | ||||
if (!algo) { | if (!algo) { | ||||
benchmarker.proxy()->target_algo_info.reset(); | |||||
benchmarker.proxy()->target_execution_policy.algo.reset(); | |||||
} | } | ||||
TensorShape src{arg.n, arg.ci, arg.hi, arg.wi}, | TensorShape src{arg.n, arg.ci, arg.hi, arg.wi}, | ||||
filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1}, | filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1}, | ||||
@@ -327,7 +327,7 @@ void benchmark_target_algo_with_cudnn_tsc( | |||||
benchmarker.set_param(param); | benchmarker.set_param(param); | ||||
if (!algo) { | if (!algo) { | ||||
benchmarker.proxy()->target_algo_info.reset(); | |||||
benchmarker.proxy()->target_execution_policy.algo.reset(); | |||||
} | } | ||||
TensorShape src{arg.n, arg.ci, arg.hi, arg.wi}, | TensorShape src{arg.n, arg.ci, arg.hi, arg.wi}, | ||||
filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1}, | filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1}, | ||||
@@ -8,6 +8,7 @@ | |||||
* software distributed under the License is distributed on an | * software distributed under the License is distributed on an | ||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
*/ | */ | ||||
#include "megdnn/dtype.h" | |||||
#include "megdnn/oprs.h" | #include "megdnn/oprs.h" | ||||
#include "megdnn/opr_param_defs.h" | #include "megdnn/opr_param_defs.h" | ||||
#include "test/cuda/fixture.h" | #include "test/cuda/fixture.h" | ||||
@@ -223,14 +224,19 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) | |||||
.set_epsilon(1e-1) | .set_epsilon(1e-1) | ||||
.set_param(arg.param) | .set_param(arg.param) | ||||
.exec(TensorLayoutArray{filter, dst, src}); | .exec(TensorLayoutArray{filter, dst, src}); | ||||
src.dtype = dst.dtype = filter.dtype = dtype::BFloat16(); | |||||
checker. | |||||
set_rng(0, &rng). | |||||
set_rng(1, &rng). | |||||
set_epsilon(1e-1). | |||||
set_param(arg.param). | |||||
exec(TensorLayoutArray{filter, dst, src}); | |||||
} | } | ||||
checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>( | |||||
ExecutionPolicyAlgoName{"CONVOLUTION_BACKWARD_DATD_BFLOAT16", | |||||
{{"MATMUL", {}}}})); | |||||
src.dtype = dst.dtype = filter.dtype = dtype::BFloat16(); | |||||
arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32; | |||||
checker.set_rng(0, &rng) | |||||
.set_rng(1, &rng) | |||||
.set_epsilon(1e-1) | |||||
.set_param(arg.param) | |||||
.exec(TensorLayoutArray{filter, dst, src}); | |||||
checker.reset_before_exec_callback(); | |||||
checker.opr()->execution_policy() = {}; | |||||
} | } | ||||
} | } | ||||
@@ -382,32 +388,35 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_1) { | |||||
#if MEGDNN_WITH_BENCHMARK | #if MEGDNN_WITH_BENCHMARK | ||||
TEST_F(CUDA, CONV_FWD_BENCHMARK) { | TEST_F(CUDA, CONV_FWD_BENCHMARK) { | ||||
auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t SH=1, | |||||
size_t SW=1, size_t FH=1, size_t FW=1, size_t PH=0, size_t PW=0, bool fp16io_c32=false) { | |||||
auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, | |||||
size_t SH = 1, size_t SW = 1, size_t FH = 1, size_t FW = 1, | |||||
size_t PH = 0, size_t PW = 0, bool fp16io_c32 = false) { | |||||
auto benchmarker = Benchmarker<ConvolutionForward>(handle_cuda()); | auto benchmarker = Benchmarker<ConvolutionForward>(handle_cuda()); | ||||
benchmarker.set_dtype(0, dtype::Float16()) | benchmarker.set_dtype(0, dtype::Float16()) | ||||
.set_dtype(1, dtype::Float16()) | |||||
.set_dtype(2, dtype::Float16()); | |||||
.set_dtype(1, dtype::Float16()) | |||||
.set_dtype(2, dtype::Float16()); | |||||
ConvolutionForward::Param param; | ConvolutionForward::Param param; | ||||
param.stride_h = SH; | param.stride_h = SH; | ||||
param.stride_w = SW; | param.stride_w = SW; | ||||
param.pad_h = PH; | param.pad_h = PH; | ||||
param.pad_w = PW; | param.pad_w = PW; | ||||
if (fp16io_c32) { | if (fp16io_c32) { | ||||
param.compute_mode = ConvolutionForward::Param::ComputeMode::FLOAT32; | |||||
param.compute_mode = | |||||
ConvolutionForward::Param::ComputeMode::FLOAT32; | |||||
} | } | ||||
benchmarker.set_param(param); | benchmarker.set_param(param); | ||||
std::unique_ptr<OprProxy<ConvolutionForward>> proxy{new OprProxy<ConvolutionForward>{true}}; | |||||
std::unique_ptr<OprProxy<ConvolutionForward>> proxy{ | |||||
new OprProxy<ConvolutionForward>{true}}; | |||||
benchmarker.set_proxy(proxy); | benchmarker.set_proxy(proxy); | ||||
size_t OH = (IH - FH + 2 * PH) / SH + 1; | size_t OH = (IH - FH + 2 * PH) / SH + 1; | ||||
size_t OW = (IW - FW + 2 * PW) / SW + 1; | size_t OW = (IW - FW + 2 * PW) / SW + 1; | ||||
auto time = benchmarker.execs({ | |||||
{N, IC, IH, IW}, {OC, IC, FH, FW}, {N, OC, OH, OW}}); | |||||
auto time = benchmarker.execs( | |||||
{{N, IC, IH, IW}, {OC, IC, FH, FW}, {N, OC, OH, OW}}); | |||||
time /= 1000.0 * 10.0; | time /= 1000.0 * 10.0; | ||||
auto flo = (double) N * OC * IC * OH * OW * FH * FW * 2; | |||||
auto flo = (double)N * OC * IC * OH * OW * FH * FW * 2; | |||||
auto flops = flo / time / 1e12; | auto flops = flo / time / 1e12; | ||||
printf("comp_type %s: ", fp16io_c32 ? "32" : "16"); | printf("comp_type %s: ", fp16io_c32 ? "32" : "16"); | ||||
printf("%.3fG FLO, flops %.3fTFLOPS\n", flo/1e9, flops); | |||||
printf("%.3fG FLO, flops %.3fTFLOPS\n", flo / 1e9, flops); | |||||
}; | }; | ||||
run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, false); | run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, false); | ||||
run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, true); | run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, true); | ||||
@@ -415,7 +424,8 @@ TEST_F(CUDA, CONV_FWD_BENCHMARK) { | |||||
TEST_F(CUDA, CONVOLUTION_FWD_BENCHMARK) { | TEST_F(CUDA, CONVOLUTION_FWD_BENCHMARK) { | ||||
CUBenchmarker<ConvolutionForward> bench{handle_cuda()}; | CUBenchmarker<ConvolutionForward> bench{handle_cuda()}; | ||||
std::unique_ptr<OprProxy<ConvolutionForward>> proxy{new OprProxy<ConvolutionForward>{true}}; | |||||
std::unique_ptr<OprProxy<ConvolutionForward>> proxy{ | |||||
new OprProxy<ConvolutionForward>{true}}; | |||||
size_t RUNS = 10; | size_t RUNS = 10; | ||||
bench.set_proxy(proxy).set_times(RUNS); | bench.set_proxy(proxy).set_times(RUNS); | ||||
@@ -429,7 +439,7 @@ TEST_F(CUDA, CONVOLUTION_FWD_BENCHMARK) { | |||||
param.pad_h = param.pad_w = PH; | param.pad_h = param.pad_w = PH; | ||||
param.compute_mode = param::Convolution::ComputeMode::DEFAULT; | param.compute_mode = param::Convolution::ComputeMode::DEFAULT; | ||||
bench.set_param(param); | bench.set_param(param); | ||||
bench.proxy()->target_algo_info.reset(); | |||||
bench.proxy()->target_execution_policy.algo.reset(); | |||||
TensorLayout src{{N, IC, IH, IW}, dtype::Float32()}, | TensorLayout src{{N, IC, IH, IW}, dtype::Float32()}, | ||||
filter{{OC, IC, FH, FH}, dtype::Float32()}; | filter{{OC, IC, FH, FH}, dtype::Float32()}; | ||||
TensorLayout dst; | TensorLayout dst; | ||||
@@ -440,13 +450,13 @@ TEST_F(CUDA, CONVOLUTION_FWD_BENCHMARK) { | |||||
} | } | ||||
auto time_ms_fp32 = bench.execl({src, filter, dst}) / RUNS; | auto time_ms_fp32 = bench.execl({src, filter, dst}) / RUNS; | ||||
src.dtype = filter.dtype = dst.dtype = dtype::Float16(); | src.dtype = filter.dtype = dst.dtype = dtype::Float16(); | ||||
bench.proxy()->target_algo_info.reset(); | |||||
bench.proxy()->target_execution_policy.algo.reset(); | |||||
bench.set_dtype(0, dtype::Float16()) | bench.set_dtype(0, dtype::Float16()) | ||||
.set_dtype(1, dtype::Float16()) | .set_dtype(1, dtype::Float16()) | ||||
.set_dtype(2, dtype::Float16()); | .set_dtype(2, dtype::Float16()); | ||||
auto time_ms_true_fp16 = bench.execl({src, filter, dst}) / RUNS; | auto time_ms_true_fp16 = bench.execl({src, filter, dst}) / RUNS; | ||||
param.compute_mode = param::Convolution::ComputeMode::FLOAT32; | param.compute_mode = param::Convolution::ComputeMode::FLOAT32; | ||||
bench.proxy()->target_algo_info.reset(); | |||||
bench.proxy()->target_execution_policy.algo.reset(); | |||||
bench.set_param(param); | bench.set_param(param); | ||||
auto time_ms_pseudo_fp16 = bench.execl({src, filter, dst}) / RUNS; | auto time_ms_pseudo_fp16 = bench.execl({src, filter, dst}) / RUNS; | ||||
float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH; | float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH; | ||||
@@ -500,7 +510,7 @@ TEST_F(CUDA, CONVOLUTION_BWD_DATA_BENCHMARK) { | |||||
param.pad_h = param.pad_w = PH; | param.pad_h = param.pad_w = PH; | ||||
param.compute_mode = param::Convolution::ComputeMode::DEFAULT; | param.compute_mode = param::Convolution::ComputeMode::DEFAULT; | ||||
bench.set_param(param); | bench.set_param(param); | ||||
bench.proxy()->target_algo_info.reset(); | |||||
bench.proxy()->target_execution_policy.algo.reset(); | |||||
TensorLayout src{{N, IC, IH, IW}, dtype::Float32()}, | TensorLayout src{{N, IC, IH, IW}, dtype::Float32()}, | ||||
filter{{OC, IC, FH, FH}, dtype::Float32()}; | filter{{OC, IC, FH, FH}, dtype::Float32()}; | ||||
TensorLayout dst; | TensorLayout dst; | ||||
@@ -511,13 +521,13 @@ TEST_F(CUDA, CONVOLUTION_BWD_DATA_BENCHMARK) { | |||||
} | } | ||||
auto time_ms_fp32 = bench.execl({filter, dst, src}) / RUNS; | auto time_ms_fp32 = bench.execl({filter, dst, src}) / RUNS; | ||||
src.dtype = filter.dtype = dst.dtype = dtype::Float16(); | src.dtype = filter.dtype = dst.dtype = dtype::Float16(); | ||||
bench.proxy()->target_algo_info.reset(); | |||||
bench.proxy()->target_execution_policy.algo.reset(); | |||||
bench.set_dtype(0, dtype::Float16()) | bench.set_dtype(0, dtype::Float16()) | ||||
.set_dtype(1, dtype::Float16()) | .set_dtype(1, dtype::Float16()) | ||||
.set_dtype(2, dtype::Float16()); | .set_dtype(2, dtype::Float16()); | ||||
auto time_ms_true_fp16 = bench.execl({filter, dst, src}) / RUNS; | auto time_ms_true_fp16 = bench.execl({filter, dst, src}) / RUNS; | ||||
param.compute_mode = param::Convolution::ComputeMode::FLOAT32; | param.compute_mode = param::Convolution::ComputeMode::FLOAT32; | ||||
bench.proxy()->target_algo_info.reset(); | |||||
bench.proxy()->target_execution_policy.algo.reset(); | |||||
bench.set_param(param); | bench.set_param(param); | ||||
auto time_ms_pseudo_fp16 = bench.execl({filter, dst, src}) / RUNS; | auto time_ms_pseudo_fp16 = bench.execl({filter, dst, src}) / RUNS; | ||||
float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH; | float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH; | ||||
@@ -554,6 +564,62 @@ TEST_F(CUDA, CONVOLUTION_BWD_DATA_BENCHMARK) { | |||||
run(32, 64, 64, 56, 56, 1, 1, 0); | run(32, 64, 64, 56, 56, 1, 1, 0); | ||||
} | } | ||||
//! Benchmark ConvolutionBackwardData on CUDA with BFloat16 src/filter/dst. | |||||
//! For each shape the proxy's target execution policy is cleared before the | |||||
//! run, and the average time per run plus the derived TFLOPS are printed. | |||||
TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_BF16) { | |||||
CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()}; | |||||
std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{ | |||||
new OprProxy<ConvolutionBackwardData>{true}}; | |||||
size_t RUNS = 10; | |||||
bench.set_proxy(proxy).set_times(RUNS); | |||||
//! run one case: square FHxFH filter, same stride SH and padding PH on both axes | |||||
auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, | |||||
size_t FH, size_t SH, size_t PH) { | |||||
bench.set_dtype(0, dtype::BFloat16()) | |||||
.set_dtype(1, dtype::BFloat16()) | |||||
.set_dtype(2, dtype::BFloat16()); | |||||
param::Convolution param; | |||||
param.stride_h = param.stride_w = SH; | |||||
param.pad_h = param.pad_w = PH; | |||||
param.compute_mode = param::Convolution::ComputeMode::DEFAULT; | |||||
bench.set_param(param); | |||||
//! clear the policy stored on the proxy before benchmarking this shape | |||||
bench.proxy()->target_execution_policy = {}; | |||||
TensorLayout src{{N, IC, IH, IW}, dtype::BFloat16()}, | |||||
filter{{OC, IC, FH, FH}, dtype::BFloat16()}; | |||||
TensorLayout dst; | |||||
{ | |||||
//! a forward Convolution operator is used only to deduce the dst layout | |||||
auto&& opr = handle_cuda()->create_operator<Convolution>(); | |||||
opr->param() = param; | |||||
opr->deduce_layout(src, filter, dst); | |||||
} | |||||
auto used = bench.execl({filter, dst, src}) / RUNS; | |||||
float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH; | |||||
printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(), | |||||
filter.to_string().c_str(), dst.to_string().c_str()); | |||||
//! label fixed: this benchmark runs in BFloat16, not fp32 | |||||
printf("time_bf16=%.2fms, flops=%.3fTFLOPS\n", used, | |||||
(flo / (used * 1e9))); | |||||
}; | |||||
run(32, 64, 3, 224, 224, 7, 2, 3); | |||||
run(32, 128, 128, 28, 28, 3, 1, 1); | |||||
run(32, 256, 256, 14, 14, 3, 1, 1); | |||||
run(32, 512, 512, 7, 7, 3, 1, 1); | |||||
run(32, 64, 64, 56, 56, 3, 1, 1); | |||||
run(32, 512, 256, 56, 56, 1, 2, 0); | |||||
run(32, 1024, 512, 28, 28, 1, 2, 0); | |||||
run(32, 2048, 1024, 14, 14, 1, 2, 0); | |||||
run(32, 512, 128, 28, 28, 1, 1, 0); | |||||
run(32, 128, 512, 28, 28, 1, 1, 0); | |||||
run(32, 1024, 256, 14, 14, 1, 1, 0); | |||||
run(32, 256, 1024, 14, 14, 1, 1, 0); | |||||
run(32, 2048, 512, 7, 7, 1, 1, 0); | |||||
run(32, 512, 2048, 7, 7, 1, 1, 0); | |||||
run(32, 256, 64, 56, 56, 1, 1, 0); | |||||
run(32, 64, 256, 56, 56, 1, 1, 0); | |||||
run(32, 128, 256, 56, 56, 1, 2, 0); | |||||
run(32, 256, 512, 28, 28, 1, 2, 0); | |||||
run(32, 512, 1024, 14, 14, 1, 2, 0); | |||||
run(32, 64, 64, 56, 56, 1, 1, 0); | |||||
} | |||||
TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) { | TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) { | ||||
CUBenchmarker<ConvolutionBackwardFilter> bench{handle_cuda()}; | CUBenchmarker<ConvolutionBackwardFilter> bench{handle_cuda()}; | ||||
std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{ | std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{ | ||||
@@ -571,7 +637,7 @@ TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) { | |||||
param.pad_h = param.pad_w = PH; | param.pad_h = param.pad_w = PH; | ||||
param.compute_mode = param::Convolution::ComputeMode::DEFAULT; | param.compute_mode = param::Convolution::ComputeMode::DEFAULT; | ||||
bench.set_param(param); | bench.set_param(param); | ||||
bench.proxy()->target_algo_info.reset(); | |||||
bench.proxy()->target_execution_policy.algo.reset(); | |||||
TensorLayout src{{N, IC, IH, IW}, dtype::Float32()}, | TensorLayout src{{N, IC, IH, IW}, dtype::Float32()}, | ||||
filter{{OC, IC, FH, FH}, dtype::Float32()}; | filter{{OC, IC, FH, FH}, dtype::Float32()}; | ||||
TensorLayout dst; | TensorLayout dst; | ||||
@@ -582,13 +648,13 @@ TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) { | |||||
} | } | ||||
auto time_ms_fp32 = bench.execl({src, dst, filter}) / RUNS; | auto time_ms_fp32 = bench.execl({src, dst, filter}) / RUNS; | ||||
src.dtype = filter.dtype = dst.dtype = dtype::Float16(); | src.dtype = filter.dtype = dst.dtype = dtype::Float16(); | ||||
bench.proxy()->target_algo_info.reset(); | |||||
bench.proxy()->target_execution_policy.algo.reset(); | |||||
bench.set_dtype(0, dtype::Float16()) | bench.set_dtype(0, dtype::Float16()) | ||||
.set_dtype(1, dtype::Float16()) | .set_dtype(1, dtype::Float16()) | ||||
.set_dtype(2, dtype::Float16()); | .set_dtype(2, dtype::Float16()); | ||||
auto time_ms_true_fp16 = bench.execl({src, dst, filter}) / RUNS; | auto time_ms_true_fp16 = bench.execl({src, dst, filter}) / RUNS; | ||||
param.compute_mode = param::Convolution::ComputeMode::FLOAT32; | param.compute_mode = param::Convolution::ComputeMode::FLOAT32; | ||||
bench.proxy()->target_algo_info.reset(); | |||||
bench.proxy()->target_execution_policy.algo.reset(); | |||||
bench.set_param(param); | bench.set_param(param); | ||||
auto time_ms_pseudo_fp16 = bench.execl({src, dst, filter}) / RUNS; | auto time_ms_pseudo_fp16 = bench.execl({src, dst, filter}) / RUNS; | ||||
float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH; | float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH; | ||||
@@ -630,8 +696,7 @@ TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) { | |||||
#undef V | #undef V | ||||
#undef V1 | #undef V1 | ||||
} // namespace test | |||||
} // namespace megdnn | |||||
} // namespace test | |||||
} // namespace megdnn | |||||
// vim: syntax=cpp.doxygen | // vim: syntax=cpp.doxygen |
@@ -778,7 +778,7 @@ TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_BWD_FILTER) { | |||||
.set_dtype(2, dtype::Float32()) | .set_dtype(2, dtype::Float32()) | ||||
.set_rng(0, &rng) | .set_rng(0, &rng) | ||||
.set_rng(1, &rng); | .set_rng(1, &rng); | ||||
bencher.proxy()->target_algo_info.reset(); | |||||
bencher.proxy()->target_execution_policy.algo.reset(); | |||||
auto time_in_ms = bencher.execs({src, diff, grad}) / RUNS; | auto time_in_ms = bencher.execs({src, diff, grad}) / RUNS; | ||||
printf("src=%s, diff=%s, grad=%s, float32: %.2fms " | printf("src=%s, diff=%s, grad=%s, float32: %.2fms " | ||||
@@ -856,7 +856,7 @@ TEST_F(CUDA, BENCHMARK_GROUP_LOCAL_SHARE_FORWARD) { | |||||
.set_dtype(2, dtype::Float32()) | .set_dtype(2, dtype::Float32()) | ||||
.set_rng(0, &rng) | .set_rng(0, &rng) | ||||
.set_rng(1, &rng); | .set_rng(1, &rng); | ||||
bencher.proxy()->target_algo_info.reset(); | |||||
bencher.proxy()->target_execution_policy.algo.reset(); | |||||
auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS; | auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS; | ||||
; | ; | ||||
@@ -915,7 +915,7 @@ TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_BWD_DATA) { | |||||
.set_dtype(2, dtype::Float32()) | .set_dtype(2, dtype::Float32()) | ||||
.set_rng(0, &rng) | .set_rng(0, &rng) | ||||
.set_rng(1, &rng); | .set_rng(1, &rng); | ||||
bencher.proxy()->target_algo_info.reset(); | |||||
bencher.proxy()->target_execution_policy.algo.reset(); | |||||
auto time_in_ms = bencher.execs({filter, diff, grad}) / RUNS; | auto time_in_ms = bencher.execs({filter, diff, grad}) / RUNS; | ||||
printf("filter=%s, diff=%s, grad=%s, float32: %.2fms " | printf("filter=%s, diff=%s, grad=%s, float32: %.2fms " | ||||
@@ -1002,11 +1002,11 @@ TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_FORWARD_BOTTLENECK) { | |||||
.set_dtype(2, dtype::Float32()) | .set_dtype(2, dtype::Float32()) | ||||
.set_rng(0, &rng) | .set_rng(0, &rng) | ||||
.set_rng(1, &rng); | .set_rng(1, &rng); | ||||
bencher.proxy()->target_algo_info.reset(); | |||||
bencher.proxy()->target_execution_policy.algo.reset(); | |||||
auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS; | auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS; | ||||
bencher_conv.set_param(conv_param); | bencher_conv.set_param(conv_param); | ||||
bencher_conv.proxy()->target_algo_info.reset(); | |||||
bencher_conv.proxy()->target_execution_policy.algo.reset(); | |||||
auto time_in_ms_conv = | auto time_in_ms_conv = | ||||
bencher_conv.execs({src, {oc, ic, f, f}, {}}) / RUNS; | bencher_conv.execs({src, {oc, ic, f, f}, {}}) / RUNS; | ||||
@@ -1094,11 +1094,11 @@ TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_FORWARD_FROM_RESEARCH) { | |||||
.set_dtype(2, dtype::Float32()) | .set_dtype(2, dtype::Float32()) | ||||
.set_rng(0, &rng) | .set_rng(0, &rng) | ||||
.set_rng(1, &rng); | .set_rng(1, &rng); | ||||
bencher.proxy()->target_algo_info.reset(); | |||||
bencher.proxy()->target_execution_policy.algo.reset(); | |||||
auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS; | auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS; | ||||
bencher_conv.set_param(conv_param); | bencher_conv.set_param(conv_param); | ||||
bencher_conv.proxy()->target_algo_info.reset(); | |||||
bencher_conv.proxy()->target_execution_policy.algo.reset(); | |||||
auto time_in_ms_conv = | auto time_in_ms_conv = | ||||
bencher_conv.execs({src, {oc, ic, f, f}, {}}) / RUNS; | bencher_conv.execs({src, {oc, ic, f, f}, {}}) / RUNS; | ||||
@@ -1177,11 +1177,11 @@ TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_FORWARD) { | |||||
.set_dtype(2, dtype::Float32()) | .set_dtype(2, dtype::Float32()) | ||||
.set_rng(0, &rng) | .set_rng(0, &rng) | ||||
.set_rng(1, &rng); | .set_rng(1, &rng); | ||||
bencher.proxy()->target_algo_info.reset(); | |||||
bencher.proxy()->target_execution_policy.algo.reset(); | |||||
auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS; | auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS; | ||||
bencher_conv.set_param(conv_param); | bencher_conv.set_param(conv_param); | ||||
bencher_conv.proxy()->target_algo_info.reset(); | |||||
bencher_conv.proxy()->target_execution_policy.algo.reset(); | |||||
auto time_in_ms_conv = | auto time_in_ms_conv = | ||||
bencher_conv.execs({src, {oc, ic, f, f}, {}}) / RUNS; | bencher_conv.execs({src, {oc, ic, f, f}, {}}) / RUNS; | ||||
@@ -10,6 +10,7 @@ | |||||
*/ | */ | ||||
#include <gtest/gtest.h> | #include <gtest/gtest.h> | ||||
#include "megdnn/basic_types.h" | |||||
#include "src/common/utils.h" | #include "src/common/utils.h" | ||||
#include "test/common/random_state.h" | #include "test/common/random_state.h" | ||||
@@ -21,9 +22,29 @@ class ResetSeedListener : public ::testing::EmptyTestEventListener { | |||||
} | } | ||||
}; | }; | ||||
megdnn::LogLevel min_log_level; | |||||
//! Set the global min_log_level from the MEGDNN_LOG_LEVEL environment variable. | |||||
//! Recognised values are "INFO", "DEBUG", "WARN" and "ERROR" (exact, | |||||
//! case-sensitive strcmp match). If the variable is unset the level is INFO; | |||||
//! any other value fails the megdnn_assert below. | |||||
void set_log_level() { | |||||
megdnn::LogLevel level = megdnn::LogLevel::INFO; | |||||
auto setting = std::getenv("MEGDNN_LOG_LEVEL"); | |||||
if (setting) { | |||||
if (!strcmp(setting, "INFO")) { | |||||
level = megdnn::LogLevel::INFO; | |||||
} else if (!strcmp(setting, "DEBUG")) { | |||||
level = megdnn::LogLevel::DEBUG; | |||||
} else if (!strcmp(setting, "WARN")) { | |||||
level = megdnn::LogLevel::WARN; | |||||
} else { | |||||
//! the only remaining accepted value is "ERROR" | |||||
megdnn_assert(!strcmp(setting, "ERROR")); | |||||
level = megdnn::LogLevel::ERROR; | |||||
} | |||||
} | |||||
//! publish the result to the file-level variable | |||||
min_log_level = level; | |||||
} | |||||
void log_handler(megdnn::LogLevel level, const char* file, const char* func, | void log_handler(megdnn::LogLevel level, const char* file, const char* func, | ||||
int line, const char* fmt, va_list ap) { | int line, const char* fmt, va_list ap) { | ||||
if (level < megdnn::LogLevel::ERROR) { | |||||
if (level < min_log_level) { | |||||
return; | return; | ||||
} | } | ||||
char msg[1024]; | char msg[1024]; | ||||
@@ -39,6 +60,7 @@ void log_handler(megdnn::LogLevel level, const char* file, const char* func, | |||||
extern "C" int gtest_main(int argc, char** argv) { | extern "C" int gtest_main(int argc, char** argv) { | ||||
::megdnn::set_log_handler(log_handler); | ::megdnn::set_log_handler(log_handler); | ||||
set_log_level(); | |||||
ResetSeedListener listener; | ResetSeedListener listener; | ||||
auto&& listeners = ::testing::UnitTest::GetInstance()->listeners(); | auto&& listeners = ::testing::UnitTest::GetInstance()->listeners(); | ||||
::testing::InitGoogleTest(&argc, argv); | ::testing::InitGoogleTest(&argc, argv); | ||||
@@ -450,6 +450,7 @@ TEST_F(X86, BENCHMARK_CONVOLUTION_I8x8x32_MKLDNN) { | |||||
} | } | ||||
} | } | ||||
#endif | #endif | ||||
#endif | #endif | ||||
} // namespace test | } // namespace test | ||||
@@ -27,6 +27,7 @@ | |||||
#include "megbrain/gopt/inference.h" | #include "megbrain/gopt/inference.h" | ||||
#include "megbrain/test/helper.h" | #include "megbrain/test/helper.h" | ||||
#include "megdnn/oprs/base.h" | |||||
#include <atomic> | #include <atomic> | ||||
#include <chrono> | #include <chrono> | ||||
@@ -1924,19 +1925,19 @@ TEST(TestGraph, NaiveRecord2NCHW44) { | |||||
namespace { | namespace { | ||||
template <typename DnnOp, typename... Args> | template <typename DnnOp, typename... Args> | ||||
typename DnnOp::AlgorithmInfo try_find_any_weight_preprocess_algo( | |||||
typename megdnn::ExecutionPolicy try_find_any_weight_preprocess_algo( | |||||
DnnOp* dnn_op, const char* mgb_info, Maybe<bool>& found, | DnnOp* dnn_op, const char* mgb_info, Maybe<bool>& found, | ||||
Args&& ...args) { | Args&& ...args) { | ||||
if (found.valid()) { | if (found.valid()) { | ||||
if (found.val()) { | if (found.val()) { | ||||
return dnn_op->execution_policy().algo; | |||||
return dnn_op->execution_policy(); | |||||
} else { | } else { | ||||
return {}; | return {}; | ||||
} | } | ||||
} | } | ||||
for (auto&& algo : dnn_op->get_all_algorithms_info( | for (auto&& algo : dnn_op->get_all_algorithms_info( | ||||
std::forward<Args>(args)...)) { | std::forward<Args>(args)...)) { | ||||
dnn_op->execution_policy().algo = algo; | |||||
dnn_op->execution_policy().algo = algo.desc; | |||||
auto layouts = dnn_op->deduce_preprocessed_filter_layout( | auto layouts = dnn_op->deduce_preprocessed_filter_layout( | ||||
std::forward<Args>(args)...); | std::forward<Args>(args)...); | ||||
if (layouts.empty()) continue; | if (layouts.empty()) continue; | ||||
@@ -1949,7 +1950,7 @@ typename DnnOp::AlgorithmInfo try_find_any_weight_preprocess_algo( | |||||
} | } | ||||
if (valid) { | if (valid) { | ||||
found.emplace(true); | found.emplace(true); | ||||
return algo; | |||||
return {algo.desc, {}}; | |||||
} | } | ||||
} | } | ||||
found.emplace(false); | found.emplace(false); | ||||
@@ -1958,19 +1959,19 @@ typename DnnOp::AlgorithmInfo try_find_any_weight_preprocess_algo( | |||||
} | } | ||||
template <typename DnnOp, typename... Args> | template <typename DnnOp, typename... Args> | ||||
typename DnnOp::AlgorithmInfo try_find_any_bias_preprocess_algo( | |||||
typename megdnn::ExecutionPolicy try_find_any_bias_preprocess_algo( | |||||
DnnOp* dnn_op, const char* mgb_info, Maybe<bool>& found, | DnnOp* dnn_op, const char* mgb_info, Maybe<bool>& found, | ||||
Args&& ...args) { | Args&& ...args) { | ||||
if (found.valid()) { | if (found.valid()) { | ||||
if (found.val()) { | if (found.val()) { | ||||
return dnn_op->execution_policy().algo; | |||||
return dnn_op->execution_policy(); | |||||
} else { | } else { | ||||
return {}; | return {}; | ||||
} | } | ||||
} | } | ||||
for (auto&& algo : dnn_op->get_all_algorithms_info( | for (auto&& algo : dnn_op->get_all_algorithms_info( | ||||
std::forward<Args>(args)...)) { | std::forward<Args>(args)...)) { | ||||
dnn_op->execution_policy().algo = algo; | |||||
dnn_op->execution_policy().algo = algo.desc; | |||||
auto layouts = dnn_op->deduce_preprocessed_filter_layout( | auto layouts = dnn_op->deduce_preprocessed_filter_layout( | ||||
std::forward<Args>(args)...); | std::forward<Args>(args)...); | ||||
if (layouts.size() <= 1) | if (layouts.size() <= 1) | ||||
@@ -1981,7 +1982,7 @@ typename DnnOp::AlgorithmInfo try_find_any_bias_preprocess_algo( | |||||
} | } | ||||
if (valid) { | if (valid) { | ||||
found.emplace(true); | found.emplace(true); | ||||
return algo; | |||||
return {algo.desc, {}}; | |||||
} | } | ||||
} | } | ||||
found.emplace(false); | found.emplace(false); | ||||
@@ -11,6 +11,7 @@ | |||||
*/ | */ | ||||
#include "megbrain/opr/search_policy/algo_chooser.h" | #include "megbrain/opr/search_policy/algo_chooser.h" | ||||
#include "megbrain/opr/internal/megdnn_opr_wrapper.h" | |||||
#include "megbrain/opr/search_policy/algo_chooser_helper.h" | #include "megbrain/opr/search_policy/algo_chooser_helper.h" | ||||
#include "megbrain/opr/search_policy/profiler.h" | #include "megbrain/opr/search_policy/profiler.h" | ||||
@@ -21,6 +22,7 @@ | |||||
//! TODO: here has to be know some megdnn::opr when there is produced midout.h | //! TODO: here has to be know some megdnn::opr when there is produced midout.h | ||||
//! fix it if there is another graceful way. | //! fix it if there is another graceful way. | ||||
#include "megdnn/oprs.h" | #include "megdnn/oprs.h" | ||||
#include "megdnn/oprs/base.h" | |||||
#include "midout.h" | #include "midout.h" | ||||
MIDOUT_DECL(megbrain_opr_algo_chooser) | MIDOUT_DECL(megbrain_opr_algo_chooser) | ||||
#define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_algo_chooser, __VA_ARGS__) { | #define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_algo_chooser, __VA_ARGS__) { | ||||
@@ -29,6 +31,8 @@ MIDOUT_DECL(megbrain_opr_algo_chooser) | |||||
MIDOUT_END(); | MIDOUT_END(); | ||||
using mgb::opr::intl::WorkspaceLimitGetter; | using mgb::opr::intl::WorkspaceLimitGetter; | ||||
using namespace megdnn; | |||||
using namespace mgb; | |||||
#define APPLY(statement, ...) \ | #define APPLY(statement, ...) \ | ||||
mgb::apply([&](const auto&... args) { return statement; }, \ | mgb::apply([&](const auto&... args) { return statement; }, \ | ||||
@@ -37,7 +41,7 @@ using mgb::opr::intl::WorkspaceLimitGetter; | |||||
// timeout delta to be added with fastest known algorithm for new algos | // timeout delta to be added with fastest known algorithm for new algos | ||||
constexpr double TIMEOUT_TOLERANCE = 2; | constexpr double TIMEOUT_TOLERANCE = 2; | ||||
#define CACHE_KEY_VERSION "v3" | |||||
#define CACHE_KEY_VERSION "v4" | |||||
namespace { | namespace { | ||||
template <typename Opr> | template <typename Opr> | ||||
@@ -48,44 +52,191 @@ std::string profile_name(Opr* opr) { | |||||
ret.append(opr->get_algorithm_set_name()); | ret.append(opr->get_algorithm_set_name()); | ||||
return ret; | return ret; | ||||
} | } | ||||
template <typename Opr> | |||||
std::string format_fixlayouts( | |||||
const typename opr::AlgoChooser<Opr>::FixedTensorLayouts& layouts, | |||||
size_t arity_in, size_t arity_out) { | |||||
std::string ret; | |||||
ret.append(": tensor layouts("); | |||||
for (size_t i = 0; i < arity_in; ++i) { | |||||
if (i) { | |||||
ret.append(", "); | |||||
} | |||||
ret.append(layouts[i].to_string() + " "); | |||||
ret.append(layouts[i].dtype.name()); | |||||
} | |||||
ret.append(") -> ("); | |||||
for (size_t i = 0; i < arity_out; ++i) { | |||||
if (i) { | |||||
ret.append(", "); | |||||
} | |||||
ret.append(layouts[i + arity_in].to_string() + " "); | |||||
ret.append(layouts[i + arity_in].dtype.name()); | |||||
} | |||||
return ret; | |||||
} | |||||
///////////////// OprTypeTrait ///////////////////////////// | |||||
template <megdnn::Algorithm::OprType> | |||||
struct OprFromOprTypeTrait; | |||||
template <typename Opr> | |||||
struct OprTypeFromOprTrait; | |||||
#define cb(_opr_type, _opr) \ | |||||
template <> \ | |||||
struct OprFromOprTypeTrait<megdnn::Algorithm::OprType::_opr_type> { \ | |||||
using Opr = megdnn::_opr; \ | |||||
}; \ | |||||
template <> \ | |||||
struct OprTypeFromOprTrait<megdnn::_opr> { \ | |||||
constexpr static megdnn::Algorithm::OprType opr_type = \ | |||||
megdnn::Algorithm::OprType::_opr_type; \ | |||||
} | |||||
cb(MATRIX_MUL_FORWARD, MatrixMulForward); | |||||
cb(BATCHED_MATRIX_MUL_FORWARD, BatchedMatrixMulForward); | |||||
cb(CONVOLUTION_FORWARD, ConvolutionForward); | |||||
cb(CONVOLUTION_BACKWARD_DATA, ConvolutionBackwardData); | |||||
cb(CONVOLUTION_BACKWARD_FILTER, ConvolutionBackwardFilter); | |||||
cb(CONVOLUTION3D_FORWARD, Convolution3DForward); | |||||
cb(CONVOLUTION3D_BACKWARD_DATA, Convolution3DBackwardData); | |||||
cb(CONVOLUTION3D_BACKWARD_FILTER, Convolution3DBackwardFilter); | |||||
cb(LOCAL_SHARE_FORWARD, LocalShareForward); | |||||
cb(LOCAL_SHARE_BACKWARD_DATA, LocalShareBackwardData); | |||||
cb(LOCAL_SHARE_BACKWARD_FILTER, LocalShareBackwardFilter); | |||||
cb(DEFORMABLE_CONV_FORWARD, DeformableConvForward); | |||||
cb(DEFORMABLE_CONV_BACKWARD_DATA, DeformableConvBackwardData); | |||||
cb(DEFORMABLE_CONV_BACKWARD_FILTER, DeformableConvBackwardFilter); | |||||
cb(BATCH_CONV_FORWARD, BatchConvBiasForward); | |||||
cb(CONVBIAS_FORWARD, ConvBiasForward); | |||||
#undef cb | |||||
// clang-format off | |||||
#define FOREACH_OPR_TYPE_WITH_STMT(cb, stmt) \ | |||||
cb(MATRIX_MUL_FORWARD, stmt) \ | |||||
cb(BATCHED_MATRIX_MUL_FORWARD, stmt) \ | |||||
cb(CONVOLUTION_FORWARD, stmt) \ | |||||
cb(CONVOLUTION_BACKWARD_DATA, stmt) \ | |||||
cb(CONVOLUTION_BACKWARD_FILTER, stmt) \ | |||||
cb(CONVOLUTION3D_FORWARD, stmt) \ | |||||
cb(CONVOLUTION3D_BACKWARD_DATA, stmt) \ | |||||
cb(CONVOLUTION3D_BACKWARD_FILTER, stmt) \ | |||||
cb(LOCAL_SHARE_FORWARD, stmt) \ | |||||
cb(LOCAL_SHARE_BACKWARD_DATA, stmt) \ | |||||
cb(LOCAL_SHARE_BACKWARD_FILTER, stmt) \ | |||||
cb(DEFORMABLE_CONV_FORWARD, stmt) \ | |||||
cb(DEFORMABLE_CONV_BACKWARD_DATA, stmt) \ | |||||
cb(DEFORMABLE_CONV_BACKWARD_FILTER, stmt) \ | |||||
cb(BATCH_CONV_FORWARD, stmt) \ | |||||
cb(CONVBIAS_FORWARD, stmt) | |||||
// clang-format on | |||||
#define _OPR_TYPE_CASE(_opr_type, _stmt) \ | |||||
case Algorithm::OprType::_opr_type: { \ | |||||
using _Opr = typename OprFromOprTypeTrait< \ | |||||
Algorithm::OprType::_opr_type>::Opr; \ | |||||
_stmt; \ | |||||
break; \ | |||||
} | |||||
#define FOREACH_OPR_TYPE_DISPATCH(_search_items, _stmt) \ | |||||
for (size_t _item_idx = 0; _item_idx < _search_items.size(); \ | |||||
_item_idx++) { \ | |||||
auto&& _item = _search_items[_item_idx]; \ | |||||
switch (_item.opr_type) { \ | |||||
FOREACH_OPR_TYPE_WITH_STMT(_OPR_TYPE_CASE, _stmt) \ | |||||
default: \ | |||||
mgb_throw(MegBrainError, "unknown opr_type"); \ | |||||
} \ | |||||
} | |||||
template <typename Opr> | |||||
TensorLayoutArray to_layout_array( | |||||
const typename opr::AlgoChooser<Opr>::FixedTensorLayouts& layouts) { | |||||
TensorLayoutArray ret; | |||||
for (auto&& layout : layouts) { | |||||
ret.push_back(layout); | |||||
} | |||||
return ret; | |||||
} | } | ||||
template <typename Opr> | |||||
typename opr::AlgoChooser<Opr>::FixedTensorLayouts to_fixed_layouts( | |||||
const TensorLayoutArray& layouts) { | |||||
typename opr::AlgoChooser<Opr>::FixedTensorLayouts ret; | |||||
mgb_assert(ret.size() == layouts.size()); | |||||
size_t idx = 0; | |||||
for (auto&& layout : layouts) { | |||||
ret[idx++] = layout; | |||||
} | |||||
return ret; | |||||
} | |||||
} // namespace | |||||
namespace mgb { | namespace mgb { | ||||
namespace opr { | namespace opr { | ||||
template <typename Opr> | template <typename Opr> | ||||
AlgoChooserProfileCache::Result AlgoChooser<Opr>::get_profile_result( | |||||
ExeContext& ctx, bool enable_update) { | |||||
AlgoChooserProfileCache cache(ctx.mgb_opr()->comp_node(), | |||||
profile_name(ctx.megdnn_opr()).c_str()); | |||||
TensorLayoutArray origin_layouts = ctx.layouts(); | |||||
typename Opr::Param origin_param = ctx.mgb_opr()->param(); | |||||
AlgoChooserProfileCache::Key cache_key{origin_layouts.data(), | |||||
origin_layouts.size(), &origin_param, | |||||
sizeof(origin_param)}; | |||||
{ | |||||
auto&& rst = cache.get(cache_key); | |||||
if (rst.valid()) | |||||
return rst.val(); | |||||
std::vector<megdnn::Algorithm::SearchItem> | |||||
AlgoChooser<Opr>::flatten_search_space(const ExeContext& ctx) { | |||||
std::vector<megdnn::Algorithm::SearchItem> ret; | |||||
for (auto algo_info : ctx.get_all_candidates()) { | |||||
megdnn::Algorithm* algo = ctx.get_algorithm_from_desc(algo_info.desc); | |||||
mgb_assert(algo, "Unknown algo description"); | |||||
std::vector<megdnn::Algorithm::SearchItem>&& sub_items = | |||||
algo->get_subopr_list(to_layout_array<Opr>(ctx.layouts()), | |||||
ctx.megdnn_opr()); | |||||
FOREACH_OPR_TYPE_DISPATCH(sub_items, { | |||||
auto&& megdnn_opr = intl::create_megdnn_opr<_Opr>(ctx.comp_node()); | |||||
megdnn_opr->param() = | |||||
Algorithm::deserialize_read_pod<typename _Opr::Param>( | |||||
_item.param); | |||||
typename AlgoChooser<_Opr>::ExeContext sub_ctx( | |||||
to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(), | |||||
_item.param, ctx.mgb_opr(), ctx.comp_node(), | |||||
ctx.execution_policy(), ctx.allow_weight_preprocess()); | |||||
auto space = AlgoChooser<_Opr>::flatten_search_space(sub_ctx); | |||||
ret.insert(ret.end(), space.begin(), space.end()); | |||||
}); | |||||
} | } | ||||
ret.push_back({OprTypeFromOprTrait<Opr>::opr_type, ctx.param(), | |||||
to_layout_array<Opr>(ctx.layouts())}); | |||||
return ret; | |||||
} | |||||
template <typename Opr> | |||||
void AlgoChooser<Opr>::profile(ExeContext& ctx, bool require_reproducible) { | |||||
if (ctx.get_profile_result_from_cache(require_reproducible).valid()) | |||||
return; | |||||
AlgoChooserProfileCache::Result prof_rst; | AlgoChooserProfileCache::Result prof_rst; | ||||
if (!enable_update) | |||||
return prof_rst; | |||||
std::string str_on_inp_shape = ssprintf( | std::string str_on_inp_shape = ssprintf( | ||||
"on input layouts (%s, %s)", ctx.layouts()[0].to_string().c_str(), | "on input layouts (%s, %s)", ctx.layouts()[0].to_string().c_str(), | ||||
ctx.layouts()[1].to_string().c_str()); | ctx.layouts()[1].to_string().c_str()); | ||||
double cur_timeout = 0; | double cur_timeout = 0; | ||||
auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit( | |||||
ctx.owner_graph(), ctx.comp_node(), | |||||
ctx.execution_policy().workspace_limit); | |||||
RealTimer timer; | RealTimer timer; | ||||
for (auto algo : ctx.get_all_candidates_with_workspace_limit()) { | |||||
for (auto algo : ctx.get_all_candidates()) { | |||||
Maybe<AlgoChooserProfileCache::ResultEntry> cur_rst; | Maybe<AlgoChooserProfileCache::ResultEntry> cur_rst; | ||||
std::string msg = ssprintf("profiling %s algorithm %s %s", | std::string msg = ssprintf("profiling %s algorithm %s %s", | ||||
ctx.mgb_opr()->dyn_typeinfo()->name, | ctx.mgb_opr()->dyn_typeinfo()->name, | ||||
algo.name.c_str(), str_on_inp_shape.c_str()); | algo.name.c_str(), str_on_inp_shape.c_str()); | ||||
ImplExecutionPolicy policy; | |||||
policy.algo = algo.desc; | |||||
ctx.construct_execution_policy_from_cache(require_reproducible, policy); | |||||
if (ctx.get_workspace_size_bytes(policy) >= workspace_limit) | |||||
continue; | |||||
timer.reset(); | timer.reset(); | ||||
MGB_TRY { cur_rst = ctx.profile_single_algo(algo, cur_timeout); } | |||||
MGB_TRY { cur_rst = ctx.profile_single_algo(policy, cur_timeout); } | |||||
MGB_CATCH(std::exception & exc, { | MGB_CATCH(std::exception & exc, { | ||||
mgb_log_warn("caught exception during %s: %s", msg.c_str(), | mgb_log_warn("caught exception during %s: %s", msg.c_str(), | ||||
exc.what()); | exc.what()); | ||||
@@ -114,120 +265,100 @@ AlgoChooserProfileCache::Result AlgoChooser<Opr>::get_profile_result( | |||||
mgb_assert(!prof_rst.empty(), "no usable convolution algorithm %s", | mgb_assert(!prof_rst.empty(), "no usable convolution algorithm %s", | ||||
str_on_inp_shape.c_str()); | str_on_inp_shape.c_str()); | ||||
FixedTensorLayouts origin_layouts = ctx.layouts(); | |||||
typename Opr::Param origin_param = ctx.megdnn_opr()->param(); | |||||
AlgoChooserProfileCache::Key cache_key{origin_layouts.data(), | |||||
origin_layouts.size(), &origin_param, | |||||
sizeof(origin_param)}; | |||||
AlgoChooserProfileCache cache(ctx.comp_node(), | |||||
profile_name(ctx.megdnn_opr()).c_str()); | |||||
cache.put(cache_key, prof_rst); | cache.put(cache_key, prof_rst); | ||||
return prof_rst; | |||||
} | } | ||||
template <typename Opr> | template <typename Opr> | ||||
typename AlgoChooser<Opr>::ImplAlgo AlgoChooser<Opr>::choose_by_profile( | |||||
ExeContext& ctx, bool require_reproducible, bool enable_update) { | |||||
typename AlgoChooser<Opr>::ImplExecutionPolicy | |||||
AlgoChooser<Opr>::choose_by_profile(ExeContext& ctx, bool require_reproducible, | |||||
bool enable_update) { | |||||
MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("AlgoChooser::choose_by_profile"))) | MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("AlgoChooser::choose_by_profile"))) | ||||
auto opr = ctx.mgb_opr(); | |||||
if (opr->owner_graph()->options().no_profiling_on_shape_change) { | |||||
auto algo = ctx.megdnn_opr()->execution_policy().algo; | |||||
if (algo.valid()) | |||||
return algo; | |||||
if (ctx.owner_graph()->options().no_profiling_on_shape_change) { | |||||
auto policy = ctx.megdnn_opr()->execution_policy(); | |||||
if (policy.algo.valid()) | |||||
return policy; | |||||
} | } | ||||
std::unordered_map<std::string, ImplAlgo> algo_map; | |||||
for (auto i : ctx.get_all_candidates()) { | |||||
auto ins = algo_map.emplace(i.name.c_str(), i); | |||||
mgb_assert(ins.second, "duplicated algo name: %s", i.name.c_str()); | |||||
if (enable_update) { | |||||
auto&& search_items = flatten_search_space(ctx); | |||||
FOREACH_OPR_TYPE_DISPATCH(search_items, { | |||||
auto&& megdnn_opr = intl::create_megdnn_opr<_Opr>(ctx.comp_node()); | |||||
megdnn_opr->param() = | |||||
Algorithm::deserialize_read_pod<typename _Opr::Param>( | |||||
_item.param); | |||||
typename AlgoChooser<_Opr>::ExeContext sub_ctx( | |||||
to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(), | |||||
_item.param, ctx.mgb_opr(), ctx.comp_node(), | |||||
ctx.execution_policy(), ctx.allow_weight_preprocess()); | |||||
AlgoChooser<_Opr>::profile(sub_ctx, require_reproducible); | |||||
}); | |||||
} | } | ||||
auto&& prof = get_profile_result(ctx, enable_update); | |||||
if (prof.empty()) | |||||
return {}; | |||||
for (auto&& i : prof) { | |||||
if ((!require_reproducible || i.reproducible)) { | |||||
auto iter = algo_map.find(i.algo); | |||||
mgb_assert(iter != algo_map.end(), | |||||
"algorithm %s exists in " | |||||
"profiling result but not in algo_map; please " | |||||
"report this " | |||||
"bug; opr: %s{%s}, shapes: %s %s %s", | |||||
i.algo.c_str(), | |||||
ctx.mgb_opr()->cname(), | |||||
ctx.mgb_opr()->dyn_typeinfo()->name, | |||||
ctx.layouts()[0].TensorShape::to_string().c_str(), | |||||
ctx.layouts()[1].TensorShape::to_string().c_str(), | |||||
ctx.layouts()[2].TensorShape::to_string().c_str()); | |||||
return iter->second; | |||||
} | |||||
} | |||||
mgb_log_error( | |||||
"Workspace requirement (%zu) could not be satisfied. Abort now " | |||||
"to " | |||||
"avoid further problems", | |||||
WorkspaceLimitGetter::get_workspace_limit( | |||||
opr->owner_graph(), opr->comp_node(), | |||||
opr->execution_policy().workspace_limit)); | |||||
mgb_trap(); | |||||
typename AlgoChooser<Opr>::ImplExecutionPolicy policy; | |||||
ctx.construct_execution_policy_from_cache(require_reproducible, policy); | |||||
return policy; | |||||
MIDOUT_E | MIDOUT_E | ||||
} | } | ||||
template <typename Opr> | template <typename Opr> | ||||
size_t AlgoChooser<Opr>::setup_algo(const TensorLayoutArray& layouts, | |||||
size_t AlgoChooser<Opr>::setup_algo(const FixedTensorLayouts& layouts, | |||||
Opr* megdnn_opr, const MGBOpr* mgb_opr, | Opr* megdnn_opr, const MGBOpr* mgb_opr, | ||||
bool allow_weight_preprocess) { | bool allow_weight_preprocess) { | ||||
if (WorkspaceLimitGetter::is_prealloc_run(mgb_opr->owner_graph())) { | if (WorkspaceLimitGetter::is_prealloc_run(mgb_opr->owner_graph())) { | ||||
return 0; | return 0; | ||||
} | } | ||||
ImplAlgo algo = {}; | |||||
ExeContext ctx(layouts, megdnn_opr, mgb_opr, allow_weight_preprocess); | |||||
std::string param_str; | |||||
Algorithm::serialize_write_pod(megdnn_opr->param(), param_str); | |||||
ExeContext ctx(layouts, megdnn_opr, param_str, mgb_opr, | |||||
mgb_opr->comp_node(), mgb_opr->execution_policy(), | |||||
allow_weight_preprocess); | |||||
ImplExecutionPolicy policy; | |||||
if (auto algo_choose_hook = mgb_opr->algo_chooser()) { | if (auto algo_choose_hook = mgb_opr->algo_chooser()) { | ||||
algo = algo_choose_hook(mgb_opr); | |||||
policy = algo_choose_hook(mgb_opr); | |||||
} | } | ||||
if (!algo.valid()) { | |||||
algo = get_algo(ctx); | |||||
if (!policy.algo.valid()) { | |||||
policy = get_policy(ctx); | |||||
} | } | ||||
size_t workspace = ctx.get_workspace_size_bytes(algo); | |||||
size_t workspace = ctx.get_workspace_size_bytes(policy); | |||||
std::string ret; | std::string ret; | ||||
ret.append(mgb_opr->dyn_typeinfo()->name); | ret.append(mgb_opr->dyn_typeinfo()->name); | ||||
ret.append(": tensor layouts("); | |||||
for (size_t i = 0; i < arity_in; ++i) { | |||||
if (i) { | |||||
ret.append(", "); | |||||
} | |||||
ret.append(layouts[i].to_string() + " "); | |||||
ret.append(layouts[i].dtype.name()); | |||||
} | |||||
ret.append(") -> ("); | |||||
for (size_t i = 0; i < arity_out; ++i) { | |||||
if (i) { | |||||
ret.append(", "); | |||||
} | |||||
ret.append(layouts[i + arity_in].to_string() + " "); | |||||
ret.append(layouts[i + arity_in].dtype.name()); | |||||
} | |||||
ret.append("): algo=" + algo.name); | |||||
ret += format_fixlayouts<Opr>(layouts, arity_in, arity_out); | |||||
Algorithm* palgo = megdnn_opr->get_algorithm_from_desc(policy.algo); | |||||
mgb_assert(palgo, "Unknown algo description"); | |||||
ret.append("): algo=" + std::string(palgo->name())); | |||||
ret.append(ssprintf(" workspace=%.2fMiB reproducible=%d", | ret.append(ssprintf(" workspace=%.2fMiB reproducible=%d", | ||||
workspace / (1024 * 1024.0), algo.is_reproducible)); | |||||
workspace / (1024 * 1024.0), palgo->is_reproducible())); | |||||
mgb_log_debug("%s", ret.c_str()); | mgb_log_debug("%s", ret.c_str()); | ||||
megdnn_opr->execution_policy() = {algo}; | |||||
megdnn_opr->execution_policy() = policy; | |||||
return workspace; | return workspace; | ||||
} | } | ||||
template <typename Opr> | template <typename Opr> | ||||
typename AlgoChooser<Opr>::ImplAlgo AlgoChooser<Opr>::get_algo( | |||||
typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::get_policy( | |||||
ExeContext& ctx) { | ExeContext& ctx) { | ||||
using S = mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; | using S = mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; | ||||
MGB_MARK_USED_VAR(TIMEOUT_TOLERANCE); | MGB_MARK_USED_VAR(TIMEOUT_TOLERANCE); | ||||
switch (ctx.mgb_opr()->execution_policy().strategy) { | |||||
switch (ctx.execution_policy().strategy) { | |||||
case S::HEURISTIC: | case S::HEURISTIC: | ||||
return ctx.choose_by_heuristic(); | return ctx.choose_by_heuristic(); | ||||
case S::HEURISTIC_REPRODUCIBLE: | case S::HEURISTIC_REPRODUCIBLE: | ||||
return ctx.choose_by_heuristic(true); | return ctx.choose_by_heuristic(true); | ||||
case S::PROFILE_HEURISTIC: { | case S::PROFILE_HEURISTIC: { | ||||
ImplAlgo algo = choose_by_profile(ctx, false, false); | |||||
if (!algo.valid()) | |||||
algo = ctx.choose_by_heuristic(); | |||||
return algo; | |||||
ImplExecutionPolicy policy = choose_by_profile(ctx, false, false); | |||||
if (!policy.algo.valid()) | |||||
policy = ctx.choose_by_heuristic(); | |||||
return policy; | |||||
} | } | ||||
#if MGB_ENABLE_FASTRUN | #if MGB_ENABLE_FASTRUN | ||||
case S::PROFILE: | case S::PROFILE: | ||||
@@ -241,16 +372,17 @@ typename AlgoChooser<Opr>::ImplAlgo AlgoChooser<Opr>::get_algo( | |||||
} | } | ||||
#define INST(Opr) \ | #define INST(Opr) \ | ||||
template AlgoChooser<megdnn::Opr>::ImplAlgo \ | |||||
AlgoChooser<megdnn::Opr>::get_algo(ExeContext& ctx); \ | |||||
template AlgoChooserProfileCache::Result \ | |||||
AlgoChooser<megdnn::Opr>::get_profile_result(ExeContext& ctx, \ | |||||
bool enable_update); \ | |||||
template AlgoChooser<megdnn::Opr>::ImplAlgo \ | |||||
template AlgoChooser<megdnn::Opr>::ImplExecutionPolicy \ | |||||
AlgoChooser<megdnn::Opr>::get_policy(ExeContext& ctx); \ | |||||
template void AlgoChooser<megdnn::Opr>::profile( \ | |||||
ExeContext& ctx, bool require_reproducible); \ | |||||
template std::vector<megdnn::Algorithm::SearchItem> \ | |||||
AlgoChooser<megdnn::Opr>::flatten_search_space(const ExeContext& ctx); \ | |||||
template AlgoChooser<megdnn::Opr>::ImplExecutionPolicy \ | |||||
AlgoChooser<megdnn::Opr>::choose_by_profile( \ | AlgoChooser<megdnn::Opr>::choose_by_profile( \ | ||||
ExeContext& ctx, bool require_reproducible, bool enable_update); \ | ExeContext& ctx, bool require_reproducible, bool enable_update); \ | ||||
template size_t AlgoChooser<megdnn::Opr>::setup_algo( \ | template size_t AlgoChooser<megdnn::Opr>::setup_algo( \ | ||||
const TensorLayoutArray& layouts, megdnn::Opr* megdnn_opr, \ | |||||
const FixedTensorLayouts& layouts, megdnn::Opr* megdnn_opr, \ | |||||
const MGBOpr* mgb_opr, bool allow_weight_preprocess); | const MGBOpr* mgb_opr, bool allow_weight_preprocess); | ||||
MGB_FOREACH_FASTRUN_OPR(INST) | MGB_FOREACH_FASTRUN_OPR(INST) | ||||
@@ -258,17 +390,109 @@ MGB_FOREACH_FASTRUN_OPR(INST) | |||||
#undef INST | #undef INST | ||||
//////////////////////////////// ExeContext ///////////////////////////// | //////////////////////////////// ExeContext ///////////////////////////// | ||||
template <typename Opr> | |||||
AlgoChooser<Opr>::ExeContext::ExeContext( | |||||
const FixedTensorLayouts& layouts, Opr* megdnn_opr, | |||||
const std::string& param_str, const cg::OperatorNodeBase* mgb_opr, | |||||
const CompNode& cn, | |||||
const megdnn::param::ExecutionPolicy& execution_policy, | |||||
bool allow_weight_preprocess) | |||||
: m_layouts{layouts}, | |||||
m_megdnn_opr{megdnn_opr}, | |||||
m_param{param_str}, | |||||
m_base_mgb_opr{mgb_opr}, | |||||
m_cn{cn}, | |||||
m_execution_policy{execution_policy}, | |||||
m_allow_weight_preprocess{allow_weight_preprocess} { | |||||
mgb_assert(m_layouts.size() == layouts.size()); | |||||
static_assert(std::tuple_size<FixedTensorLayouts>::value == 3 || | |||||
std::tuple_size<FixedTensorLayouts>::value == 5 || | |||||
std::tuple_size<FixedTensorLayouts>::value == 8, | |||||
"Convolution AlgoChooser assumes arity = 3 , 5 or 8 (for " | |||||
"deformable conv)"); | |||||
} | |||||
template <typename Opr> | template <typename Opr> | ||||
typename AlgoChooser<Opr>::ImplAlgo | typename AlgoChooser<Opr>::ImplAlgo | ||||
AlgoChooser<Opr>::ExeContext::get_profile_result_from_cache( | |||||
bool require_reproducible) const { | |||||
MIDOUT_B(Opr, | |||||
midout_iv(MGB_HASH_STR( | |||||
"AlgoChooser::ExeContext::get_profile_result_from_cache"))) | |||||
AlgoChooserProfileCache cache(m_cn, | |||||
profile_name(m_megdnn_opr).c_str()); | |||||
typename Opr::Param origin_param = m_megdnn_opr->param(); | |||||
AlgoChooserProfileCache::Key cache_key{m_layouts.data(), m_layouts.size(), | |||||
&origin_param, sizeof(origin_param)}; | |||||
auto&& rst = cache.get(cache_key); | |||||
if (!rst.valid()) | |||||
return {}; | |||||
auto&& prof = rst.val(); | |||||
std::unordered_map<std::string, ImplAlgo> algo_map; | |||||
for (auto i : get_all_candidates()) { | |||||
auto ins = algo_map.emplace(i.name.c_str(), i); | |||||
mgb_assert(ins.second, "duplicated algo name: %s", i.name.c_str()); | |||||
} | |||||
if (prof.empty()) | |||||
return {}; | |||||
for (auto&& i : prof) { | |||||
if ((!require_reproducible || i.reproducible)) { | |||||
auto iter = algo_map.find(i.algo); | |||||
mgb_assert(iter != algo_map.end(), | |||||
"algorithm %s exists in " | |||||
"profiling result but not in algo_map; please " | |||||
"report this " | |||||
"bug; opr: %s{%s}, layouts: %s ", | |||||
i.algo.c_str(), m_base_mgb_opr->cname(), | |||||
m_base_mgb_opr->dyn_typeinfo()->name, | |||||
format_fixlayouts<Opr>(m_layouts, arity_in, arity_out) | |||||
.c_str()); | |||||
return iter->second; | |||||
} | |||||
} | |||||
mgb_log_error( | |||||
"Workspace requirement (%zu) could not be satisfied. Abort now " | |||||
"to " | |||||
"avoid further problems", | |||||
WorkspaceLimitGetter::get_workspace_limit( | |||||
m_base_mgb_opr->owner_graph(), m_cn, | |||||
m_execution_policy.workspace_limit)); | |||||
mgb_trap(); | |||||
MIDOUT_E | |||||
} | |||||
template <typename Opr> | |||||
typename AlgoChooser<Opr>::ImplExecutionPolicy | |||||
AlgoChooser<Opr>::ExeContext::choose_by_heuristic(bool reproducible) const { | AlgoChooser<Opr>::ExeContext::choose_by_heuristic(bool reproducible) const { | ||||
auto opr = m_mgb_opr; | |||||
auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit( | auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit( | ||||
opr->owner_graph(), opr->comp_node(), | |||||
opr->execution_policy().workspace_limit); | |||||
return APPLY(m_megdnn_opr->get_algorithm_info_heuristic( | |||||
args..., workspace_limit, reproducible), | |||||
m_layouts); | |||||
owner_graph(), m_cn, m_execution_policy.workspace_limit); | |||||
ImplExecutionPolicy policy; | |||||
policy.algo = APPLY(m_megdnn_opr->get_algorithm_info_heuristic( | |||||
args..., workspace_limit, reproducible), | |||||
m_layouts).desc; | |||||
Algorithm* algo = m_megdnn_opr->get_algorithm_from_desc(policy.algo); | |||||
mgb_assert(algo, "Unknown algo description"); | |||||
std::vector<Algorithm::SearchItem>&& sub_items = algo->get_subopr_list( | |||||
to_layout_array<Opr>(m_layouts), m_megdnn_opr); | |||||
FOREACH_OPR_TYPE_DISPATCH(sub_items, { | |||||
auto&& megdnn_opr = intl::create_megdnn_opr<_Opr>(m_cn); | |||||
megdnn_opr->param() = | |||||
Algorithm::deserialize_read_pod<typename _Opr::Param>( | |||||
_item.param); | |||||
typename AlgoChooser<_Opr>::ExeContext sub_ctx( | |||||
to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(), | |||||
_item.param, m_base_mgb_opr, m_cn, m_execution_policy, | |||||
m_allow_weight_preprocess); | |||||
policy.sub_policy.push_back(sub_ctx.choose_by_heuristic(reproducible)); | |||||
}); | |||||
return policy; | |||||
} | } | ||||
template <typename Opr> | template <typename Opr> | ||||
@@ -279,40 +503,58 @@ AlgoChooser<Opr>::ExeContext::get_all_candidates() const { | |||||
APPLY(m_megdnn_opr->get_all_algorithms_info(args...), m_layouts); | APPLY(m_megdnn_opr->get_all_algorithms_info(args...), m_layouts); | ||||
bool found = false; | bool found = false; | ||||
for (size_t i = 0; i < ret.size(); ++i) { | for (size_t i = 0; i < ret.size(); ++i) { | ||||
if (ret[i] == heu) { | |||||
if (ret[i].desc == heu.algo) { | |||||
found = true; | found = true; | ||||
std::swap(ret[i], ret[0]); | std::swap(ret[i], ret[0]); | ||||
break; | break; | ||||
} | } | ||||
} | } | ||||
Algorithm* palgo = m_megdnn_opr->get_algorithm_from_desc(heu.algo); | |||||
mgb_assert(palgo, "Unknown algo description"); | |||||
mgb_assert(found, | mgb_assert(found, | ||||
"algo %s got by heuristic not found in " | "algo %s got by heuristic not found in " | ||||
"candidate list", | "candidate list", | ||||
heu.name.c_str()); | |||||
palgo->name()); | |||||
return std::move(ret); | return std::move(ret); | ||||
} | } | ||||
template <typename Opr> | template <typename Opr> | ||||
std::vector<typename AlgoChooser<Opr>::ImplAlgo> | |||||
AlgoChooser<Opr>::ExeContext::get_all_candidates_with_workspace_limit() const { | |||||
auto&& all_algos = get_all_candidates(); | |||||
auto opr = m_mgb_opr; | |||||
auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit( | |||||
opr->owner_graph(), opr->comp_node(), | |||||
opr->execution_policy().workspace_limit); | |||||
std::vector<ImplAlgo> ret; | |||||
for (auto&& algo : all_algos) { | |||||
if (get_workspace_size_bytes(algo) <= workspace_limit) { | |||||
ret.push_back(algo); | |||||
} | |||||
void AlgoChooser<Opr>::ExeContext::construct_execution_policy_from_cache( | |||||
bool require_reproducible, | |||||
typename AlgoChooser<Opr>::ImplExecutionPolicy& policy) const { | |||||
if (!policy.algo.valid()) { | |||||
policy.algo = get_profile_result_from_cache(require_reproducible).desc; | |||||
mgb_assert(policy.algo.valid(), | |||||
"No cache found, maybe some error occured"); | |||||
} | } | ||||
return ret; | |||||
Algorithm* algo = m_megdnn_opr->get_algorithm_from_desc(policy.algo); | |||||
mgb_assert(algo, "Unknown algo description"); | |||||
std::vector<Algorithm::SearchItem>&& sub_items = algo->get_subopr_list( | |||||
to_layout_array<Opr>(m_layouts), m_megdnn_opr); | |||||
FOREACH_OPR_TYPE_DISPATCH(sub_items, { | |||||
auto&& megdnn_opr = intl::create_megdnn_opr<_Opr>(m_cn); | |||||
megdnn_opr->param() = | |||||
Algorithm::deserialize_read_pod<typename _Opr::Param>( | |||||
_item.param); | |||||
typename AlgoChooser<_Opr>::ExeContext sub_ctx( | |||||
to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(), | |||||
_item.param, m_base_mgb_opr, m_cn, m_execution_policy, | |||||
m_allow_weight_preprocess); | |||||
policy.sub_policy.push_back({}); | |||||
sub_ctx.construct_execution_policy_from_cache(require_reproducible, | |||||
policy.sub_policy.back()); | |||||
}); | |||||
return; | |||||
} | } | ||||
template <typename Opr> | template <typename Opr> | ||||
size_t AlgoChooser<Opr>::ExeContext::get_workspace_size_bytes( | size_t AlgoChooser<Opr>::ExeContext::get_workspace_size_bytes( | ||||
ImplAlgo algo) const { | |||||
m_megdnn_opr->execution_policy() = {algo}; | |||||
const ImplExecutionPolicy& policy) const { | |||||
m_megdnn_opr->execution_policy() = policy; | |||||
size_t result; | size_t result; | ||||
if_constexpr<opr_supports_preprocess<Opr>()>( | if_constexpr<opr_supports_preprocess<Opr>()>( | ||||
[&](auto _) { | [&](auto _) { | ||||
@@ -336,17 +578,13 @@ size_t AlgoChooser<Opr>::ExeContext::get_workspace_size_bytes( | |||||
template <typename Opr> | template <typename Opr> | ||||
Maybe<AlgoChooserProfileCache::ResultEntry> | Maybe<AlgoChooserProfileCache::ResultEntry> | ||||
AlgoChooser<Opr>::ExeContext::profile_single_algo(ImplAlgo algo, | |||||
double& timeout) const { | |||||
AlgoChooser<Opr>::ExeContext::profile_single_algo( | |||||
const ImplExecutionPolicy& policy, double& timeout) const { | |||||
typename TimedProfiler<Opr>::Param param; | typename TimedProfiler<Opr>::Param param; | ||||
auto name = algo.name.c_str(); | |||||
// force check copy size <= dest len-1 from gcc8 for safe | // force check copy size <= dest len-1 from gcc8 for safe | ||||
auto len = sizeof(param.algo_name); | |||||
strncpy(param.algo_name, name, len - 1); | |||||
param.algo_name[len - 1] = '\0'; | |||||
mgb_assert(!param.algo_name[sizeof(param.algo_name) - 2], | |||||
"algo name too long: %s; len=%zu", name, strlen(name)); | |||||
param.workspace = get_workspace_size_bytes(algo); | |||||
param.execution_policy = | |||||
TimedProfiler<Opr>::Param::ExecutionPolicyBlob::serialize(policy); | |||||
param.workspace = get_workspace_size_bytes(policy); | |||||
for (int i = 0; i < arity; ++i) { | for (int i = 0; i < arity; ++i) { | ||||
auto&& src = m_layouts[i]; | auto&& src = m_layouts[i]; | ||||
mgb_assert(src.format.is_default() && | mgb_assert(src.format.is_default() && | ||||
@@ -357,23 +595,25 @@ AlgoChooser<Opr>::ExeContext::profile_single_algo(ImplAlgo algo, | |||||
src.to_string().c_str()); | src.to_string().c_str()); | ||||
param.dtypes[i] = src.dtype.enumv(); | param.dtypes[i] = src.dtype.enumv(); | ||||
} | } | ||||
param.comp_node_loc = m_mgb_opr->output(0)->comp_node().locator(); | |||||
param.comp_node_loc = m_cn.locator(); | |||||
mgb_assert(param.shapes.size() == m_layouts.size()); | mgb_assert(param.shapes.size() == m_layouts.size()); | ||||
for (size_t i = 0; i < param.shapes.size(); ++i) | for (size_t i = 0; i < param.shapes.size(); ++i) | ||||
param.shapes[i] = m_layouts[i]; | param.shapes[i] = m_layouts[i]; | ||||
param.opr_param = m_megdnn_opr->param(); | param.opr_param = m_megdnn_opr->param(); | ||||
param.allow_weight_preprocess = m_allow_weight_preprocess; | param.allow_weight_preprocess = m_allow_weight_preprocess; | ||||
Algorithm* palgo = m_megdnn_opr->get_algorithm_from_desc(policy.algo); | |||||
mgb_assert(palgo, "Unknown algo description"); | |||||
auto rst = TimedProfiler<Opr>::profile(param, timeout); | auto rst = TimedProfiler<Opr>::profile(param, timeout); | ||||
// MIOpen conv profiles all available algos when a specific shape is | // MIOpen conv profiles all available algos when a specific shape is | ||||
// provided for the first time, which probably adds to the result time. | // provided for the first time, which probably adds to the result time. | ||||
// Therefore, a second profile execution is needed. | // Therefore, a second profile execution is needed. | ||||
if (strncmp(name, "MIOpen", 6) == 0) | |||||
if (strncmp(palgo->name(), "MIOpen", 6) == 0) | |||||
rst = TimedProfiler<Opr>::profile(param, timeout); | rst = TimedProfiler<Opr>::profile(param, timeout); | ||||
if (!rst.valid()) | if (!rst.valid()) | ||||
return None; | return None; | ||||
return AlgoChooserProfileCache::ResultEntry{ | return AlgoChooserProfileCache::ResultEntry{ | ||||
algo.name.c_str(), algo.is_reproducible, rst.val().time, | |||||
palgo->name(), palgo->is_reproducible(), rst.val().time, | |||||
param.workspace}; | param.workspace}; | ||||
} | } | ||||
@@ -414,21 +654,34 @@ AlgoChooser<Opr>::ExeContext::construct_fake_preprocess_filter() const { | |||||
} | } | ||||
#define INST(Opr) \ | #define INST(Opr) \ | ||||
template typename AlgoChooser<megdnn::Opr>::ImplAlgo \ | |||||
template AlgoChooser<megdnn::Opr>::ExeContext::ExeContext( \ | |||||
const FixedTensorLayouts& layouts, megdnn::Opr* megdnn_opr, \ | |||||
const std::string& param_str, const cg::OperatorNodeBase* mgb_opr, \ | |||||
const CompNode& cn, \ | |||||
const megdnn::param::ExecutionPolicy& execution_policy, \ | |||||
bool allow_weight_preprocess); \ | |||||
template typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy \ | |||||
AlgoChooser<megdnn::Opr>::ExeContext::choose_by_heuristic( \ | AlgoChooser<megdnn::Opr>::ExeContext::choose_by_heuristic( \ | ||||
bool reproducible) const; \ | bool reproducible) const; \ | ||||
template typename AlgoChooser<megdnn::Opr>::ImplAlgo \ | |||||
AlgoChooser<megdnn::Opr>::ExeContext::get_profile_result_from_cache( \ | |||||
bool require_reproducible) const; \ | |||||
template std::vector<typename AlgoChooser<megdnn::Opr>::ImplAlgo> \ | template std::vector<typename AlgoChooser<megdnn::Opr>::ImplAlgo> \ | ||||
AlgoChooser<megdnn::Opr>::ExeContext::get_all_candidates() const; \ | AlgoChooser<megdnn::Opr>::ExeContext::get_all_candidates() const; \ | ||||
template std::vector<typename AlgoChooser<megdnn::Opr>::ImplAlgo> \ | |||||
AlgoChooser<megdnn::Opr>::ExeContext:: \ | |||||
get_all_candidates_with_workspace_limit() const; \ | |||||
template size_t \ | template size_t \ | ||||
AlgoChooser<megdnn::Opr>::ExeContext::get_workspace_size_bytes( \ | AlgoChooser<megdnn::Opr>::ExeContext::get_workspace_size_bytes( \ | ||||
typename AlgoChooser<megdnn::Opr>::ImplAlgo algo) const; \ | |||||
const typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy& \ | |||||
policy) const; \ | |||||
template void AlgoChooser<megdnn::Opr>::ExeContext:: \ | |||||
construct_execution_policy_from_cache( \ | |||||
bool require_reproducible, \ | |||||
typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy& \ | |||||
policy) const; \ | |||||
template Maybe<AlgoChooserProfileCache::ResultEntry> \ | template Maybe<AlgoChooserProfileCache::ResultEntry> \ | ||||
AlgoChooser<megdnn::Opr>::ExeContext::profile_single_algo( \ | AlgoChooser<megdnn::Opr>::ExeContext::profile_single_algo( \ | ||||
typename AlgoChooser<megdnn::Opr>::ImplAlgo algo, double& timeout) \ | |||||
const; \ | |||||
const typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy& \ | |||||
policy, \ | |||||
double& timeout) const; | |||||
MGB_FOREACH_FASTRUN_OPR(INST) | MGB_FOREACH_FASTRUN_OPR(INST) | ||||
@@ -14,6 +14,8 @@ | |||||
#include "../internal/invoke.h" | #include "../internal/invoke.h" | ||||
#include "../internal/megdnn_opr_wrapper.inl" | #include "../internal/megdnn_opr_wrapper.inl" | ||||
#include "megdnn/handle.h" | |||||
#include "megdnn/oprs/base.h" | |||||
#if MGB_ROCM | #if MGB_ROCM | ||||
#include "hcc_detail/hcc_defs_prologue.h" | #include "hcc_detail/hcc_defs_prologue.h" | ||||
@@ -32,12 +34,96 @@ MIDOUT_DECL(megbrain_opr_profile) | |||||
} \ | } \ | ||||
MIDOUT_END(); | MIDOUT_END(); | ||||
namespace { | |||||
/*!
 * \brief flatten an execution policy tree into a byte string
 *
 * Layout, in order: algo handle type, algo type, uint32 length of the algo
 * param followed by the param bytes, uint32 number of sub policies, then each
 * sub policy serialized recursively with the same layout.
 */
std::string serialize_policy(const megdnn::ExecutionPolicy& policy) {
    using megdnn::Algorithm;
    const auto& desc = policy.algo;
    std::string blob;

    //! algorithm description: two PODs and a length-prefixed param string
    Algorithm::serialize_write_pod(desc.handle_type, blob);
    Algorithm::serialize_write_pod(desc.type, blob);
    uint32_t nr_param_bytes = desc.param.size();
    Algorithm::serialize_write_pod<uint32_t>(nr_param_bytes, blob);
    blob += desc.param;

    //! children: count first, then every child in declaration order
    uint32_t nr_sub_policy = policy.sub_policy.size();
    Algorithm::serialize_write_pod(nr_sub_policy, blob);
    for (const auto& child : policy.sub_policy) {
        blob += serialize_policy(child);
    }
    return blob;
}
/*!
 * \brief rebuild an execution policy tree from the bytes produced by
 *      serialize_policy()
 *
 * \param buf start of the serialized buffer
 * \param size number of valid bytes in \p buf
 * \param[in,out] offset read cursor into \p buf; advanced past everything
 *      consumed by this call (including all sub policies)
 * \return the policy stored at \p offset
 *
 * Every read is checked against \p size, so a truncated or corrupted buffer
 * triggers an assertion instead of reading out of bounds.
 */
megdnn::ExecutionPolicy deserialize_policy(const char* buf, uint32_t size,
                                           uint32_t& offset) {
    megdnn::ExecutionPolicy ret;
    //! read one POD at the cursor and advance; the subtraction form of the
    //! bound check can not overflow
#define cb(_val, _type)                                                  \
    mgb_assert(offset <= size && size - offset >= sizeof(_val),          \
               "truncated execution policy: offset=%u size=%u", offset,  \
               size);                                                     \
    _val = megdnn::Algorithm::deserialize_read_pod<_type>(buf, offset);  \
    offset += sizeof(_val)

    cb(ret.algo.handle_type, megdnn::Handle::HandleType);
    cb(ret.algo.type, uint32_t);

    uint32_t param_size = 0;
    cb(param_size, uint32_t);
    if (param_size > 0) {
        //! offset <= size is guaranteed by the preceding cb()
        mgb_assert(param_size <= size - offset,
                   "truncated execution policy param: need %u bytes at "
                   "offset=%u, size=%u",
                   param_size, offset, size);
        ret.algo.param = std::string(buf + offset, param_size);
        offset += param_size;
    }

    uint32_t nr_policy = 0;
    cb(nr_policy, uint32_t);
#undef cb

    //! sub policies are stored back to back right after the count
    for (uint32_t i = 0; i < nr_policy; i++) {
        ret.sub_policy.push_back(deserialize_policy(buf, size, offset));
    }
    return ret;
}
} | |||||
namespace mgb { | namespace mgb { | ||||
namespace opr { | namespace opr { | ||||
#define APPLY(statement, ...) \ | #define APPLY(statement, ...) \ | ||||
mgb::apply([&](const auto&... args) { return statement; }, \ | mgb::apply([&](const auto&... args) { return statement; }, \ | ||||
std::tuple_cat(__VA_ARGS__)) | std::tuple_cat(__VA_ARGS__)) | ||||
////////////// TimedProfiler::Param::ExecutionPolicyBlob ////////////////////// | |||||
template <typename Opr> | |||||
typename TimedProfiler<Opr>::Param::ExecutionPolicyBlob | |||||
TimedProfiler<Opr>::Param::ExecutionPolicyBlob::serialize( | |||||
const megdnn::ExecutionPolicy& policy) { | |||||
ExecutionPolicyBlob ret; | |||||
std::string serialize_bin = serialize_policy(policy); | |||||
mgb_assert(serialize_bin.size() < MAX_SIZE_IN_BYTES); | |||||
memcpy(ret.data, serialize_bin.data(), serialize_bin.size()); | |||||
ret.size = serialize_bin.size(); | |||||
return ret; | |||||
} | |||||
template <typename Opr> | |||||
megdnn::ExecutionPolicy | |||||
TimedProfiler<Opr>::Param::ExecutionPolicyBlob::deserialize() const { | |||||
uint32_t offset = 0; | |||||
auto&& ret = deserialize_policy(data, size, offset); | |||||
mgb_assert(offset == size); | |||||
return std::move(ret); | |||||
} | |||||
//! Explicitly instantiate ExecutionPolicyBlob::serialize() and
//! ExecutionPolicyBlob::deserialize() for every operator listed by
//! MGB_FOREACH_FASTRUN_OPR; the helper macro is undefined right afterwards.
#define INST(Opr) \ | |||||
template typename TimedProfiler<megdnn::Opr>::Param::ExecutionPolicyBlob \ | |||||
TimedProfiler<megdnn::Opr>::Param::ExecutionPolicyBlob::serialize( \ | |||||
const megdnn::ExecutionPolicy& policy); \ | |||||
template megdnn::ExecutionPolicy \ | |||||
TimedProfiler<megdnn::Opr>::Param::ExecutionPolicyBlob::deserialize() \ | |||||
const; | |||||
MGB_FOREACH_FASTRUN_OPR(INST) | |||||
#undef INST | |||||
////////////////// TimedProfiler ////////////////////////////// | |||||
template <typename Opr> | template <typename Opr> | ||||
const double TimedProfiler<Opr>::timeout_setting = | const double TimedProfiler<Opr>::timeout_setting = | ||||
TimedProfiler<Opr>::init_timeout_setting(); | TimedProfiler<Opr>::init_timeout_setting(); | ||||
@@ -99,18 +185,7 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl( | |||||
} | } | ||||
megdnn_opr->param() = param.opr_param; | megdnn_opr->param() = param.opr_param; | ||||
{ | |||||
typename Opr::AlgorithmInfo algo; | |||||
for (auto i : | |||||
APPLY(megdnn_opr->get_all_algorithms_info(args...), layouts)) { | |||||
if (!strcmp(i.name.c_str(), param.algo_name)) { | |||||
algo = i; | |||||
break; | |||||
} | |||||
} | |||||
mgb_assert(algo.valid(), "algorithm %s not found", param.algo_name); | |||||
megdnn_opr->execution_policy() = {algo}; | |||||
} | |||||
megdnn_opr->execution_policy() = param.execution_policy.deserialize(); | |||||
// Allocate preprocessed weight buffers. | // Allocate preprocessed weight buffers. | ||||
TensorLayoutArray preprocessed_layout; | TensorLayoutArray preprocessed_layout; | ||||
@@ -222,13 +297,16 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl( | |||||
}); | }); | ||||
ev_end->record(); | ev_end->record(); | ||||
megdnn::Algorithm* algo = megdnn_opr->get_algorithm_from_desc( | |||||
megdnn_opr->execution_policy().algo); | |||||
mgb_assert(algo); | |||||
double next_report_time = 0.5; | double next_report_time = 0.5; | ||||
while (!ev_end->finished()) { | while (!ev_end->finished()) { | ||||
if (timer.get_secs() >= next_report_time) { | if (timer.get_secs() >= next_report_time) { | ||||
mgb_log_warn( | mgb_log_warn( | ||||
"profiling conv algo %s already took %.3f/%.3f secs" | "profiling conv algo %s already took %.3f/%.3f secs" | ||||
" (limit can be set by MGB_CONV_PROFILING_TIMEOUT) ", | " (limit can be set by MGB_CONV_PROFILING_TIMEOUT) ", | ||||
param.algo_name, timer.get_secs(), param.actual_timeout); | |||||
algo->name(), timer.get_secs(), param.actual_timeout); | |||||
next_report_time = timer.get_secs() + 1; | next_report_time = timer.get_secs() + 1; | ||||
} | } | ||||
using namespace std::literals; | using namespace std::literals; | ||||
@@ -46,7 +46,7 @@ private: | |||||
static bool check_layout(const TensorLayout& layout, int transpose); | static bool check_layout(const TensorLayout& layout, int transpose); | ||||
//! store the policy of all transpose situations | //! store the policy of all transpose situations | ||||
megdnn::MatrixMul::ExecutionPolicy m_cadidate_execution_policies[4]; | |||||
megdnn::ExecutionPolicy m_cadidate_execution_policies[4]; | |||||
}; | }; | ||||
/*! | /*! | ||||
@@ -76,7 +76,7 @@ private: | |||||
static bool check_layout(const TensorLayout& layout, bool transpose); | static bool check_layout(const TensorLayout& layout, bool transpose); | ||||
//! store the policy of all transpose situations | //! store the policy of all transpose situations | ||||
megdnn::BatchedMatrixMul::ExecutionPolicy m_cadidate_execution_policies[4]; | |||||
megdnn::ExecutionPolicy m_cadidate_execution_policies[4]; | |||||
}; | }; | ||||
/*! | /*! | ||||
@@ -12,9 +12,14 @@ | |||||
#pragma once | #pragma once | ||||
#include <memory> | |||||
#include "megbrain/graph/cg.h" | |||||
#include "megbrain/graph/operator_node.h" | |||||
#include "megbrain/opr/search_policy/algo_chooser_helper.h" | |||||
#include "megbrain/opr/search_policy/profiler.h" | #include "megbrain/opr/search_policy/profiler.h" | ||||
#include "megbrain/opr/dnn/convolution.h" | #include "megbrain/opr/dnn/convolution.h" | ||||
#include "megbrain/opr/blas.h" | #include "megbrain/opr/blas.h" | ||||
#include "megdnn/oprs/base.h" | |||||
template <class MegDNNOpr> | template <class MegDNNOpr> | ||||
struct MegDNNOpr2MGBOpr; | struct MegDNNOpr2MGBOpr; | ||||
@@ -49,52 +54,64 @@ class AlgoChooser { | |||||
static constexpr int arity = OprArityTrait<Opr>::arity; | static constexpr int arity = OprArityTrait<Opr>::arity; | ||||
using ImplAlgo = typename Opr::AlgorithmInfo; | using ImplAlgo = typename Opr::AlgorithmInfo; | ||||
using ImplExecutionPolicy = megdnn::ExecutionPolicy; | |||||
using MGBOpr = typename MegDNNOpr2MGBOpr<Opr>::MGBOpr; | using MGBOpr = typename MegDNNOpr2MGBOpr<Opr>::MGBOpr; | ||||
using TensorLayoutArray = std::array<TensorLayout, arity>; | |||||
public: | |||||
using FixedTensorLayouts = std::array<TensorLayout, arity>; | |||||
class ExeContext { | class ExeContext { | ||||
const TensorLayoutArray& m_layouts; | |||||
FixedTensorLayouts m_layouts; | |||||
Opr* m_megdnn_opr; | Opr* m_megdnn_opr; | ||||
const MGBOpr* m_mgb_opr; | |||||
std::string m_param; | |||||
const cg::OperatorNodeBase* m_base_mgb_opr; | |||||
CompNode m_cn; | |||||
megdnn::param::ExecutionPolicy m_execution_policy; | |||||
bool m_allow_weight_preprocess; | bool m_allow_weight_preprocess; | ||||
public: | public: | ||||
ExeContext(const TensorLayoutArray& layouts, Opr* megdnn_opr, | |||||
const MGBOpr* mgb_opr, bool allow_weight_preprocess) | |||||
: m_layouts{layouts}, | |||||
m_megdnn_opr{megdnn_opr}, | |||||
m_mgb_opr{mgb_opr}, | |||||
m_allow_weight_preprocess{allow_weight_preprocess} { | |||||
mgb_assert(m_layouts.size() == layouts.size()); | |||||
static_assert( | |||||
std::tuple_size<TensorLayoutArray>::value == 3 || | |||||
std::tuple_size<TensorLayoutArray>::value == 5 || | |||||
std::tuple_size<TensorLayoutArray>::value == 8, | |||||
"Convolution AlgoChooser assumes arity = 3 , 5 or 8 (for " | |||||
"deformable conv)"); | |||||
} | |||||
ExeContext(const FixedTensorLayouts& layouts, Opr* megdnn_opr, | |||||
const std::string& param_str, | |||||
const cg::OperatorNodeBase* mgb_opr, const CompNode& cn, | |||||
const megdnn::param::ExecutionPolicy& execution_policy, | |||||
bool allow_weight_preprocess); | |||||
Opr* megdnn_opr() const { return m_megdnn_opr; } | Opr* megdnn_opr() const { return m_megdnn_opr; } | ||||
const MGBOpr* mgb_opr() const { return m_mgb_opr; } | |||||
const TensorLayout& inp_layout(size_t idx) const { | const TensorLayout& inp_layout(size_t idx) const { | ||||
return m_layouts[idx]; | return m_layouts[idx]; | ||||
} | } | ||||
const TensorLayoutArray& layouts() const { return m_layouts; } | |||||
cg::ComputingGraph* owner_graph() const { | |||||
return m_base_mgb_opr->owner_graph(); | |||||
} | |||||
const cg::OperatorNodeBase* mgb_opr() const { return m_base_mgb_opr; } | |||||
const megdnn::param::ExecutionPolicy& execution_policy() const { | |||||
return m_execution_policy; | |||||
} | |||||
CompNode comp_node() const { return m_cn; } | |||||
const std::string& param() const { return m_param; } | |||||
bool allow_weight_preprocess() const { | |||||
return m_allow_weight_preprocess; | |||||
} | |||||
megdnn::Algorithm* get_algorithm_from_desc( | |||||
const megdnn::Algorithm::Info::Desc& desc) const { | |||||
return m_megdnn_opr->get_algorithm_from_desc(desc); | |||||
} | |||||
const FixedTensorLayouts& layouts() const { return m_layouts; } | |||||
ImplAlgo choose_by_heuristic(bool reproducible = false) const; | |||||
ImplExecutionPolicy choose_by_heuristic( | |||||
bool reproducible = false) const; | |||||
//! get all candidate algos, and the one choose_by_heuristic() is | //! get all candidate algos, and the one choose_by_heuristic() is | ||||
//! put first | //! put first | ||||
std::vector<ImplAlgo> get_all_candidates() const; | std::vector<ImplAlgo> get_all_candidates() const; | ||||
//! get candidate algos with workspace limit. | |||||
std::vector<ImplAlgo> get_all_candidates_with_workspace_limit() const; | |||||
//! get workspace size required for specific algo | |||||
size_t get_workspace_size_bytes(ImplAlgo algo) const; | |||||
//! get workspace size required for specific execution policy | |||||
size_t get_workspace_size_bytes( | |||||
const ImplExecutionPolicy& policy) const; | |||||
/*! | /*! | ||||
* \brief profile a single algorithm | * \brief profile a single algorithm | ||||
@@ -106,28 +123,59 @@ class AlgoChooser { | |||||
* timeout used during profiling | * timeout used during profiling | ||||
*/ | */ | ||||
Maybe<AlgoChooserProfileCache::ResultEntry> profile_single_algo( | Maybe<AlgoChooserProfileCache::ResultEntry> profile_single_algo( | ||||
ImplAlgo algo, double& timeout) const; | |||||
const ImplExecutionPolicy& policy, double& timeout) const; | |||||
//! get all profile algorithm from cache, return invalid if not exists | |||||
ImplAlgo get_profile_result_from_cache(bool require_reproducible) const; | |||||
/** | |||||
* \brief construct execution policy from cache. | |||||
* | |||||
* \param require_reproducible select algo which is reproducible | |||||
* \param policy execution policy | |||||
*/ | |||||
void construct_execution_policy_from_cache( | |||||
bool require_reproducible, ImplExecutionPolicy& policy) const; | |||||
private: | private: | ||||
Maybe<PreprocessFilter<Opr>> construct_fake_preprocess_filter() const; | Maybe<PreprocessFilter<Opr>> construct_fake_preprocess_filter() const; | ||||
}; | }; | ||||
//! entrance for getting algorithm according to execution strategy | |||||
static ImplAlgo get_algo(ExeContext& ctx); | |||||
//! get all profile result, either by retrieving cache or profiling | |||||
static AlgoChooserProfileCache::Result get_profile_result( | |||||
ExeContext& ctx, bool enable_update); | |||||
template<typename U> | |||||
friend class AlgoChooser; | |||||
static ImplAlgo choose_by_profile(ExeContext& ctx, | |||||
bool require_reproducible, | |||||
bool enable_update = true); | |||||
private: | |||||
//! entrance for getting algorithm according to execution strategy | |||||
static ImplExecutionPolicy get_policy(ExeContext& ctx); | |||||
//! profile and save to cache | |||||
static void profile(ExeContext& ctx, bool require_reproducible); | |||||
static ImplExecutionPolicy choose_by_profile(ExeContext& ctx, | |||||
bool require_reproducible, | |||||
bool enable_update = true); | |||||
/** | |||||
* flatten search space in postorder traversal | |||||
* The subopr search construct a search tree | |||||
* | |||||
* A | |||||
* / \ | |||||
* B1B2 C | |||||
* / \ | |||||
* D1D2D3 E | |||||
* We use postorder traverse the search tree. | |||||
* D1 -> D2 -> D3 -> E -> B1 -> B2 -> C -> A | |||||
*/ | |||||
static std::vector<megdnn::Algorithm::SearchItem> flatten_search_space( | |||||
const ExeContext& ctx); | |||||
public: | public: | ||||
/*! | /*! | ||||
* \brief setup algorithm and return workspace size | * \brief setup algorithm and return workspace size | ||||
*/ | */ | ||||
static size_t setup_algo(const TensorLayoutArray& layouts, Opr* megdnn_opr, | |||||
static size_t setup_algo(const FixedTensorLayouts& layouts, Opr* megdnn_opr, | |||||
const MGBOpr* mgb_opr, | const MGBOpr* mgb_opr, | ||||
bool allow_weight_preprocess = false); | bool allow_weight_preprocess = false); | ||||
}; | }; | ||||
@@ -28,9 +28,9 @@ namespace mixin { | |||||
class AlgoChooserHelper : cg::OperatorNodeMixinBase { | class AlgoChooserHelper : cg::OperatorNodeMixinBase { | ||||
public: | public: | ||||
using ExecutionPolicy = megdnn::param::ExecutionPolicy; | using ExecutionPolicy = megdnn::param::ExecutionPolicy; | ||||
using AlgorithmInfo = megdnn::detail::Algorithm::Info; | |||||
using AlgorithmPolicy = megdnn::ExecutionPolicy; | |||||
using AlgoChooserHook = | using AlgoChooserHook = | ||||
std::function<AlgorithmInfo(const cg::OperatorNodeBase*)>; | |||||
std::function<AlgorithmPolicy(const cg::OperatorNodeBase*)>; | |||||
const ExecutionPolicy& execution_policy() const { | const ExecutionPolicy& execution_policy() const { | ||||
if (!m_policy_accessed) { | if (!m_policy_accessed) { | ||||
@@ -18,6 +18,7 @@ | |||||
#include "megbrain/comp_node.h" | #include "megbrain/comp_node.h" | ||||
#include "megdnn/basic_types.h" | #include "megdnn/basic_types.h" | ||||
#include "megdnn/oprs/base.h" | |||||
#include "megdnn/oprs/linalg.h" | #include "megdnn/oprs/linalg.h" | ||||
#include "megdnn/oprs/nn.h" | #include "megdnn/oprs/nn.h" | ||||
@@ -139,7 +140,17 @@ class TimedProfiler { | |||||
public: | public: | ||||
struct Param { | struct Param { | ||||
char algo_name[128]; | |||||
//! Fixed-capacity byte buffer that carries a serialized
//! megdnn::ExecutionPolicy.
struct ExecutionPolicyBlob { | |||||
//! enlarge the max size if needed | |||||
constexpr static size_t MAX_SIZE_IN_BYTES = 10240; | |||||
//! serialized policy bytes; serialize() fills the first `size` bytes
char data[MAX_SIZE_IN_BYTES]; | |||||
//! number of bytes of `data` written by serialize()
uint32_t size; | |||||
//! build a blob from \p policy
static ExecutionPolicyBlob serialize( | |||||
const megdnn::ExecutionPolicy& policy); | |||||
//! decode the policy stored in this blob
megdnn::ExecutionPolicy deserialize() const; | |||||
}; | |||||
ExecutionPolicyBlob execution_policy; | |||||
size_t workspace; | size_t workspace; | ||||
megdnn::DTypeEnum dtypes[arity]; | megdnn::DTypeEnum dtypes[arity]; | ||||
CompNode::Locator comp_node_loc; | CompNode::Locator comp_node_loc; | ||||
@@ -20,11 +20,13 @@ | |||||
#include "megbrain/opr/basic_arith.h" | #include "megbrain/opr/basic_arith.h" | ||||
#include "megbrain/gopt/inference.h" | #include "megbrain/gopt/inference.h" | ||||
#include "megbrain/opr/tensor_manip.h" | #include "megbrain/opr/tensor_manip.h" | ||||
#include "megdnn/dtype.h" | |||||
#include "megdnn/oprs/base.h" | #include "megdnn/oprs/base.h" | ||||
#include <gmock/gmock.h> | #include <gmock/gmock.h> | ||||
#include <cmath> | #include <cmath> | ||||
#include <memory> | |||||
#include <random> | #include <random> | ||||
using namespace mgb; | using namespace mgb; | ||||
@@ -37,6 +39,73 @@ using Mode = Param::Mode; | |||||
Mode modes_to_check[] = {Mode::CONVOLUTION, Mode::CROSS_CORRELATION}; | Mode modes_to_check[] = {Mode::CONVOLUTION, Mode::CROSS_CORRELATION}; | ||||
void conv_bwd_data_brute(const std::vector<std::shared_ptr<HostTensorND>>& inps, | |||||
std::shared_ptr<HostTensorND>& dest, | |||||
const opr::ConvolutionBackwardData::Param& param) { | |||||
mgb_assert(param.format == Param::Format::NCHW); | |||||
auto &&data = *inps[0], &&filter = *inps[1]; | |||||
size_t N = data.shape(0), IH = data.shape(2), IW = data.shape(3); | |||||
size_t GROUP, ICPG, OCPG, FH, FW; | |||||
if (param.sparse == Param::Sparse::DENSE) { | |||||
GROUP = 1, ICPG = filter.shape(0), OCPG = filter.shape(1), | |||||
FH = filter.shape(2), FW = filter.shape(3); | |||||
} else { | |||||
mgb_assert(param.sparse == Param::Sparse::GROUP); | |||||
GROUP = filter.shape(0), ICPG = filter.shape(1), OCPG = filter.shape(2), | |||||
FH = filter.shape(3), FW = filter.shape(4); | |||||
} | |||||
auto get_shp = [](size_t inp, size_t filter, size_t stride, size_t pad, | |||||
size_t dilate) { | |||||
return (inp - 1) * stride + (filter - 1) * dilate + 1 - pad * 2; | |||||
}; | |||||
size_t OH = get_shp(IH, FH, param.stride_h, param.pad_h, param.dilate_h), | |||||
OW = get_shp(IW, FW, param.stride_w, param.pad_w, param.dilate_w); | |||||
dest = std::make_shared<HostTensorND>(CompNode::load("xpu0"), | |||||
TensorShape{N, OCPG * GROUP, OH, OW}); | |||||
auto&& out = *dest; | |||||
auto fptr = filter.ptr<float>(), dptr = data.ptr<float>(), | |||||
optr = out.ptr<float>(); | |||||
memset(optr, 0, sizeof(float) * out.shape().total_nr_elems()); | |||||
auto ol = out.layout(), fl = filter.layout(); | |||||
#define FOR2(a, A, b, B) \ | |||||
for (size_t a = 0; a < A; ++a) \ | |||||
for (size_t b = 0; b < B; ++b) | |||||
#define FOR3(a, A, b, B, c, C) \ | |||||
FOR2(a, A, b, B) \ | |||||
for (size_t c = 0; c < C; ++c) | |||||
FOR3(n, N, group, GROUP, icg, ICPG) | |||||
FOR2(ih, IH, iw, IW) { | |||||
float scale = *(dptr++); | |||||
FOR3(ocg, OCPG, fh, FH, fw, FW) { | |||||
auto oc_tot = group * OCPG + ocg; | |||||
int oh = int(ih * param.stride_h + fh * param.dilate_h) - | |||||
int(param.pad_h), | |||||
ow = int(iw * param.stride_w + fw * param.dilate_w) - | |||||
int(param.pad_w); | |||||
if (oh >= 0 && ow >= 0 && oh < static_cast<int>(OH) && | |||||
ow < static_cast<int>(OW)) { | |||||
auto out_off = n * ol.stride[0] + oc_tot * ol.stride[1] + | |||||
oh * ol.stride[2] + ow; | |||||
size_t flt_off = 0; | |||||
if (param.sparse == Param::Convolution::Sparse::DENSE) { | |||||
flt_off = icg * fl.stride[0] + | |||||
ocg * fl.stride[1] + fh * fl.stride[2] + fw; | |||||
} else { | |||||
flt_off = group * fl.stride[0] + icg * fl.stride[1] + | |||||
ocg * fl.stride[2] + fh * fl.stride[3] + fw; | |||||
} | |||||
optr[out_off] += scale * fptr[flt_off]; | |||||
} | |||||
} | |||||
} | |||||
#undef FOR3 | |||||
#undef FOR2 | |||||
} | |||||
void conv_bwd_flt_brute(const std::vector<std::shared_ptr<HostTensorND>>& inps, | void conv_bwd_flt_brute(const std::vector<std::shared_ptr<HostTensorND>>& inps, | ||||
std::shared_ptr<HostTensorND>& out, | std::shared_ptr<HostTensorND>& out, | ||||
const opr::ConvolutionBackwardFilter::Param& param) { | const opr::ConvolutionBackwardFilter::Param& param) { | ||||
@@ -370,7 +439,8 @@ TEST(TestOprDNN, ConvolutionExePolicy) { | |||||
PersistentCacheHook cache_hook{on_get}; | PersistentCacheHook cache_hook{on_get}; | ||||
#if MGB_ENABLE_FASTRUN | #if MGB_ENABLE_FASTRUN | ||||
for (auto strategy: {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE, S::PROFILE_HEURISTIC}) { | |||||
for (auto strategy : {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE, | |||||
S::PROFILE_HEURISTIC}) { | |||||
#else | #else | ||||
for (auto strategy: {S:HEURISTIC, S::PROFILE_HEURISTIC}) { | for (auto strategy: {S:HEURISTIC, S::PROFILE_HEURISTIC}) { | ||||
#endif | #endif | ||||
@@ -406,6 +476,95 @@ TEST(TestOprDNN, ConvolutionExePolicy) { | |||||
} | } | ||||
} | } | ||||
/*!
 * Run bfloat16 deconvolution under every execution strategy and compare it
 * against the float32 brute-force reference; also check via the persistent
 * cache hook that the cache is queried for every strategy except HEURISTIC.
 */
TEST(TestOprDNN, ConvolutionBackwardDataBfloat16ExePolicy) {
    REQUIRE_GPU(1);
    Param param{Mode::CROSS_CORRELATION, 1, 1, 1, 1};
    param.compute_mode = Param::ComputeMode::FLOAT32;
    using Policy = opr::Convolution::ExecutionPolicy;
    using S = Policy::Strategy;

    //! fill a bfloat16 tensor with uniform random values drawn from [-1, 1)
    auto gen_bfp16 = [](HostTensorND& dest) {
        RNGxorshf rng{next_rand_seed()};
        auto rand_real = [&rng]() {
            std::uniform_real_distribution<float> dist(-1, 1);
            return dist(rng);
        };
        auto ptr = dest.ptr<dt_bfloat16>();
        size_t elems = dest.shape().total_nr_elems();
        for (size_t i = 0; i < elems; i++) {
            ptr[i] = dt_bfloat16(rand_real());
        }
    };

    //! element-wise conversion helpers between float32 and bfloat16 tensors
    auto f32_to_bf16 = [](const std::shared_ptr<HostTensorND>& src)
            -> std::shared_ptr<HostTensorND> {
        auto ret = std::make_shared<HostTensorND>(
                src->comp_node(), src->shape(), dtype::BFloat16{});
        for (size_t i = 0; i < src->layout().total_nr_elems(); i++) {
            ret->ptr<dt_bfloat16>()[i] = src->ptr<dt_float32>()[i];
        }
        return ret;
    };

    auto bf16_to_f32 = [](const std::shared_ptr<HostTensorND>& src)
            -> std::shared_ptr<HostTensorND> {
        auto ret = std::make_shared<HostTensorND>(
                src->comp_node(), src->shape(), dtype::Float32{});
        for (size_t i = 0; i < src->layout().total_nr_elems(); i++) {
            ret->ptr<dt_float32>()[i] = src->ptr<dt_bfloat16>()[i];
        }
        return ret;
    };

    //! counts persistent cache lookups
    int nr_get = 0;
    auto on_get = [&nr_get](const std::string&, const void*, size_t,
                            const void*, size_t) { ++nr_get; };
    PersistentCacheHook cache_hook{on_get};

#if MGB_ENABLE_FASTRUN
    for (auto strategy : {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE,
                          S::PROFILE_HEURISTIC}) {
#else
    //! was `S:HEURISTIC` (single colon), which does not compile
    for (auto strategy : {S::HEURISTIC, S::PROFILE_HEURISTIC}) {
#endif
        using Checker = AutoOprChecker<2, 1>;

        auto make_graph = [&](const Checker::SymInpArray& inputs)
                -> Checker::SymOutArray {
            Policy policy;
            policy.strategy = strategy;
            return {opr::ConvolutionBackwardData::make_deconv(
                    inputs[0], inputs[1], param, policy)};
        };

        //! reference: compute in float32, then round back to bfloat16
        auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
            std::shared_ptr<HostTensorND> out;
            conv_bwd_data_brute(
                    {bf16_to_f32(inp[0]), bf16_to_f32(inp[1])}, out,
                    param);
            dest[0] = *f32_to_bf16(out);
        };

        Checker::RunOptions opt;
        opt.outputs_max_err = 1e-3;
        nr_get = 0;
        Checker(make_graph, fwd)
                .disable_grad_check()
                .set_input_dtype(0, dtype::BFloat16{})
                .set_input_dtype(1, dtype::BFloat16{})
                .set_input_generator(0, gen_bfp16)
                .set_input_generator(1, gen_bfp16)
                .run({TensorShape{3, 4, 10, 6}, {4, 2, 3, 3}}, opt)
                .run({TensorShape{2, 2, 4, 3}, {2, 2, 3, 3}}, opt)
                .run({TensorShape{1, 3, 10, 6}, {3, 2, 3, 3}}, opt);
        if (strategy == S::HEURISTIC) {
            ASSERT_EQ(0, nr_get);
        } else {
            ASSERT_LT(0, nr_get);
        }
    }
}
TEST(TestOprDNN, Deconvolution) { | TEST(TestOprDNN, Deconvolution) { | ||||
// dilated grouped deconv | // dilated grouped deconv | ||||
using Checker = AutoOprChecker<2, 1>; | using Checker = AutoOprChecker<2, 1>; | ||||
@@ -420,55 +579,9 @@ TEST(TestOprDNN, Deconvolution) { | |||||
}; | }; | ||||
auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) { | auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) { | ||||
auto &&data = *inp[0], &&filter = *inp[1]; | |||||
size_t N = data.shape(0), IH = data.shape(2), IW = data.shape(3); | |||||
size_t GROUP = filter.shape(0), ICPG = filter.shape(1), | |||||
OCPG = filter.shape(2), FH = filter.shape(3), | |||||
FW = filter.shape(4); | |||||
auto get_shp = [](size_t inp, size_t filter, size_t stride, size_t pad, | |||||
size_t dilate) { | |||||
return (inp - 1) * stride + (filter - 1) * dilate + 1 - pad * 2; | |||||
}; | |||||
auto &&out = dest[0]; | |||||
size_t OH = get_shp(IH, FH, param.stride_h, param.pad_h, | |||||
param.dilate_h), | |||||
OW = get_shp(IW, FW, param.stride_w, param.pad_w, | |||||
param.dilate_w); | |||||
out.resize({N, OCPG * GROUP, OH, OW}); | |||||
auto fptr = filter.ptr<float>(), dptr = data.ptr<float>(), | |||||
optr = out.ptr<float>(); | |||||
memset(optr, 0, sizeof(float) * out.shape().total_nr_elems()); | |||||
auto ol = out.layout(), fl = filter.layout(); | |||||
#define FOR2(a, A, b, B) \ | |||||
for (size_t a = 0; a < A; ++a) \ | |||||
for (size_t b = 0; b < B; ++b) | |||||
#define FOR3(a, A, b, B, c, C) \ | |||||
FOR2(a, A, b, B) \ | |||||
for (size_t c = 0; c < C; ++c) | |||||
FOR3(n, N, group, GROUP, icg, ICPG) | |||||
FOR2(ih, IH, iw, IW) { | |||||
float scale = *(dptr++); | |||||
FOR3(ocg, OCPG, fh, FH, fw, FW) { | |||||
auto oc_tot = group * OCPG + ocg; | |||||
int oh = int(ih * param.stride_h + fh * param.dilate_h) - | |||||
int(param.pad_h), | |||||
ow = int(iw * param.stride_w + fw * param.dilate_w) - | |||||
int(param.pad_w); | |||||
if (oh >= 0 && ow >= 0 && oh < static_cast<int>(OH) && | |||||
ow < static_cast<int>(OW)) { | |||||
auto out_off = n * ol.stride[0] + oc_tot * ol.stride[1] + | |||||
oh * ol.stride[2] + ow, | |||||
flt_off = group * fl.stride[0] + icg * fl.stride[1] + | |||||
ocg * fl.stride[2] + fh * fl.stride[3] + fw; | |||||
optr[out_off] += scale * fptr[flt_off]; | |||||
} | |||||
} | |||||
} | |||||
#undef FOR3 | |||||
#undef FOR2 | |||||
std::shared_ptr<HostTensorND> out; | |||||
conv_bwd_data_brute({inp[0], inp[1]}, out, param); | |||||
dest[0] = *out; | |||||
}; | }; | ||||
Checker::RunOptions opt; | Checker::RunOptions opt; | ||||
@@ -1547,7 +1660,8 @@ TEST(TestOprDNN, LocalShareForwardExecPolicy) { | |||||
PersistentCacheHook cache_hook{on_get}; | PersistentCacheHook cache_hook{on_get}; | ||||
#if MGB_ENABLE_FASTRUN | #if MGB_ENABLE_FASTRUN | ||||
for (auto strategy: {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE, S::PROFILE_HEURISTIC}) { | |||||
for (auto strategy : {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE, | |||||
S::PROFILE_HEURISTIC}) { | |||||
#else | #else | ||||
for (auto strategy: {S:HEURISTIC, S::PROFILE_HEURISTIC}) { | for (auto strategy: {S:HEURISTIC, S::PROFILE_HEURISTIC}) { | ||||
#endif | #endif | ||||
@@ -2004,29 +2118,34 @@ TEST(TestOprDNN, HeuristicReproducible) { | |||||
.run(inp_tensor(1, 5, 3, 7, 9, 3, 3), opt) | .run(inp_tensor(1, 5, 3, 7, 9, 3, 3), opt) | ||||
.run(inp_tensor(3, 4, 4, 9, 9, 3, 3), opt); | .run(inp_tensor(3, 4, 4, 9, 9, 3, 3), opt); | ||||
auto algo = static_cast<megdnn::ConvolutionBackwardFilter*>( | |||||
auto&& megdnn_opr = static_cast<megdnn::ConvolutionBackwardFilter*>( | |||||
static_cast<opr::ConvolutionBackwardFilter*>( | static_cast<opr::ConvolutionBackwardFilter*>( | ||||
bwd_flt->owner_opr()) | bwd_flt->owner_opr()) | ||||
->megdnn_opr()) | |||||
->execution_policy() | |||||
.algo; | |||||
->megdnn_opr()); | |||||
auto&& algo = megdnn_opr->execution_policy().algo; | |||||
megdnn::Algorithm* palgo = | |||||
megdnn_opr->get_algorithm_from_desc(algo); | |||||
mgb_assert(palgo, "Unknown algo description"); | |||||
if (strategy == S::HEURISTIC_REPRODUCIBLE) { | if (strategy == S::HEURISTIC_REPRODUCIBLE) { | ||||
EXPECT_TRUE(algo.is_reproducible); | |||||
EXPECT_TRUE(palgo->is_reproducible()); | |||||
} | } | ||||
algo_name0 = algo.name.c_str(); | |||||
algo_name0 = palgo->name(); | |||||
} | } | ||||
{ | { | ||||
Checker checker(make_graph, fwd); | Checker checker(make_graph, fwd); | ||||
checker.run(inp_tensor(2, 3, 4, 9, 8, 3, 3), opt) | checker.run(inp_tensor(2, 3, 4, 9, 8, 3, 3), opt) | ||||
.run(inp_tensor(1, 5, 3, 7, 9, 3, 3), opt) | .run(inp_tensor(1, 5, 3, 7, 9, 3, 3), opt) | ||||
.run(inp_tensor(3, 4, 4, 9, 9, 3, 3), opt); | .run(inp_tensor(3, 4, 4, 9, 9, 3, 3), opt); | ||||
auto algo = static_cast<megdnn::ConvolutionBackwardFilter*>( | |||||
static_cast<opr::ConvolutionBackwardFilter*>( | |||||
bwd_flt->owner_opr()) | |||||
->megdnn_opr()) | |||||
->execution_policy() | |||||
.algo; | |||||
algo_name1 = algo.name.c_str(); | |||||
auto&& megdnn_opr = static_cast<megdnn::ConvolutionBackwardFilter*>( | |||||
static_cast<opr::ConvolutionBackwardFilter*>( | |||||
bwd_flt->owner_opr()) | |||||
->megdnn_opr()); | |||||
auto&& algo = megdnn_opr->execution_policy().algo; | |||||
megdnn::Algorithm* palgo = | |||||
megdnn_opr->get_algorithm_from_desc(algo); | |||||
mgb_assert(palgo, "Unknown algo description"); | |||||
algo_name1 = palgo->name(); | |||||
} | } | ||||
EXPECT_TRUE(algo_name0 == algo_name1); | EXPECT_TRUE(algo_name0 == algo_name1); | ||||
} | } | ||||
@@ -2286,6 +2405,8 @@ TEST_F(TestWeightPreprocess, NoPreprocessNeeded) { | |||||
MockAlgorithm algo; | MockAlgorithm algo; | ||||
EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _)) | EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _)) | ||||
.WillRepeatedly(Return(&algo)); | .WillRepeatedly(Return(&algo)); | ||||
EXPECT_CALL(mock, get_algorithm_from_desc(_)) | |||||
.WillRepeatedly(Return(&algo)); | |||||
EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _)) | EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _)) | ||||
.WillRepeatedly(Return(0)); | .WillRepeatedly(Return(0)); | ||||
EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _)) | EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _)) | ||||
@@ -2318,6 +2439,9 @@ TEST_F(TestWeightPreprocess, PreprocessCalledOnlyOnce) { | |||||
EXPECT_CALL(mock, deduce_preprocessed_filter_layout(_, _, _)) | EXPECT_CALL(mock, deduce_preprocessed_filter_layout(_, _, _)) | ||||
.WillRepeatedly(Return(filter_layout)); | .WillRepeatedly(Return(filter_layout)); | ||||
EXPECT_CALL(mock, get_algorithm_from_desc(_)) | |||||
.WillRepeatedly(Return(&algo)); | |||||
Expectation algo_call = | Expectation algo_call = | ||||
EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _)) | EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _)) | ||||
.WillOnce(Return(&algo)); | .WillOnce(Return(&algo)); | ||||
@@ -2349,7 +2473,6 @@ TEST_F(TestWeightPreprocess, PreprocessCalledOnlyOnce) { | |||||
pf->tensors[0].ptr<float>()[0] = 114.514f; | pf->tensors[0].ptr<float>()[0] = 114.514f; | ||||
pf->tensors[1].ptr<float>()[0] = 1926.0817f; | pf->tensors[1].ptr<float>()[0] = 1926.0817f; | ||||
})); | })); | ||||
// Run the graph multiple times. | // Run the graph multiple times. | ||||
for (int i = 0; i < 3; i++) { | for (int i = 0; i < 3; i++) { | ||||
if (i > 0) { | if (i > 0) { | ||||
@@ -2381,6 +2504,8 @@ TEST_F(TestNoWeightPreprocess, NoPreprocess) { | |||||
MockAlgorithm algo; | MockAlgorithm algo; | ||||
EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _)) | EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _)) | ||||
.WillRepeatedly(Return(&algo)); | .WillRepeatedly(Return(&algo)); | ||||
EXPECT_CALL(mock, get_algorithm_from_desc(_)) | |||||
.WillRepeatedly(Return(&algo)); | |||||
EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _)) | EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _)) | ||||
.WillRepeatedly(Return(0)); | .WillRepeatedly(Return(0)); | ||||
EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _)) | EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _)) | ||||
@@ -16,157 +16,157 @@ namespace { | |||||
const char* EXPECTED_TEXT_OUT_REC[3] = { | const char* EXPECTED_TEXT_OUT_REC[3] = { | ||||
// rec level 0 | // rec level 0 | ||||
R"OUTPUT( | R"OUTPUT( | ||||
var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2 | |||||
var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2 | |||||
deps: | deps: | ||||
val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s | val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s | ||||
var1 produced: name=var1 layout={2(3),3(1)} owner_opr=opr0{Host2DeviceCopy} opr0 | |||||
var1 produced: name=var1 layout={2(3),3(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0 | |||||
deps: | deps: | ||||
val: [2.352, 0.1114, -0.2721, 0.7569, -0.2438, ...]min=-0.272 max=2.35 mean=0.471 l2=1.02 sd=0.994 s | val: [2.352, 0.1114, -0.2721, 0.7569, -0.2438, ...]min=-0.272 max=2.35 mean=0.471 l2=1.02 sd=0.994 s | ||||
var17 produced: name=var17 layout={2(3),3(1)} owner_opr=opr16{Elemwise} opr16 | |||||
var17 produced: name=var17 layout={2(3),3(1) Float32} owner_opr=opr16{Elemwise} opr16 | |||||
deps: | deps: | ||||
[i0]var1: [2.352, 0.1114, -0.2721, 0.7569, -0.2438, ...] s | [i0]var1: [2.352, 0.1114, -0.2721, 0.7569, -0.2438, ...] s | ||||
val: [2.352, 0.1114, 0, 0.7569, 0, ...]min=0 max=2.35 mean=0.557 l2=1.01 sd=0.924 s | val: [2.352, 0.1114, 0, 0.7569, 0, ...]min=0 max=2.35 mean=0.557 l2=1.01 sd=0.924 s | ||||
var11 produced: name=var11 layout={1(3),3(1)} owner_opr=opr10{Subtensor} opr10 | |||||
var11 produced: name=var11 layout={1(3),3(1) Float32} owner_opr=opr10{Subtensor} opr10 | |||||
deps: | deps: | ||||
[i0]var1: [2.352, 0.1114, -0.2721, 0.7569, -0.2438, ...] s | [i0]var1: [2.352, 0.1114, -0.2721, 0.7569, -0.2438, ...] s | ||||
[i1]var5: <host value[s]> [0] s | [i1]var5: <host value[s]> [0] s | ||||
[i2]var7: <host value[s]> [1] s | [i2]var7: <host value[s]> [1] s | ||||
val: [2.352, 0.1114, -0.2721]min=-0.272 max=2.35 mean=0.731 l2=1.37 sd=1.42 s | val: [2.352, 0.1114, -0.2721]min=-0.272 max=2.35 mean=0.731 l2=1.37 sd=1.42 s | ||||
var13 produced: name=var13 layout={2(0),3(1)} owner_opr=opr12{Broadcast} opr12 | |||||
var13 produced: name=var13 layout={2(0),3(1) Float32} owner_opr=opr12{Broadcast} opr12 | |||||
deps: | deps: | ||||
[i0]var11: [2.352, 0.1114, -0.2721] s | [i0]var11: [2.352, 0.1114, -0.2721] s | ||||
[i1]var9: <host value[s]> [2, 3] s | [i1]var9: <host value[s]> [2, 3] s | ||||
val: [2.352, 0.1114, -0.2721, 2.352, 0.1114, ...]min=-0.272 max=2.35 mean=0.731 l2=1.37 sd=1.27 s | val: [2.352, 0.1114, -0.2721, 2.352, 0.1114, ...]min=-0.272 max=2.35 mean=0.731 l2=1.37 sd=1.27 s | ||||
var15 produced: name=var15 layout={2(3),3(1)} owner_opr=opr14{Elemwise} opr14 | |||||
var15 produced: name=var15 layout={2(3),3(1) Float32} owner_opr=opr14{Elemwise} opr14 | |||||
deps: | deps: | ||||
[i0]var3: [2] s | [i0]var3: [2] s | ||||
[i1]var13: [2.352, 0.1114, -0.2721, 2.352, 0.1114, ...] s | [i1]var13: [2.352, 0.1114, -0.2721, 2.352, 0.1114, ...] s | ||||
val: [4.352, 2.111, 1.728, 4.352, 2.111, ...]min=1.73 max=4.35 mean=2.73 l2=2.97 sd=1.27 s | val: [4.352, 2.111, 1.728, 4.352, 2.111, ...]min=1.73 max=4.35 mean=2.73 l2=2.97 sd=1.27 s | ||||
var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18 | |||||
var19 produced: name=var19 layout={2(3),3(1) Float32} owner_opr=opr18{Elemwise} opr18 | |||||
deps: | deps: | ||||
[i0]var15: [10.24, 0.2352, 0, 3.294, 0, ...] s | [i0]var15: [10.24, 0.2352, 0, 3.294, 0, ...] s | ||||
[i1]var17: [2.352, 0.1114, 0, 0.7569, 0, ...] s | [i1]var17: [2.352, 0.1114, 0, 0.7569, 0, ...] s | ||||
val: [10.24, 0.2352, 0, 3.294, 0, ...]min=0 max=10.2 mean=2.33 l2=4.39 sd=4.08 s | val: [10.24, 0.2352, 0, 3.294, 0, ...]min=0 max=10.2 mean=2.33 l2=4.39 sd=4.08 s | ||||
var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2 | |||||
var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2 | |||||
deps: | deps: | ||||
val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s | val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s | ||||
var1 produced: name=var1 layout={2(3),3(1)} owner_opr=opr0{Host2DeviceCopy} opr0 | |||||
var1 produced: name=var1 layout={2(3),3(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0 | |||||
deps: | deps: | ||||
val: [0.05521, 0.724, 1.134, -0.2697, -1.545, ...]min=-1.54 max=1.13 mean=-0.105 l2=0.895 sd=0.974 s | val: [0.05521, 0.724, 1.134, -0.2697, -1.545, ...]min=-1.54 max=1.13 mean=-0.105 l2=0.895 sd=0.974 s | ||||
var17 produced: name=var17 layout={2(3),3(1)} owner_opr=opr16{Elemwise} opr16 | |||||
var17 produced: name=var17 layout={2(3),3(1) Float32} owner_opr=opr16{Elemwise} opr16 | |||||
deps: | deps: | ||||
[i0]var1: [0.05521, 0.724, 1.134, -0.2697, -1.545, ...] s | [i0]var1: [0.05521, 0.724, 1.134, -0.2697, -1.545, ...] s | ||||
val: [0.05521, 0.724, 1.134, 0, 0, ...]min=0 max=1.13 mean=0.319 l2=0.55 sd=0.491 s | val: [0.05521, 0.724, 1.134, 0, 0, ...]min=0 max=1.13 mean=0.319 l2=0.55 sd=0.491 s | ||||
var11 produced: name=var11 layout={1(3),3(1)} owner_opr=opr10{Subtensor} opr10 | |||||
var11 produced: name=var11 layout={1(3),3(1) Float32} owner_opr=opr10{Subtensor} opr10 | |||||
deps: | deps: | ||||
[i0]var1: [0.05521, 0.724, 1.134, -0.2697, -1.545, ...] s | [i0]var1: [0.05521, 0.724, 1.134, -0.2697, -1.545, ...] s | ||||
[i1]var5: <host value[s]> [0] s | [i1]var5: <host value[s]> [0] s | ||||
[i2]var7: <host value[s]> [1] s | [i2]var7: <host value[s]> [1] s | ||||
val: [0.05521, 0.724, 1.134]min=0.0552 max=1.13 mean=0.638 l2=0.778 sd=0.545 s | val: [0.05521, 0.724, 1.134]min=0.0552 max=1.13 mean=0.638 l2=0.778 sd=0.545 s | ||||
var13 produced: name=var13 layout={2(0),3(1)} owner_opr=opr12{Broadcast} opr12 | |||||
var13 produced: name=var13 layout={2(0),3(1) Float32} owner_opr=opr12{Broadcast} opr12 | |||||
deps: | deps: | ||||
[i0]var11: [0.05521, 0.724, 1.134] s | [i0]var11: [0.05521, 0.724, 1.134] s | ||||
[i1]var9: <host value[s]> [2, 3] s | [i1]var9: <host value[s]> [2, 3] s | ||||
val: [0.05521, 0.724, 1.134, 0.05521, 0.724, ...]min=0.0552 max=1.13 mean=0.638 l2=0.778 sd=0.487 s | val: [0.05521, 0.724, 1.134, 0.05521, 0.724, ...]min=0.0552 max=1.13 mean=0.638 l2=0.778 sd=0.487 s | ||||
var15 produced: name=var15 layout={2(3),3(1)} owner_opr=opr14{Elemwise} opr14 | |||||
var15 produced: name=var15 layout={2(3),3(1) Float32} owner_opr=opr14{Elemwise} opr14 | |||||
deps: | deps: | ||||
[i0]var3: [2] s | [i0]var3: [2] s | ||||
[i1]var13: [0.05521, 0.724, 1.134, 0.05521, 0.724, ...] s | [i1]var13: [0.05521, 0.724, 1.134, 0.05521, 0.724, ...] s | ||||
val: [2.055, 2.724, 3.134, 2.055, 2.724, ...]min=2.06 max=3.13 mean=2.64 l2=2.68 sd=0.487 s | val: [2.055, 2.724, 3.134, 2.055, 2.724, ...]min=2.06 max=3.13 mean=2.64 l2=2.68 sd=0.487 s | ||||
var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18 | |||||
var19 produced: name=var19 layout={2(3),3(1) Float32} owner_opr=opr18{Elemwise} opr18 | |||||
deps: | deps: | ||||
[i0]var15: [0.1135, 1.972, 3.556, 0, 0, ...] s | [i0]var15: [0.1135, 1.972, 3.556, 0, 0, ...] s | ||||
[i1]var17: [0.05521, 0.724, 1.134, 0, 0, ...] s | [i1]var17: [0.05521, 0.724, 1.134, 0, 0, ...] s | ||||
val: [0.1135, 1.972, 3.556, 0, 0, ...]min=0 max=3.56 mean=0.94 l2=1.66 sd=1.5 s | val: [0.1135, 1.972, 3.556, 0, 0, ...]min=0 max=3.56 mean=0.94 l2=1.66 sd=1.5 s | ||||
var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2 | |||||
var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2 | |||||
deps: | deps: | ||||
val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s | val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s | ||||
var1 produced: name=var1 layout={2(3),3(1)} owner_opr=opr0{Host2DeviceCopy} opr0 | |||||
var1 produced: name=var1 layout={2(3),3(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0 | |||||
deps: | deps: | ||||
val: [-0.5069, 0.4525, 0.1695, -0.02793, -0.1907, ...]min=-0.507 max=1.32 mean=0.203 l2=0.616 sd=0.637 s | val: [-0.5069, 0.4525, 0.1695, -0.02793, -0.1907, ...]min=-0.507 max=1.32 mean=0.203 l2=0.616 sd=0.637 s | ||||
var17 produced: name=var17 layout={2(3),3(1)} owner_opr=opr16{Elemwise} opr16 | |||||
var17 produced: name=var17 layout={2(3),3(1) Float32} owner_opr=opr16{Elemwise} opr16 | |||||
deps: | deps: | ||||
[i0]var1: [-0.5069, 0.4525, 0.1695, -0.02793, -0.1907, ...] s | [i0]var1: [-0.5069, 0.4525, 0.1695, -0.02793, -0.1907, ...] s | ||||
val: [0, 0.4525, 0.1695, 0, 0, ...]min=0 max=1.32 mean=0.324 l2=0.574 sd=0.52 s | val: [0, 0.4525, 0.1695, 0, 0, ...]min=0 max=1.32 mean=0.324 l2=0.574 sd=0.52 s | ||||
var11 produced: name=var11 layout={1(3),3(1)} owner_opr=opr10{Subtensor} opr10 | |||||
var11 produced: name=var11 layout={1(3),3(1) Float32} owner_opr=opr10{Subtensor} opr10 | |||||
deps: | deps: | ||||
[i0]var1: [-0.5069, 0.4525, 0.1695, -0.02793, -0.1907, ...] s | [i0]var1: [-0.5069, 0.4525, 0.1695, -0.02793, -0.1907, ...] s | ||||
[i1]var5: <host value[s]> [0] s | [i1]var5: <host value[s]> [0] s | ||||
[i2]var7: <host value[s]> [1] s | [i2]var7: <host value[s]> [1] s | ||||
val: [-0.5069, 0.4525, 0.1695]min=-0.507 max=0.453 mean=0.0384 l2=0.404 sd=0.493 s | val: [-0.5069, 0.4525, 0.1695]min=-0.507 max=0.453 mean=0.0384 l2=0.404 sd=0.493 s | ||||
var13 produced: name=var13 layout={2(0),3(1)} owner_opr=opr12{Broadcast} opr12 | |||||
var13 produced: name=var13 layout={2(0),3(1) Float32} owner_opr=opr12{Broadcast} opr12 | |||||
deps: | deps: | ||||
[i0]var11: [-0.5069, 0.4525, 0.1695] s | [i0]var11: [-0.5069, 0.4525, 0.1695] s | ||||
[i1]var9: <host value[s]> [2, 3] s | [i1]var9: <host value[s]> [2, 3] s | ||||
val: [-0.5069, 0.4525, 0.1695, -0.5069, 0.4525, ...]min=-0.507 max=0.453 mean=0.0384 l2=0.404 sd=0.441 s | val: [-0.5069, 0.4525, 0.1695, -0.5069, 0.4525, ...]min=-0.507 max=0.453 mean=0.0384 l2=0.404 sd=0.441 s | ||||
var15 produced: name=var15 layout={2(3),3(1)} owner_opr=opr14{Elemwise} opr14 | |||||
var15 produced: name=var15 layout={2(3),3(1) Float32} owner_opr=opr14{Elemwise} opr14 | |||||
deps: | deps: | ||||
[i0]var3: [2] s | [i0]var3: [2] s | ||||
[i1]var13: [-0.5069, 0.4525, 0.1695, -0.5069, 0.4525, ...] s | [i1]var13: [-0.5069, 0.4525, 0.1695, -0.5069, 0.4525, ...] s | ||||
val: [1.493, 2.453, 2.17, 1.493, 2.453, ...]min=1.49 max=2.45 mean=2.04 l2=2.08 sd=0.441 s | val: [1.493, 2.453, 2.17, 1.493, 2.453, ...]min=1.49 max=2.45 mean=2.04 l2=2.08 sd=0.441 s | ||||
var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18 | |||||
var19 produced: name=var19 layout={2(3),3(1) Float32} owner_opr=opr18{Elemwise} opr18 | |||||
deps: | deps: | ||||
[i0]var15: [0, 1.11, 0.3678, 0, 0, ...] s | [i0]var15: [0, 1.11, 0.3678, 0, 0, ...] s | ||||
[i1]var17: [0, 0.4525, 0.1695, 0, 0, ...] s | [i1]var17: [0, 0.4525, 0.1695, 0, 0, ...] s | ||||
val: [0, 1.11, 0.3678, 0, 0, ...]min=0 max=2.87 mean=0.724 l2=1.26 sd=1.13 s | val: [0, 1.11, 0.3678, 0, 0, ...]min=0 max=2.87 mean=0.724 l2=1.26 sd=1.13 s | ||||
var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2 | |||||
var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2 | |||||
deps: | deps: | ||||
val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s | val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s | ||||
var1 produced: name=var1 layout={2(3),3(1)} owner_opr=opr0{Host2DeviceCopy} opr0 | |||||
var1 produced: name=var1 layout={2(3),3(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0 | |||||
deps: | deps: | ||||
val: [-0.03637, 2.111, 0.3236, -0.4861, -2.071, ...]min=-2.07 max=2.11 mean=0.0589 l2=1.25 sd=1.37 s | val: [-0.03637, 2.111, 0.3236, -0.4861, -2.071, ...]min=-2.07 max=2.11 mean=0.0589 l2=1.25 sd=1.37 s | ||||
var17 produced: name=var17 layout={2(3),3(1)} owner_opr=opr16{Elemwise} opr16 | |||||
var17 produced: name=var17 layout={2(3),3(1) Float32} owner_opr=opr16{Elemwise} opr16 | |||||
deps: | deps: | ||||
[i0]var1: [-0.03637, 2.111, 0.3236, -0.4861, -2.071, ...] s | [i0]var1: [-0.03637, 2.111, 0.3236, -0.4861, -2.071, ...] s | ||||
val: [0, 2.111, 0.3236, 0, 0, ...]min=0 max=2.11 mean=0.491 l2=0.897 sd=0.822 s | val: [0, 2.111, 0.3236, 0, 0, ...]min=0 max=2.11 mean=0.491 l2=0.897 sd=0.822 s | ||||
var11 produced: name=var11 layout={1(3),3(1)} owner_opr=opr10{Subtensor} opr10 | |||||
var11 produced: name=var11 layout={1(3),3(1) Float32} owner_opr=opr10{Subtensor} opr10 | |||||
deps: | deps: | ||||
[i0]var1: [-0.03637, 2.111, 0.3236, -0.4861, -2.071, ...] s | [i0]var1: [-0.03637, 2.111, 0.3236, -0.4861, -2.071, ...] s | ||||
[i1]var5: <host value[s]> [0] s | [i1]var5: <host value[s]> [0] s | ||||
[i2]var7: <host value[s]> [1] s | [i2]var7: <host value[s]> [1] s | ||||
val: [-0.03637, 2.111, 0.3236]min=-0.0364 max=2.11 mean=0.799 l2=1.23 sd=1.15 s | val: [-0.03637, 2.111, 0.3236]min=-0.0364 max=2.11 mean=0.799 l2=1.23 sd=1.15 s | ||||
var13 produced: name=var13 layout={2(0),3(1)} owner_opr=opr12{Broadcast} opr12 | |||||
var13 produced: name=var13 layout={2(0),3(1) Float32} owner_opr=opr12{Broadcast} opr12 | |||||
deps: | deps: | ||||
[i0]var11: [-0.03637, 2.111, 0.3236] s | [i0]var11: [-0.03637, 2.111, 0.3236] s | ||||
[i1]var9: <host value[s]> [2, 3] s | [i1]var9: <host value[s]> [2, 3] s | ||||
val: [-0.03637, 2.111, 0.3236, -0.03637, 2.111, ...]min=-0.0364 max=2.11 mean=0.799 l2=1.23 sd=1.03 s | val: [-0.03637, 2.111, 0.3236, -0.03637, 2.111, ...]min=-0.0364 max=2.11 mean=0.799 l2=1.23 sd=1.03 s | ||||
var15 produced: name=var15 layout={2(3),3(1)} owner_opr=opr14{Elemwise} opr14 | |||||
var15 produced: name=var15 layout={2(3),3(1) Float32} owner_opr=opr14{Elemwise} opr14 | |||||
deps: | deps: | ||||
[i0]var3: [2] s | [i0]var3: [2] s | ||||
[i1]var13: [-0.03637, 2.111, 0.3236, -0.03637, 2.111, ...] s | [i1]var13: [-0.03637, 2.111, 0.3236, -0.03637, 2.111, ...] s | ||||
val: [1.964, 4.111, 2.324, 1.964, 4.111, ...]min=1.96 max=4.11 mean=2.8 l2=2.95 sd=1.03 s | val: [1.964, 4.111, 2.324, 1.964, 4.111, ...]min=1.96 max=4.11 mean=2.8 l2=2.95 sd=1.03 s | ||||
var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18 | |||||
var19 produced: name=var19 layout={2(3),3(1) Float32} owner_opr=opr18{Elemwise} opr18 | |||||
deps: | deps: | ||||
[i0]var15: [0, 8.675, 0.7518, 0, 0, ...] s | [i0]var15: [0, 8.675, 0.7518, 0, 0, ...] s | ||||
[i1]var17: [0, 2.111, 0.3236, 0, 0, ...] s | [i1]var17: [0, 2.111, 0.3236, 0, 0, ...] s | ||||
val: [0, 8.675, 0.7518, 0, 0, ...]min=0 max=8.68 mean=1.77 l2=3.59 sd=3.42 s | val: [0, 8.675, 0.7518, 0, 0, ...]min=0 max=8.68 mean=1.77 l2=3.59 sd=3.42 s | ||||
var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2 | |||||
var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2 | |||||
deps: | deps: | ||||
val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s | val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s | ||||
var1 produced: name=var1 layout={5(4),4(1)} owner_opr=opr0{Host2DeviceCopy} opr0 | |||||
var1 produced: name=var1 layout={5(4),4(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0 | |||||
deps: | deps: | ||||
val: [-1.199, -1.02, 1.098, -1.472, -0.3848, ...]min=-2.24 max=1.25 mean=-0.347 l2=1.04 sd=1.01 s | val: [-1.199, -1.02, 1.098, -1.472, -0.3848, ...]min=-2.24 max=1.25 mean=-0.347 l2=1.04 sd=1.01 s | ||||
var17 produced: name=var17 layout={5(4),4(1)} owner_opr=opr16{Elemwise} opr16 | |||||
var17 produced: name=var17 layout={5(4),4(1) Float32} owner_opr=opr16{Elemwise} opr16 | |||||
deps: | deps: | ||||
[i0]var1: [-1.199, -1.02, 1.098, -1.472, -0.3848, ...] s | [i0]var1: [-1.199, -1.02, 1.098, -1.472, -0.3848, ...] s | ||||
val: [0, 0, 1.098, 0, 0, ...]min=0 max=1.25 mean=0.262 l2=0.471 sd=0.402 s | val: [0, 0, 1.098, 0, 0, ...]min=0 max=1.25 mean=0.262 l2=0.471 sd=0.402 s | ||||
var11 produced: name=var11 layout={1(4),4(1)} owner_opr=opr10{Subtensor} opr10 | |||||
var11 produced: name=var11 layout={1(4),4(1) Float32} owner_opr=opr10{Subtensor} opr10 | |||||
deps: | deps: | ||||
[i0]var1: [-1.199, -1.02, 1.098, -1.472, -0.3848, ...] s | [i0]var1: [-1.199, -1.02, 1.098, -1.472, -0.3848, ...] s | ||||
[i1]var5: <host value[s]> [0] s | [i1]var5: <host value[s]> [0] s | ||||
[i2]var7: <host value[s]> [1] s | [i2]var7: <host value[s]> [1] s | ||||
val: [-1.199, -1.02, 1.098, -1.472]min=-1.47 max=1.1 mean=-0.648 l2=1.21 sd=1.18 s | val: [-1.199, -1.02, 1.098, -1.472]min=-1.47 max=1.1 mean=-0.648 l2=1.21 sd=1.18 s | ||||
var13 produced: name=var13 layout={5(0),4(1)} owner_opr=opr12{Broadcast} opr12 | |||||
var13 produced: name=var13 layout={5(0),4(1) Float32} owner_opr=opr12{Broadcast} opr12 | |||||
deps: | deps: | ||||
[i0]var11: [-1.199, -1.02, 1.098, -1.472] s | [i0]var11: [-1.199, -1.02, 1.098, -1.472] s | ||||
[i1]var9: <host value[s]> [5, 4] s | [i1]var9: <host value[s]> [5, 4] s | ||||
val: [-1.199, -1.02, 1.098, -1.472, -1.199, ...]min=-1.47 max=1.1 mean=-0.648 l2=1.21 sd=1.05 s | val: [-1.199, -1.02, 1.098, -1.472, -1.199, ...]min=-1.47 max=1.1 mean=-0.648 l2=1.21 sd=1.05 s | ||||
var15 produced: name=var15 layout={5(4),4(1)} owner_opr=opr14{Elemwise} opr14 | |||||
var15 produced: name=var15 layout={5(4),4(1) Float32} owner_opr=opr14{Elemwise} opr14 | |||||
deps: | deps: | ||||
[i0]var3: [2] s | [i0]var3: [2] s | ||||
[i1]var13: [-1.199, -1.02, 1.098, -1.472, -1.199, ...] s | [i1]var13: [-1.199, -1.02, 1.098, -1.472, -1.199, ...] s | ||||
val: [0.8006, 0.9802, 3.098, 0.5279, 0.8006, ...]min=0.528 max=3.1 mean=1.35 l2=1.69 sd=1.05 s | val: [0.8006, 0.9802, 3.098, 0.5279, 0.8006, ...]min=0.528 max=3.1 mean=1.35 l2=1.69 sd=1.05 s | ||||
var19 produced: name=var19 layout={5(4),4(1)} owner_opr=opr18{Elemwise} opr18 | |||||
var19 produced: name=var19 layout={5(4),4(1) Float32} owner_opr=opr18{Elemwise} opr18 | |||||
deps: | deps: | ||||
[i0]var15: [0, 0, 3.401, 0, 0, ...] s | [i0]var15: [0, 0, 3.401, 0, 0, ...] s | ||||
[i1]var17: [0, 0, 1.098, 0, 0, ...] s | [i1]var17: [0, 0, 1.098, 0, 0, ...] s | ||||
@@ -176,33 +176,33 @@ var19 produced: name=var19 layout={5(4),4(1)} owner_opr=opr18{Elemwise} opr18 | |||||
// rec level 1 | // rec level 1 | ||||
R"OUTPUT( | R"OUTPUT( | ||||
==== begin lazy value recording | ==== begin lazy value recording | ||||
var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2 | |||||
var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2 | |||||
deps: | deps: | ||||
val: <see lazy value below> s | val: <see lazy value below> s | ||||
var1 produced: name=var1 layout={2(3),3(1)} owner_opr=opr0{Host2DeviceCopy} opr0 | |||||
var1 produced: name=var1 layout={2(3),3(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0 | |||||
deps: | deps: | ||||
val: <see lazy value below> s | val: <see lazy value below> s | ||||
var17 produced: name=var17 layout={2(3),3(1)} owner_opr=opr16{Elemwise} opr16 | |||||
var17 produced: name=var17 layout={2(3),3(1) Float32} owner_opr=opr16{Elemwise} opr16 | |||||
deps: | deps: | ||||
[i0]var1: <see lazy value below> s | [i0]var1: <see lazy value below> s | ||||
val: <see lazy value below> s | val: <see lazy value below> s | ||||
var11 produced: name=var11 layout={1(3),3(1)} owner_opr=opr10{Subtensor} opr10 | |||||
var11 produced: name=var11 layout={1(3),3(1) Float32} owner_opr=opr10{Subtensor} opr10 | |||||
deps: | deps: | ||||
[i0]var1: <see lazy value below> s | [i0]var1: <see lazy value below> s | ||||
[i1]var5: <host value[s]> [0] s | [i1]var5: <host value[s]> [0] s | ||||
[i2]var7: <host value[s]> [1] s | [i2]var7: <host value[s]> [1] s | ||||
val: <see lazy value below> s | val: <see lazy value below> s | ||||
var13 produced: name=var13 layout={2(0),3(1)} owner_opr=opr12{Broadcast} opr12 | |||||
var13 produced: name=var13 layout={2(0),3(1) Float32} owner_opr=opr12{Broadcast} opr12 | |||||
deps: | deps: | ||||
[i0]var11: <see lazy value below> s | [i0]var11: <see lazy value below> s | ||||
[i1]var9: <host value[s]> [2, 3] s | [i1]var9: <host value[s]> [2, 3] s | ||||
val: <see lazy value below> s | val: <see lazy value below> s | ||||
var15 produced: name=var15 layout={2(3),3(1)} owner_opr=opr14{Elemwise} opr14 | |||||
var15 produced: name=var15 layout={2(3),3(1) Float32} owner_opr=opr14{Elemwise} opr14 | |||||
deps: | deps: | ||||
[i0]var3: <see lazy value below> s | [i0]var3: <see lazy value below> s | ||||
[i1]var13: <see lazy value below> s | [i1]var13: <see lazy value below> s | ||||
val: <see lazy value below> s | val: <see lazy value below> s | ||||
var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18 | |||||
var19 produced: name=var19 layout={2(3),3(1) Float32} owner_opr=opr18{Elemwise} opr18 | |||||
deps: | deps: | ||||
[i0]var15: <see lazy value below> s | [i0]var15: <see lazy value below> s | ||||
[i1]var17: <see lazy value below> s | [i1]var17: <see lazy value below> s | ||||
@@ -242,33 +242,33 @@ var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18 | |||||
// rec level 2 | // rec level 2 | ||||
R"OUTPUT( | R"OUTPUT( | ||||
==== begin lazy value recording | ==== begin lazy value recording | ||||
var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2 | |||||
var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2 | |||||
deps: | deps: | ||||
val: <see lazy value below> s | val: <see lazy value below> s | ||||
var1 produced: name=var1 layout={2(3),3(1)} owner_opr=opr0{Host2DeviceCopy} opr0 | |||||
var1 produced: name=var1 layout={2(3),3(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0 | |||||
deps: | deps: | ||||
val: <see lazy value below> s | val: <see lazy value below> s | ||||
var17 produced: name=var17 layout={2(3),3(1)} owner_opr=opr16{Elemwise} opr16 | |||||
var17 produced: name=var17 layout={2(3),3(1) Float32} owner_opr=opr16{Elemwise} opr16 | |||||
deps: | deps: | ||||
[i0]var1: <see lazy value below> s | [i0]var1: <see lazy value below> s | ||||
val: <see lazy value below> s | val: <see lazy value below> s | ||||
var11 produced: name=var11 layout={1(3),3(1)} owner_opr=opr10{Subtensor} opr10 | |||||
var11 produced: name=var11 layout={1(3),3(1) Float32} owner_opr=opr10{Subtensor} opr10 | |||||
deps: | deps: | ||||
[i0]var1: <see lazy value below> s | [i0]var1: <see lazy value below> s | ||||
[i1]var5: <host value[s]> [0] s | [i1]var5: <host value[s]> [0] s | ||||
[i2]var7: <host value[s]> [1] s | [i2]var7: <host value[s]> [1] s | ||||
val: <see lazy value below> s | val: <see lazy value below> s | ||||
var13 produced: name=var13 layout={2(0),3(1)} owner_opr=opr12{Broadcast} opr12 | |||||
var13 produced: name=var13 layout={2(0),3(1) Float32} owner_opr=opr12{Broadcast} opr12 | |||||
deps: | deps: | ||||
[i0]var11: <see lazy value below> s | [i0]var11: <see lazy value below> s | ||||
[i1]var9: <host value[s]> [2, 3] s | [i1]var9: <host value[s]> [2, 3] s | ||||
val: <see lazy value below> s | val: <see lazy value below> s | ||||
var15 produced: name=var15 layout={2(3),3(1)} owner_opr=opr14{Elemwise} opr14 | |||||
var15 produced: name=var15 layout={2(3),3(1) Float32} owner_opr=opr14{Elemwise} opr14 | |||||
deps: | deps: | ||||
[i0]var3: <see lazy value below> s | [i0]var3: <see lazy value below> s | ||||
[i1]var13: <see lazy value below> s | [i1]var13: <see lazy value below> s | ||||
val: <see lazy value below> s | val: <see lazy value below> s | ||||
var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18 | |||||
var19 produced: name=var19 layout={2(3),3(1) Float32} owner_opr=opr18{Elemwise} opr18 | |||||
deps: | deps: | ||||
[i0]var15: <see lazy value below> s | [i0]var15: <see lazy value below> s | ||||
[i1]var17: <see lazy value below> s | [i1]var17: <see lazy value below> s | ||||