Browse Source

feat(mgb): add recursive for fastrun and megdnn test

GitOrigin-RevId: 743846f645
tags/v1.3.0
Megvii Engine Team 4 years ago
parent
commit
f14e0c17e7
38 changed files with 1680 additions and 560 deletions
  1. +2
    -0
      dnn/include/megdnn/basic_types.h
  2. +67
    -6
      dnn/include/megdnn/oprs/base.h
  3. +4
    -4
      dnn/src/common/algo_chooser.h
  4. +63
    -8
      dnn/src/common/basic_types.cpp
  5. +3
    -3
      dnn/src/cuda/batched_matrix_mul/brute_force.cpp
  6. +2
    -2
      dnn/src/cuda/conv_bias/bfloat16.cpp
  7. +2
    -6
      dnn/src/cuda/convolution/backward_data/algo.cpp
  8. +8
    -14
      dnn/src/cuda/convolution/backward_data/algo.h
  9. +55
    -37
      dnn/src/cuda/convolution/backward_data/bfloat16.cpp
  10. +2
    -2
      dnn/src/cuda/convolution/backward_filter/bfloat16.cpp
  11. +3
    -3
      dnn/src/cuda/convolution/opr_impl.cpp
  12. +11
    -0
      dnn/src/cuda/convolution/opr_impl.h
  13. +1
    -1
      dnn/src/cuda/matrix_mul/bfloat16.cpp
  14. +3
    -4
      dnn/src/fallback/conv_bias/opr_impl.cpp
  15. +2
    -2
      dnn/src/fallback/convolution/opr_impl.cpp
  16. +1
    -1
      dnn/src/fallback/matrix_mul/opr_impl.cpp
  17. +1
    -1
      dnn/test/common/benchmarker.h
  18. +72
    -28
      dnn/test/common/checker.h
  19. +3
    -3
      dnn/test/common/convolution.cpp
  20. +47
    -0
      dnn/test/common/fast_run_cache.cpp
  21. +58
    -0
      dnn/test/common/fast_run_cache.h
  22. +298
    -67
      dnn/test/common/opr_proxy.h
  23. +1
    -1
      dnn/test/cuda/batch_conv_bias.cpp
  24. +6
    -6
      dnn/test/cuda/chanwise_convolution.cpp
  25. +2
    -2
      dnn/test/cuda/conv_bias_int8.cpp
  26. +95
    -30
      dnn/test/cuda/convolution.cpp
  27. +9
    -9
      dnn/test/cuda/local_share.cpp
  28. +23
    -1
      dnn/test/gtest_main.cpp
  29. +1
    -0
      dnn/test/x86/convolution.cpp
  30. +9
    -8
      src/core/test/graph/misc.cpp
  31. +395
    -142
      src/opr/impl/search_policy/algo_chooser.cpp
  32. +91
    -13
      src/opr/impl/search_policy/profiler.cpp
  33. +2
    -2
      src/opr/include/megbrain/opr/blas.h
  34. +85
    -37
      src/opr/include/megbrain/opr/search_policy/algo_chooser.h
  35. +2
    -2
      src/opr/include/megbrain/opr/search_policy/algo_chooser_helper.h
  36. +12
    -1
      src/opr/include/megbrain/opr/search_policy/profiler.h
  37. +190
    -65
      src/opr/test/dnn/convolution.cpp
  38. +49
    -49
      src/plugin/test/opr_io_dump_text_out.h

+ 2
- 0
dnn/include/megdnn/basic_types.h View File

@@ -330,6 +330,8 @@ struct TensorLayout : public TensorShape {
/* =================== properties =================== */ /* =================== properties =================== */


std::string to_string() const; std::string to_string() const;

std::string serialize() const;
#endif // MEGDNN_CC_HOST #endif // MEGDNN_CC_HOST


/*! /*!


+ 67
- 6
dnn/include/megdnn/oprs/base.h View File

@@ -11,6 +11,7 @@
*/ */
#pragma once #pragma once


#include <type_traits>
#include "megdnn/basic_types.h" #include "megdnn/basic_types.h"
#include "megdnn/handle.h" #include "megdnn/handle.h"


@@ -144,8 +145,11 @@ public:
return {{handle_type(), type(), param()}, name(), is_reproducible()}; return {{handle_type(), type(), param()}, name(), is_reproducible()};
} }


Info::Desc desc() const { return {handle_type(), type(), param()}; }

template <typename T> template <typename T>
static void serialize_write_pod(const T& val, std::string& result) { static void serialize_write_pod(const T& val, std::string& result) {
static_assert(std::is_standard_layout<T>::value, "invalid type");
result.append(reinterpret_cast<const char*>(&val), sizeof(T)); result.append(reinterpret_cast<const char*>(&val), sizeof(T));
} }


@@ -155,6 +159,7 @@ public:


template <typename T> template <typename T>
static T deserialize_read_pod(const std::string& data, size_t offset = 0) { static T deserialize_read_pod(const std::string& data, size_t offset = 0) {
static_assert(std::is_standard_layout<T>::value, "invalid type");
T ret; T ret;
//! A pointer to an object or incomplete type may be converted to a //! A pointer to an object or incomplete type may be converted to a
//! pointer to a different object or incomplete type. If the resulting //! pointer to a different object or incomplete type. If the resulting
@@ -167,10 +172,69 @@ public:
return ret; return ret;
} }


template <typename T>
static T deserialize_read_pod(const char* data, size_t offset = 0) {
static_assert(std::is_standard_layout<T>::value, "invalid type");
T ret;
//! A pointer to an object or incomplete type may be converted to a
//! pointer to a different object or incomplete type. If the resulting
//! pointer is not correctly aligned for the pointed-to type, the
//! behavior is undefined.
//!
//! so here we should use memcpy instead of
//! *reinterpret_cast<const T*>(&data[offset]);
memcpy(&ret, data + offset, sizeof(T));
return ret;
}

enum class OprType : uint32_t {
MATRIX_MUL_FORWARD,
BATCHED_MATRIX_MUL_FORWARD,
CONVOLUTION_FORWARD,
CONVOLUTION_BACKWARD_DATA,
CONVOLUTION_BACKWARD_FILTER,
CONVOLUTION3D_FORWARD,
CONVOLUTION3D_BACKWARD_DATA,
CONVOLUTION3D_BACKWARD_FILTER,
LOCAL_SHARE_FORWARD,
LOCAL_SHARE_BACKWARD_DATA,
LOCAL_SHARE_BACKWARD_FILTER,
DEFORMABLE_CONV_FORWARD,
DEFORMABLE_CONV_BACKWARD_DATA,
DEFORMABLE_CONV_BACKWARD_FILTER,
CONVBIAS_FORWARD,
BATCH_CONV_FORWARD,
};

struct SearchItem {
OprType opr_type;
//! serialized param
std::string param;
TensorLayoutArray layouts;
};

/**
* \brief get subopr list of the algo
*
* \param layouts origin layouts of the parent opr
* \param opr parent opr
*/
virtual std::vector<SearchItem> get_subopr_list(const TensorLayoutArray&,
const OperatorBase*) const {
return {};
}

protected: protected:
Handle::HandleType m_handle_type = Handle::HandleType::NAIVE; Handle::HandleType m_handle_type = Handle::HandleType::NAIVE;
}; };


//! policy for executing the operator
struct ExecutionPolicy {
//! INVALID_ALGO_TYPE algo_type means using heuristic
Algorithm::Info::Desc algo;
std::vector<ExecutionPolicy> sub_policy;
};

/*! /*!
* \brief define Algorithm and ExecutionPolicy for oprs that have * \brief define Algorithm and ExecutionPolicy for oprs that have
* multiple impl algos * multiple impl algos
@@ -198,12 +262,6 @@ public:
*/ */
virtual const char* get_algorithm_set_name() const = 0; virtual const char* get_algorithm_set_name() const = 0;


//! policy for executing the operator
struct ExecutionPolicy {
//! INVALID_ALGO_TYPE algo_type means using heuristic
AlgorithmInfo algo;
};

ExecutionPolicy& execution_policy() { return m_execution_policy; } ExecutionPolicy& execution_policy() { return m_execution_policy; }


const ExecutionPolicy& execution_policy() const { const ExecutionPolicy& execution_policy() const {
@@ -464,6 +522,9 @@ protected:
bool reproducible = false) = 0; bool reproducible = false) = 0;
}; };
} // namespace detail } // namespace detail

using Algorithm = detail::Algorithm;
using ExecutionPolicy = detail::ExecutionPolicy;
} // namespace megdnn } // namespace megdnn


#include "megdnn/internal/visibility_epilogue.h" #include "megdnn/internal/visibility_epilogue.h"


+ 4
- 4
dnn/src/common/algo_chooser.h View File

@@ -25,17 +25,17 @@ namespace megdnn {
*/ */
template <class Opr, typename... Args> template <class Opr, typename... Args>
typename Opr::AlgoBase* get_algorithm(Opr* opr, Args&&... args) { typename Opr::AlgoBase* get_algorithm(Opr* opr, Args&&... args) {
typename Opr::AlgorithmInfo ret;
typename Opr::AlgorithmDesc ret;
auto set = opr->execution_policy().algo; auto set = opr->execution_policy().algo;
if (set.valid()) { if (set.valid()) {
ret = set; ret = set;
} else { } else {
ret = opr->get_algorithm_info_heuristic( ret = opr->get_algorithm_info_heuristic(
std::forward<Args>(args)..., std::numeric_limits<size_t>::max(), std::forward<Args>(args)..., std::numeric_limits<size_t>::max(),
false);
false).desc;
} }
return static_cast<typename Opr::AlgoBase*>( return static_cast<typename Opr::AlgoBase*>(
opr->get_algorithm_from_desc(ret.desc));
opr->get_algorithm_from_desc(ret));
} }


/*! /*!
@@ -46,7 +46,7 @@ template <class Opr, typename... Args>
typename Opr::AlgoBase* get_algorithm_or_construct(Opr* opr, Args&&... args) { typename Opr::AlgoBase* get_algorithm_or_construct(Opr* opr, Args&&... args) {
auto set = opr->execution_policy().algo; auto set = opr->execution_policy().algo;
if (set.valid()) { if (set.valid()) {
return opr->algo_pack().construct_and_get_algo(set.desc);
return opr->algo_pack().construct_and_get_algo(set);
} else { } else {
return static_cast<typename Opr::AlgoBase*>( return static_cast<typename Opr::AlgoBase*>(
opr->get_algorithm_heuristic(std::forward<Args>(args)..., opr->get_algorithm_heuristic(std::forward<Args>(args)...,


+ 63
- 8
dnn/src/common/basic_types.cpp View File

@@ -20,6 +20,7 @@
#include <mutex> #include <mutex>
#include <numeric> #include <numeric>
#include <tuple> #include <tuple>
#include <type_traits>


using namespace megdnn; using namespace megdnn;


@@ -35,6 +36,26 @@ class DefaultErrorHandler final : public ErrorHandler {
#endif #endif
} }
}; };

template <typename T>
void serialize_pod(const T& val, std::string& result) {
static_assert(std::is_standard_layout<T>::value, "invalid type");
result.append(reinterpret_cast<const char*>(&val), sizeof(T));
}

template <typename T>
void serialize_vec(const T* val, size_t size, std::string& result) {
result.append(reinterpret_cast<const char*>(val), sizeof(T) * size);
}

template <typename T>
T deserialize_pod(const std::string& data, size_t& offset) {
T ret;
memcpy(&ret, data.data() + offset, sizeof(T));
offset += sizeof(T);
return ret;
}

} // namespace } // namespace
ErrorHandler* ErrorHandler::sm_inst; ErrorHandler* ErrorHandler::sm_inst;


@@ -126,17 +147,23 @@ bool TensorShape::eq_shape(const TensorShape& rhs) const {
size_t eq = 0; size_t eq = 0;
switch (ndim) { switch (ndim) {
case 7: case 7:
eq += shape[6] == rhs.shape[6]; MEGDNN_FALLTHRU
eq += shape[6] == rhs.shape[6];
MEGDNN_FALLTHRU
case 6: case 6:
eq += shape[5] == rhs.shape[5]; MEGDNN_FALLTHRU
eq += shape[5] == rhs.shape[5];
MEGDNN_FALLTHRU
case 5: case 5:
eq += shape[4] == rhs.shape[4]; MEGDNN_FALLTHRU
eq += shape[4] == rhs.shape[4];
MEGDNN_FALLTHRU
case 4: case 4:
eq += shape[3] == rhs.shape[3]; MEGDNN_FALLTHRU
eq += shape[3] == rhs.shape[3];
MEGDNN_FALLTHRU
case 3: case 3:
eq += shape[2] == rhs.shape[2]; MEGDNN_FALLTHRU
eq += shape[2] == rhs.shape[2];
MEGDNN_FALLTHRU
case 2: case 2:
eq += shape[1] == rhs.shape[1]; MEGDNN_FALLTHRU
eq += shape[1] == rhs.shape[1];
MEGDNN_FALLTHRU
case 1: case 1:
eq += shape[0] == rhs.shape[0]; eq += shape[0] == rhs.shape[0];
} }
@@ -435,8 +462,8 @@ bool TensorLayout::try_reshape(TensorLayout& result,
for (size_t i = 0; i < tshp.ndim; ++i) { for (size_t i = 0; i < tshp.ndim; ++i) {
if (!tshp.shape[i]) { if (!tshp.shape[i]) {
megdnn_throw_if(!format.is_default(), tensor_reshape_error, megdnn_throw_if(!format.is_default(), tensor_reshape_error,
megdnn_mangle(ssprintf("bad target tshp: %s",
tshp.to_string().c_str())));
megdnn_mangle(ssprintf("bad target tshp: %s",
tshp.to_string().c_str())));
is_empty_shape = true; is_empty_shape = true;
break; break;
} }
@@ -510,8 +537,36 @@ std::string TensorLayout::to_string() const {
rst.append(" @ "); rst.append(" @ ");
rst.append(format.impl()->to_string()); rst.append(format.impl()->to_string());
} }
rst.append(std::string(" ") + dtype.name());
rst.append("}"); rst.append("}");
return rst; return rst;
} }


std::string TensorLayout::serialize() const {
std::string rst;
serialize_pod<size_t>(ndim, rst);
serialize_vec<size_t>(shape, ndim, rst);
serialize_vec<ptrdiff_t>(stride, ndim, rst);
rst.append(format.impl()->to_string());

//! serialize dtype
serialize_pod(dtype.enumv(), rst);
if (dtype.has_param()) {
switch (dtype.enumv()) {
#define cb(_dt) \
case DTypeTrait<dtype::_dt>::enumv: \
serialize_pod(dtype::_dt::downcast_from(dtype).param(), rst); \
break;
MEGDNN_FOREACH_PARAMETERIZED_DTYPE(cb)
#undef cb
default:
megdnn_assert(false,
"cannot serialize unknown parameterized DType");
break;
}
}

return rst;
}

// vim: syntax=cpp.doxygen // vim: syntax=cpp.doxygen

+ 3
- 3
dnn/src/cuda/batched_matrix_mul/brute_force.cpp View File

@@ -24,7 +24,7 @@ bool BatchedMatrixMulForwardImpl::AlgoBruteForce::is_available(
const SizeArgs& args) const { const SizeArgs& args) const {
MatrixMulForwardImpl mm{args.opr->handle()}; MatrixMulForwardImpl mm{args.opr->handle()};
mm.param() = {args.opr->param().transposeA, args.opr->param().transposeB}; mm.param() = {args.opr->param().transposeA, args.opr->param().transposeB};
mm.execution_policy() = {m_algorithm->info()};
mm.execution_policy() = {m_algorithm->desc(), {}};


auto mm_layout_a = args.layout_a.remove_axis(0); auto mm_layout_a = args.layout_a.remove_axis(0);
auto mm_layout_b = args.layout_b.remove_axis(0); auto mm_layout_b = args.layout_b.remove_axis(0);
@@ -39,7 +39,7 @@ size_t BatchedMatrixMulForwardImpl::AlgoBruteForce::get_workspace_in_bytes(
auto mm_opr = args.opr->handle()->create_operator<MatrixMulForward>(); auto mm_opr = args.opr->handle()->create_operator<MatrixMulForward>();
mm_opr->param() = {args.opr->param().transposeA, mm_opr->param() = {args.opr->param().transposeA,
args.opr->param().transposeB}; args.opr->param().transposeB};
mm_opr->execution_policy() = {m_algorithm->info()};
mm_opr->execution_policy() = {m_algorithm->desc(), {}};


return mm_opr->get_workspace_in_bytes(args.layout_a, args.layout_b, return mm_opr->get_workspace_in_bytes(args.layout_a, args.layout_b,
args.layout_c); args.layout_c);
@@ -50,7 +50,7 @@ void BatchedMatrixMulForwardImpl::AlgoBruteForce::exec(
auto&& mm_opr = args.opr->handle()->create_operator<MatrixMulForward>(); auto&& mm_opr = args.opr->handle()->create_operator<MatrixMulForward>();
mm_opr->param() = {args.opr->param().transposeA, mm_opr->param() = {args.opr->param().transposeA,
args.opr->param().transposeB}; args.opr->param().transposeB};
mm_opr->execution_policy() = {m_algorithm->info()};
mm_opr->execution_policy() = {m_algorithm->desc(), {}};
rep(n, N) { rep(n, N) {
TensorND A_, B_, C_; TensorND A_, B_, C_;
auto tensor_n_from_batch = [n](const TensorND& in, TensorND& out) { auto tensor_n_from_batch = [n](const TensorND& in, TensorND& out) {


+ 2
- 2
dnn/src/cuda/conv_bias/bfloat16.cpp View File

@@ -47,7 +47,7 @@ ConvBiasForwardImpl::AlgoBFloat16::float_args(
change_dtype(fdst); change_dtype(fdst);
opr->param() = args.opr->param(); opr->param() = args.opr->param();
opr->param().compute_mode = Param::ComputeMode::DEFAULT; opr->param().compute_mode = Param::ComputeMode::DEFAULT;
opr->execution_policy() = {m_impl->info()};
opr->execution_policy() = {m_impl->desc(), {}};
return SizeArgs(opr, fsrc, ffilter, fbias, fz, fdst); return SizeArgs(opr, fsrc, ffilter, fbias, fz, fdst);
} }


@@ -110,7 +110,7 @@ void ConvBiasForwardImpl::AlgoBFloat16::exec(const ExecArgs& args) const {
auto convbias_opr = args.handle->create_operator<ConvBias>(); auto convbias_opr = args.handle->create_operator<ConvBias>();
convbias_opr->param() = args.opr->param(); convbias_opr->param() = args.opr->param();
convbias_opr->param().compute_mode = Param::ComputeMode::DEFAULT; convbias_opr->param().compute_mode = Param::ComputeMode::DEFAULT;
convbias_opr->execution_policy() = {m_impl->info()};
convbias_opr->execution_policy() = {m_impl->desc(), {}};
convbias_opr->exec(fsrc_tensor, ffilter_tensor, fbias_tensor, fz_tensor, convbias_opr->exec(fsrc_tensor, ffilter_tensor, fbias_tensor, fz_tensor,
fdst_tensor, nullptr, cvter.workspace()); fdst_tensor, nullptr, cvter.workspace());
} }


+ 2
- 6
dnn/src/cuda/convolution/backward_data/algo.cpp View File

@@ -46,12 +46,8 @@ ConvolutionBackwardDataImpl::AlgoPack::AlgoPack() {
megdnn_assert(all_algos_data == all_algos.data()); megdnn_assert(all_algos_data == all_algos.data());


non_cudnn_algos.push_back(all_algos.rbegin()[0]); // group matmul non_cudnn_algos.push_back(all_algos.rbegin()[0]); // group matmul
size_t algo_size = all_algos.size();
for (size_t i=0; i<algo_size; ++i) {
bfloat16_refhold.emplace_back(new AlgoBFloat16(all_algos[i]));
all_algos.push_back(bfloat16_refhold.back().get());
bfloat16_algos.push_back(bfloat16_refhold.back().get());
}
all_algos.push_back(&bfloat16);
bfloat16_algos.push_back(&bfloat16);


for (auto&& algo : all_algos) { for (auto&& algo : all_algos) {
m_all_algos_map.emplace(algo->info().desc, algo); m_all_algos_map.emplace(algo->info().desc, algo);


+ 8
- 14
dnn/src/cuda/convolution/backward_data/algo.h View File

@@ -170,28 +170,22 @@ public:


class ConvolutionBackwardDataImpl::AlgoBFloat16 final : public AlgoBase { class ConvolutionBackwardDataImpl::AlgoBFloat16 final : public AlgoBase {
public: public:
AlgoBFloat16(ConvolutionBackwardDataImpl::AlgoBase*);
bool is_available(const SizeArgs& args) const override; bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& args) const override; size_t get_workspace_in_bytes(const SizeArgs& args) const override;
void exec(const ExecArgs& args) const override; void exec(const ExecArgs& args) const override;


const char* name() const override { return m_name.c_str(); }
std::vector<SearchItem> get_subopr_list(
const TensorLayoutArray& layouts,
const OperatorBase* opr) const override;

const char* name() const override {
return "CONVOLUTION_BACKWARD_DATD_BFLOAT16";
}
bool is_reproducible() const override { return true; } bool is_reproducible() const override { return true; }


private: private:
std::string m_name;
ConvolutionBackwardDataImpl::AlgoBase* m_algorithm = nullptr;
SizeArgs float_args(const SizeArgs& args, ConvolutionBackwardDataImpl* opr,
TensorLayout& fsrc, TensorLayout& ffilter,
TensorLayout& fdst) const;
WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const; WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
MEGDNN_DECL_ALGO_TYPE(CUDA_BFLOAT16) MEGDNN_DECL_ALGO_TYPE(CUDA_BFLOAT16)

std::string param() const override {
std::string ret;
serialize_write_pod(m_algorithm, ret);
return ret;
}
}; };


//! implement group conv by another algo //! implement group conv by another algo
@@ -237,7 +231,7 @@ public:
AlgoChanwiseSmall chanwise_small; AlgoChanwiseSmall chanwise_small;
std::vector<AlgoGroupConvGeneral> gconv; std::vector<AlgoGroupConvGeneral> gconv;
std::unordered_map<AlgoBase*, AlgoGroupConvGeneral*> algo2gconv; std::unordered_map<AlgoBase*, AlgoGroupConvGeneral*> algo2gconv;
std::vector<std::unique_ptr<AlgoBFloat16>> bfloat16_refhold;
AlgoBFloat16 bfloat16;


std::vector<AlgoBase*> std::vector<AlgoBase*>
//! all algorithms //! all algorithms


+ 55
- 37
dnn/src/cuda/convolution/backward_data/bfloat16.cpp View File

@@ -17,33 +17,39 @@ using namespace megdnn;
using namespace cuda; using namespace cuda;
using namespace convolution; using namespace convolution;


ConvolutionBackwardDataImpl::AlgoBFloat16::AlgoBFloat16(
ConvolutionBackwardDataImpl::AlgoBase* algorithm)
: m_algorithm(algorithm) {
megdnn_assert_internal(algorithm);
m_name = ssprintf("CONVOLUTION_BACKWARD_DATD_BFLOAT16:%s",
m_algorithm->name());
}

ConvolutionBackwardDataImpl::AlgoBase::SizeArgs
ConvolutionBackwardDataImpl::AlgoBFloat16::float_args(
const SizeArgs& args, ConvolutionBackwardDataImpl* opr,
TensorLayout& ffilter, TensorLayout& fdiff, TensorLayout& fgrad) const {
ffilter = *args.filter_layout;
fdiff = *args.diff_layout;
fgrad = *args.grad_layout;
namespace {
std::pair<TensorLayoutArray, ConvolutionBackwardDataImpl::Param> sub_opr_config(
const TensorLayoutArray& layouts,
const ConvolutionBackwardDataImpl* opr) {
megdnn_assert(layouts.size() >= 3);
std::pair<TensorLayoutArray, ConvolutionBackwardDataImpl::Param> ret;
ret.first = layouts;
auto change_dtype = [](TensorLayout& layout) { auto change_dtype = [](TensorLayout& layout) {
if (layout.dtype == dtype::BFloat16()) { if (layout.dtype == dtype::BFloat16()) {
layout.dtype = dtype::Float32(); layout.dtype = dtype::Float32();
} }
}; };
change_dtype(ffilter);
change_dtype(fdiff);
change_dtype(fgrad);
opr->param() = args.opr->param();
opr->param().compute_mode = Param::ComputeMode::DEFAULT;
opr->execution_policy() = {m_algorithm->info()};
return SizeArgs(opr, ffilter, fdiff, fgrad);
change_dtype(ret.first[0]);
change_dtype(ret.first[1]);
change_dtype(ret.first[2]);

ret.second = opr->param();
ret.second.compute_mode =
ConvolutionBackwardData::Param::ComputeMode::DEFAULT;
return ret;
}
}

std::vector<Algorithm::SearchItem>
ConvolutionBackwardDataImpl::AlgoBFloat16::get_subopr_list(
const TensorLayoutArray& layouts, const OperatorBase* opr) const {
auto&& config = sub_opr_config(
layouts, static_cast<const ConvolutionBackwardDataImpl*>(opr));

std::string param_str;
Algorithm::serialize_write_pod(config.second, param_str);
return {{Algorithm::OprType::CONVOLUTION_BACKWARD_DATA, param_str,
config.first}};
} }


bool ConvolutionBackwardDataImpl::AlgoBFloat16::is_available( bool ConvolutionBackwardDataImpl::AlgoBFloat16::is_available(
@@ -51,24 +57,30 @@ bool ConvolutionBackwardDataImpl::AlgoBFloat16::is_available(
TensorLayout ffilter, fdiff, fgrad; TensorLayout ffilter, fdiff, fgrad;
auto conv_back_data_opr = auto conv_back_data_opr =
args.handle->create_operator<ConvolutionBackwardData>(); args.handle->create_operator<ConvolutionBackwardData>();
SizeArgs fargs = float_args(
args,
static_cast<ConvolutionBackwardDataImpl*>(conv_back_data_opr.get()),
ffilter, fdiff, fgrad);
auto&& config = sub_opr_config(
{*args.filter_layout, *args.diff_layout, *args.grad_layout},
args.opr);
conv_back_data_opr->param() = config.second;
return args.diff_layout->dtype == args.filter_layout->dtype && return args.diff_layout->dtype == args.filter_layout->dtype &&
args.diff_layout->dtype == dtype::BFloat16() && args.diff_layout->dtype == dtype::BFloat16() &&
m_algorithm->is_available(fargs);
get_algorithm(static_cast<ConvolutionBackwardDataImpl*>(
conv_back_data_opr.get()),
config.first[0], config.first[1], config.first[2]);
} }


WorkspaceBundle ConvolutionBackwardDataImpl::AlgoBFloat16::get_workspace_bundle( WorkspaceBundle ConvolutionBackwardDataImpl::AlgoBFloat16::get_workspace_bundle(
void* ptr, const SizeArgs& args) const { void* ptr, const SizeArgs& args) const {
TensorLayout ffilter, fdiff, fgrad;
auto conv_back_data_opr = auto conv_back_data_opr =
args.handle->create_operator<ConvolutionBackwardData>(); args.handle->create_operator<ConvolutionBackwardData>();
SizeArgs fargs = float_args(
args,
static_cast<ConvolutionBackwardDataImpl*>(conv_back_data_opr.get()),
ffilter, fdiff, fgrad);
if (args.opr->execution_policy().algo.valid()) {
megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1);
conv_back_data_opr->execution_policy() =
args.opr->execution_policy().sub_policy[0];
}
auto&& config = sub_opr_config(
{*args.filter_layout, *args.diff_layout, *args.grad_layout},
args.opr);
conv_back_data_opr->param() = config.second;
SmallVector<size_t> sizes; SmallVector<size_t> sizes;
auto get_workspace = [&sizes](const TensorLayout& src, auto get_workspace = [&sizes](const TensorLayout& src,
const TensorLayout& dst) { const TensorLayout& dst) {
@@ -76,10 +88,12 @@ WorkspaceBundle ConvolutionBackwardDataImpl::AlgoBFloat16::get_workspace_bundle(
sizes.push_back(dst.span().dist_byte()); sizes.push_back(dst.span().dist_byte());
} }
}; };
get_workspace(*args.filter_layout, ffilter);
get_workspace(*args.diff_layout, fdiff);
get_workspace(*args.grad_layout, fgrad);
sizes.push_back(m_algorithm->get_workspace_in_bytes(fargs));
get_workspace(*args.filter_layout, config.first[0]);
get_workspace(*args.diff_layout, config.first[1]);
get_workspace(*args.grad_layout, config.first[2]);

sizes.push_back(conv_back_data_opr->get_workspace_in_bytes(
config.first[0], config.first[1], config.first[2]));
return {ptr, std::move(sizes)}; return {ptr, std::move(sizes)};
} }


@@ -103,9 +117,13 @@ void ConvolutionBackwardDataImpl::AlgoBFloat16::exec(
{ {
auto conv_back_data_opr = auto conv_back_data_opr =
args.handle->create_operator<ConvolutionBackwardData>(); args.handle->create_operator<ConvolutionBackwardData>();
if (args.opr->execution_policy().algo.valid()) {
megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1);
conv_back_data_opr->execution_policy() =
args.opr->execution_policy().sub_policy[0];
}
conv_back_data_opr->param() = args.opr->param(); conv_back_data_opr->param() = args.opr->param();
conv_back_data_opr->param().compute_mode = Param::ComputeMode::DEFAULT; conv_back_data_opr->param().compute_mode = Param::ComputeMode::DEFAULT;
conv_back_data_opr->execution_policy() = {m_algorithm->info()};
conv_back_data_opr->exec(ffilter_tensor, fdiff_tensor, fgrad_tensor, conv_back_data_opr->exec(ffilter_tensor, fdiff_tensor, fgrad_tensor,
cvter.workspace()); cvter.workspace());
} }


+ 2
- 2
dnn/src/cuda/convolution/backward_filter/bfloat16.cpp View File

@@ -42,7 +42,7 @@ ConvolutionBackwardFilterImpl::AlgoBFloat16::float_args(
change_dtype(fgrad); change_dtype(fgrad);
opr->param() = args.opr->param(); opr->param() = args.opr->param();
opr->param().compute_mode = Param::ComputeMode::DEFAULT; opr->param().compute_mode = Param::ComputeMode::DEFAULT;
opr->execution_policy() = {m_algorithm->info()};
opr->execution_policy() = {m_algorithm->desc(), {}};
return SizeArgs(opr, fsrc, fdiff, fgrad); return SizeArgs(opr, fsrc, fdiff, fgrad);
} }


@@ -107,7 +107,7 @@ void ConvolutionBackwardFilterImpl::AlgoBFloat16::exec(
conv_back_filter_opr->param() = args.opr->param(); conv_back_filter_opr->param() = args.opr->param();
conv_back_filter_opr->param().compute_mode = conv_back_filter_opr->param().compute_mode =
Param::ComputeMode::DEFAULT; Param::ComputeMode::DEFAULT;
conv_back_filter_opr->execution_policy() = {m_algorithm->info()};
conv_back_filter_opr->execution_policy() = {m_algorithm->desc(), {}};
conv_back_filter_opr->exec(fsrc_tensor, fdiff_tensor, fgrad_tensor, conv_back_filter_opr->exec(fsrc_tensor, fdiff_tensor, fgrad_tensor,
cvter.workspace()); cvter.workspace());
} }


+ 3
- 3
dnn/src/cuda/convolution/opr_impl.cpp View File

@@ -69,7 +69,7 @@ ConvolutionForwardImpl::conv_bias_extra_data(const TensorLayout& src,
conv_param.dilate_h, conv_param.dilate_h,
conv_param.dilate_w, conv_param.dilate_w,
conv_param.compute_mode}; conv_param.compute_mode};
ret.convbias_opr->execution_policy() = {this->execution_policy().algo};
ret.convbias_opr->execution_policy() = {this->execution_policy().algo, {}};
return ret; return ret;
} }


@@ -102,7 +102,7 @@ ConvolutionForwardImpl::get_algorithm_from_desc(
conv_param.dilate_h, conv_param.dilate_h,
conv_param.dilate_w, conv_param.dilate_w,
conv_param.compute_mode}; conv_param.compute_mode};
convbias_opr->execution_policy() = {this->execution_policy().algo};
convbias_opr->execution_policy() = {this->execution_policy().algo, {}};


return static_cast<ConvBiasForwardImpl*>(convbias_opr.get()) return static_cast<ConvBiasForwardImpl*>(convbias_opr.get())
->get_algorithm_from_desc(desc); ->get_algorithm_from_desc(desc);
@@ -160,7 +160,7 @@ void ConvolutionBackwardDataImpl::exec(_megdnn_tensor_in filter,
_megdnn_tensor_out grad, _megdnn_tensor_out grad,
_megdnn_workspace workspace) { _megdnn_workspace workspace) {
AlgoBase::ExecArgs args(this, filter, diff, grad, workspace); AlgoBase::ExecArgs args(this, filter, diff, grad, workspace);
auto algo = get_algorithm(this, filter.layout, args.filter_meta,
auto algo = get_algorithm(this, filter.layout,
diff.layout, grad.layout); diff.layout, grad.layout);
algo->check_workspace(args, workspace).exec(args); algo->check_workspace(args, workspace).exec(args);
} }


+ 11
- 0
dnn/src/cuda/convolution/opr_impl.h View File

@@ -83,6 +83,17 @@ public:
workspace_limit_in_bytes, reproducible) workspace_limit_in_bytes, reproducible)
->info(); ->info();
} }

AlgorithmInfo get_algorithm_info_heuristic(const TensorLayout& filter,
const TensorLayout& diff,
const TensorLayout& grad,
size_t workspace_limit_in_bytes,
bool reproducible) {
return get_algorithm_heuristic(filter, diff, grad,
workspace_limit_in_bytes, reproducible)
->info();
}

size_t get_workspace_in_bytes(const TensorLayout& filter, size_t get_workspace_in_bytes(const TensorLayout& filter,
const TensorLayout& diff, const TensorLayout& diff,
const TensorLayout& grad) override; const TensorLayout& grad) override;


+ 1
- 1
dnn/src/cuda/matrix_mul/bfloat16.cpp View File

@@ -82,7 +82,7 @@ void MatrixMulForwardImpl::AlgoBFloat16::exec(const ExecArgs& args) const {
args.opr->handle()->create_operator<MatrixMulForward>(); args.opr->handle()->create_operator<MatrixMulForward>();
matmul_opr->param() = args.opr->param(); matmul_opr->param() = args.opr->param();
matmul_opr->param().compute_mode = Param::ComputeMode::DEFAULT; matmul_opr->param().compute_mode = Param::ComputeMode::DEFAULT;
matmul_opr->execution_policy() = {m_algorithm->info()};
matmul_opr->execution_policy() = {m_algorithm->desc(), {}};
matmul_opr->exec(a, b, c, ctypecvt.workspace()); matmul_opr->exec(a, b, c, ctypecvt.workspace());
} }
ctypecvt.comp_to_dst_type(c, args.tensor_c); ctypecvt.comp_to_dst_type(c, args.tensor_c);


+ 3
- 4
dnn/src/fallback/conv_bias/opr_impl.cpp View File

@@ -1,6 +1,5 @@
/** /**
* \file dnn/src/fallback/conv_bias/opr_impl.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
g * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
* *
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
* *
@@ -367,7 +366,7 @@ ConvBiasImpl::NCBKernSizeParam ConvBiasImpl::make_ncb_kern_size_param(
"should be equal"); "should be equal");
auto&& fm = check_layout_fwd(src, filter, dst); auto&& fm = check_layout_fwd(src, filter, dst);
auto& conv_fm = reinterpret_cast<ConvolutionImpl::CanonizedFilterMeta&>(fm); auto& conv_fm = reinterpret_cast<ConvolutionImpl::CanonizedFilterMeta&>(fm);
size_t nr_threads = static_cast<naive::HandleImpl*>(handle()) size_t nr_threads = static_cast<naive::HandleImpl*>(handle())
->megcore_dispatcher() ->megcore_dispatcher()
->nr_threads(); ->nr_threads();
@@ -495,7 +494,7 @@ ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm_from_desc(


ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm( ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm(
const NCBKernSizeParam& param, size_t workspace_size) { const NCBKernSizeParam& param, size_t workspace_size) {
if (auto algo = get_algorithm_from_desc(execution_policy().algo.desc)) {
if (auto algo = get_algorithm_from_desc(execution_policy().algo)) {
return algo; return algo;
} }
if (!m_prev_selected_algo || if (!m_prev_selected_algo ||


+ 2
- 2
dnn/src/fallback/convolution/opr_impl.cpp View File

@@ -387,7 +387,7 @@ ConvolutionImpl::Algorithm* ConvolutionImpl::get_algorithm_from_desc(


ConvolutionImpl::Algorithm* ConvolutionImpl::get_algorithm( ConvolutionImpl::Algorithm* ConvolutionImpl::get_algorithm(
const NCBKernSizeParam& param, size_t workspace_size) { const NCBKernSizeParam& param, size_t workspace_size) {
if (auto algo = get_algorithm_from_desc(execution_policy().algo.desc)) {
if (auto algo = get_algorithm_from_desc(execution_policy().algo)) {
return algo; return algo;
} }
if (!m_prev_selected_algo || if (!m_prev_selected_algo ||
@@ -783,7 +783,7 @@ ConvolutionBackwardDataImpl::get_algorithm_from_desc(


ConvolutionBackwardDataImpl::Algorithm* ConvolutionBackwardDataImpl::Algorithm*
ConvolutionBackwardDataImpl::get_algorithm(const NCBKernSizeParam& param) { ConvolutionBackwardDataImpl::get_algorithm(const NCBKernSizeParam& param) {
if (auto algo = get_algorithm_from_desc(execution_policy().algo.desc)) {
if (auto algo = get_algorithm_from_desc(execution_policy().algo)) {
return algo; return algo;
} }
if (!m_prev_selected_algo || if (!m_prev_selected_algo ||


+ 1
- 1
dnn/src/fallback/matrix_mul/opr_impl.cpp View File

@@ -134,7 +134,7 @@ MatrixMul::Algorithm* MatrixMulImpl::get_algorithm_heuristic(
size_t workspace_limit_in_bytes, bool reproducible) { size_t workspace_limit_in_bytes, bool reproducible) {
auto kern_size_param = make_kern_size_param(A, B, C); auto kern_size_param = make_kern_size_param(A, B, C);
if (auto algo = static_cast<AlgoBase*>( if (auto algo = static_cast<AlgoBase*>(
get_algorithm_from_desc(execution_policy().algo.desc))) {
get_algorithm_from_desc(execution_policy().algo))) {
megdnn_assert(algo->get_workspace(kern_size_param) < megdnn_assert(algo->get_workspace(kern_size_param) <
workspace_limit_in_bytes); workspace_limit_in_bytes);
auto cur = megdnn::get_reproducible_algo<MatrixMulImpl>(algo, auto cur = megdnn::get_reproducible_algo<MatrixMulImpl>(algo,


+ 1
- 1
dnn/test/common/benchmarker.h View File

@@ -382,7 +382,7 @@ float algo_benchmark(Benchmarker<Opr, T>& benchmark, TensorLayoutArray layouts,
for (auto i : algos) { for (auto i : algos) {
if (std::regex_match(i.name, if (std::regex_match(i.name,
std::regex("(" + algo_base + ")(.*)"))) { std::regex("(" + algo_base + ")(.*)"))) {
opr->execution_policy().algo = i;
opr->execution_policy().algo = i.desc;
auto used = benchmark.exec(layouts); auto used = benchmark.exec(layouts);
min_used = std::min(min_used, used); min_used = std::min(min_used, used);
printf("run algo: %s used: %f ms min_used: %f ms\n", i.name.c_str(), printf("run algo: %s used: %f ms min_used: %f ms\n", i.name.c_str(),


+ 72
- 28
dnn/test/common/checker.h View File

@@ -242,6 +242,11 @@ public:
return *this; return *this;
} }


Checker& reset_before_exec_callback() {
m_before_exec_callback = nullptr;
return *this;
}

//! set a tensors constraints function, for the purpose of manipulating //! set a tensors constraints function, for the purpose of manipulating
//! tensors when testing. //! tensors when testing.
Checker& set_tensors_constraint( Checker& set_tensors_constraint(
@@ -435,6 +440,17 @@ public:
Testcase operator=(const Testcase&) = delete; Testcase operator=(const Testcase&) = delete;
}; };


struct ExecutionPolicyAlgoName {
std::string name;
std::vector<ExecutionPolicyAlgoName> sub_policy_names;

ExecutionPolicyAlgoName(const char* name) : name{name} {}

ExecutionPolicyAlgoName(
const char* name,
const std::vector<ExecutionPolicyAlgoName>& sub_policy)
: name{name}, sub_policy_names{sub_policy} {}
};
/*! /*!
* \brief a callable to check that given algorithm is used for heuristic * \brief a callable to check that given algorithm is used for heuristic
* \param require_algo if its value is true, then requires * \param require_algo if its value is true, then requires
@@ -444,48 +460,76 @@ public:
*/ */
template <class Opr, typename OprAlgoProxy = OprAlgoProxy<Opr>> template <class Opr, typename OprAlgoProxy = OprAlgoProxy<Opr>>
class AlgoChecker { class AlgoChecker {
std::string m_name;
typename Opr::Algorithm* m_algo = nullptr;
bool* m_require_algo;

public: public:
AlgoChecker(const char* name, bool* require_algo = nullptr)
: m_name{name}, m_require_algo{require_algo} {}


AlgoChecker(typename Opr::Algorithm* algo, bool* require_algo = nullptr)
: m_algo{algo}, m_require_algo{require_algo} {}
AlgoChecker(ExecutionPolicyAlgoName name, bool* require_algo = nullptr)
: m_policy_name{name}, m_require_algo{require_algo} {}

AlgoChecker(ExecutionPolicy policy, bool* require_algo = nullptr)
: m_policy{policy}, m_require_algo{require_algo} {}

static ExecutionPolicy construct_execution_policy_from_name(
const ExecutionPolicyAlgoName& policy_name,
const TensorLayoutArray& layouts, const std::string& param,
Handle* handle) {
ExecutionPolicy ret;
megdnn_assert(layouts.size() == OprTrait<Opr>::arity);
auto opr = handle->create_operator<Opr>();
opr->param() =
Algorithm::deserialize_read_pod<typename Opr::Param>(param);
for (auto algo_info :
AlgoProxy<Opr, OprTrait<Opr>::arity>::get_all_algorithms_info(
opr.get(), layouts)) {
if (std::regex_match(
algo_info.name,
std::regex("(" + policy_name.name + ")(.*)"))) {
ret.algo = algo_info.desc;
} else {
continue;
}

Algorithm* algo = opr->get_algorithm_from_desc(algo_info.desc);
std::vector<Algorithm::SearchItem>&& sub_items =
algo->get_subopr_list(layouts, opr.get());
FOREACH_OPR_TYPE_DISPATCH(sub_items, {
ExecutionPolicy policy =
AlgoChecker<_Opr>::construct_execution_policy_from_name(
policy_name.sub_policy_names[_item_idx],
_item.layouts, _item.param, handle);
ret.sub_policy.push_back(policy);
});
return ret;
}
return ret;
}


void operator()(Opr* opr, const CheckerHelper::TensorValueArray& arr) { void operator()(Opr* opr, const CheckerHelper::TensorValueArray& arr) {
TensorLayoutArray layouts; TensorLayoutArray layouts;
for (auto&& val : arr) { for (auto&& val : arr) {
layouts.push_back(val.layout); layouts.push_back(val.layout);
} }
if (!m_policy_name.name.empty()) {
std::string param_str;
Algorithm::serialize_write_pod(opr->param(), param_str);
m_policy = construct_execution_policy_from_name(
m_policy_name, layouts, param_str, opr->handle());
ASSERT_TRUE(m_policy.algo.valid())
<< "algorithm " << m_policy_name.name << " not found";
}
if (m_require_algo && *m_require_algo) { if (m_require_algo && *m_require_algo) {
auto algo = auto algo =
OprAlgoProxy::get_algorithm_info_heuristic(opr, layouts); OprAlgoProxy::get_algorithm_info_heuristic(opr, layouts);
if (m_name.empty()) {
ASSERT_EQ(m_algo->name(), algo.name.c_str());
} else {
ASSERT_TRUE(std::regex_match(
algo.name.c_str(), std::regex("(" + m_name + ")(.*)")));
}
ASSERT_STREQ(opr->get_algorithm_from_desc(m_policy.algo)->name(),
algo.name.c_str());
} else { } else {
if (m_name.empty()) {
opr->execution_policy().algo = m_algo->info();
return;
} else {
for (auto i :
OprAlgoProxy::get_all_algorithms_info(opr, layouts)) {
if (std::regex_match(i.name,
std::regex("(" + m_name + ")(.*)"))) {
opr->execution_policy().algo = i;
return;
}
}
}
ASSERT_TRUE(false) << "algorithm " << m_name << " not found";
opr->execution_policy() = m_policy;
} }
} }

private:
ExecutionPolicyAlgoName m_policy_name;
ExecutionPolicy m_policy;
bool* m_require_algo;
}; };


} // namespace test } // namespace test


+ 3
- 3
dnn/test/common/convolution.cpp View File

@@ -580,7 +580,7 @@ void convolution::test_conv_config_combinations(int k_size,
checker.set_rng(0, &rng).set_rng(1, &rng); checker.set_rng(0, &rng).set_rng(1, &rng);
for (auto algo : opr->get_all_algorithms_info(ily, fly, oly)) { for (auto algo : opr->get_all_algorithms_info(ily, fly, oly)) {
used_algos.insert(algo.desc); used_algos.insert(algo.desc);
opr->execution_policy().algo = algo;
opr->execution_policy().algo = algo.desc;
checker checker
.set_epsilon(eps_getter(dtype == 1, 0, algo.name.c_str())) .set_epsilon(eps_getter(dtype == 1, 0, algo.name.c_str()))
.execs({ishp, fshp, {}}); .execs({ishp, fshp, {}});
@@ -599,7 +599,7 @@ void convolution::test_conv_config_combinations(int k_size,
opr->param() = param; opr->param() = param;
for (auto algo: opr->get_all_algorithms_info(fly, oly, ily)) { for (auto algo: opr->get_all_algorithms_info(fly, oly, ily)) {
used_algos_bwd_data.insert(algo.desc); used_algos_bwd_data.insert(algo.desc);
opr->execution_policy().algo = algo;
opr->execution_policy().algo = algo.desc;
checker_bwd_data checker_bwd_data
.set_epsilon(eps_getter(dtype == 1, 1, algo.name.c_str())) .set_epsilon(eps_getter(dtype == 1, 1, algo.name.c_str()))
.execl({fly, oly, ily}); .execl({fly, oly, ily});
@@ -620,7 +620,7 @@ void convolution::test_conv_config_combinations(int k_size,
opr->param() = param; opr->param() = param;
for (auto algo: opr->get_all_algorithms_info(ily, oly, fly)) { for (auto algo: opr->get_all_algorithms_info(ily, oly, fly)) {
used_algos_bwd_flt.insert(algo.desc); used_algos_bwd_flt.insert(algo.desc);
opr->execution_policy().algo = algo;
opr->execution_policy().algo = algo.desc;
checker_bwd_filter checker_bwd_filter
.set_epsilon(eps_getter(dtype == 1, 2, algo.name.c_str())) .set_epsilon(eps_getter(dtype == 1, 2, algo.name.c_str()))
.execl({ily, oly, fly}); .execl({ily, oly, fly});


+ 47
- 0
dnn/test/common/fast_run_cache.cpp View File

@@ -0,0 +1,47 @@
/**
* \file dnn/test/common/fast_run_cache.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/

#include "test/common/fast_run_cache.h"
#include "src/common/utils.h"

using namespace megdnn;
using namespace test;

FastRunCache::SearchItemStorage::SearchItemStorage(
const Algorithm::SearchItem& item) {
Algorithm::serialize_write_pod(item.opr_type, data_hold);
for (auto&& layout : item.layouts) {
data_hold += layout.serialize();
}
data_hold += item.param;
}

Algorithm::Info::Desc FastRunCache::get(const Algorithm::SearchItem& key) {
SearchItemStorage key_storage(key);
key_storage.init_hash();

auto iter = m_cache.find(key_storage);
if (iter == m_cache.end()) {
return {};
}
return iter->second;
}

void FastRunCache::put(const Algorithm::SearchItem& key,
const Algorithm::Info::Desc& val) {
SearchItemStorage key_storage(key);
key_storage.init_hash();
megdnn_assert(m_cache.find(key_storage) == m_cache.end());
m_cache[std::move(key_storage)] = val;
}

// vim: syntax=cpp.doxygen

+ 58
- 0
dnn/test/common/fast_run_cache.h View File

@@ -0,0 +1,58 @@
/**
* \file dnn/test/common/fast_run_cache.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once

#include "megdnn/oprs.h"
#include "src/common/hash_ct.h"

#include <unordered_map>

namespace megdnn {
namespace test {
class FastRunCache {
struct SearchItemStorage {
std::string data_hold;
size_t hash = 0;

SearchItemStorage(const Algorithm::SearchItem& item);

SearchItemStorage& init_hash() {
hash = XXHash64CT::hash(data_hold.data(), data_hold.size(),
20201225);
return *this;
}

bool operator==(const SearchItemStorage& rhs) const {
return data_hold == rhs.data_hold;
}

struct Hash {
size_t operator()(const SearchItemStorage& s) const {
return s.hash;
}
};
};

std::unordered_map<SearchItemStorage, Algorithm::Info::Desc,
SearchItemStorage::Hash>
m_cache;

public:
Algorithm::Info::Desc get(const Algorithm::SearchItem& key);
void put(const Algorithm::SearchItem& key,
const Algorithm::Info::Desc& val);
};

} // namespace test
} // namespace megdnn

// vim: syntax=cpp.doxygen

+ 298
- 67
dnn/test/common/opr_proxy.h View File

@@ -13,6 +13,7 @@


#include "test/common/deduce_layout_proxy.h" #include "test/common/deduce_layout_proxy.h"
#include "test/common/exec_proxy.h" #include "test/common/exec_proxy.h"
#include "test/common/fast_run_cache.h"
#include "test/common/inspect_type.h" #include "test/common/inspect_type.h"
#include "test/common/opr_algo_proxy.h" #include "test/common/opr_algo_proxy.h"
#include "test/common/opr_trait.h" #include "test/common/opr_trait.h"
@@ -20,11 +21,104 @@
#include "test/common/workspace_wrapper.h" #include "test/common/workspace_wrapper.h"


#include <algorithm> #include <algorithm>
#include <limits>
#include <memory> #include <memory>
#include <unordered_map>


namespace megdnn { namespace megdnn {
namespace test { namespace test {


template <Algorithm::OprType>
struct OprFromOprTypeTrait;

template <typename Opr>
struct OprTypeFromOprTrait;

#define cb(_opr_type, _opr) \
template <> \
struct OprFromOprTypeTrait<Algorithm::OprType::_opr_type> { \
using Opr = megdnn::_opr; \
}; \
template <> \
struct OprTypeFromOprTrait<megdnn::_opr> { \
constexpr static Algorithm::OprType opr_type = \
Algorithm::OprType::_opr_type; \
}

cb(MATRIX_MUL_FORWARD, MatrixMulForward);
cb(CONVOLUTION_FORWARD, ConvolutionForward);
cb(CONVOLUTION_BACKWARD_DATA, ConvolutionBackwardData);
cb(CONVOLUTION_BACKWARD_FILTER, ConvolutionBackwardFilter);
cb(CONVOLUTION3D_FORWARD, Convolution3DForward);
cb(CONVOLUTION3D_BACKWARD_DATA, Convolution3DBackwardData);
cb(CONVOLUTION3D_BACKWARD_FILTER, Convolution3DBackwardFilter);
cb(LOCAL_SHARE_FORWARD, LocalShareForward);
cb(LOCAL_SHARE_BACKWARD_DATA, LocalShareBackwardData);
cb(LOCAL_SHARE_BACKWARD_FILTER, LocalShareBackwardFilter);
cb(DEFORMABLE_CONV_FORWARD, DeformableConvForward);
cb(DEFORMABLE_CONV_BACKWARD_DATA, DeformableConvBackwardData);
cb(DEFORMABLE_CONV_BACKWARD_FILTER, DeformableConvBackwardFilter);
cb(BATCH_CONV_FORWARD, BatchConvBiasForward);
cb(CONVBIAS_FORWARD, ConvBiasForward);

#undef cb

// clang-format off
#define FOREACH_OPR_TYPE(cb) \
cb(MATRIX_MUL_FORWARD) \
cb(CONVOLUTION_FORWARD) \
cb(CONVOLUTION_BACKWARD_DATA) \
cb(CONVOLUTION_BACKWARD_FILTER) \
cb(CONVOLUTION3D_FORWARD) \
cb(CONVOLUTION3D_BACKWARD_DATA) \
cb(CONVOLUTION3D_BACKWARD_FILTER) \
cb(LOCAL_SHARE_FORWARD) \
cb(LOCAL_SHARE_BACKWARD_DATA) \
cb(LOCAL_SHARE_BACKWARD_FILTER) \
cb(DEFORMABLE_CONV_FORWARD) \
cb(DEFORMABLE_CONV_BACKWARD_DATA) \
cb(DEFORMABLE_CONV_BACKWARD_FILTER) \
cb(BATCH_CONV_FORWARD) \
cb(CONVBIAS_FORWARD)

#define FOREACH_OPR_TYPE_WITH_STMT(cb, stmt) \
cb(MATRIX_MUL_FORWARD, stmt) \
cb(CONVOLUTION_FORWARD, stmt) \
cb(CONVOLUTION_BACKWARD_DATA, stmt) \
cb(CONVOLUTION_BACKWARD_FILTER, stmt) \
cb(CONVOLUTION3D_FORWARD, stmt) \
cb(CONVOLUTION3D_BACKWARD_DATA, stmt) \
cb(CONVOLUTION3D_BACKWARD_FILTER, stmt) \
cb(LOCAL_SHARE_FORWARD, stmt) \
cb(LOCAL_SHARE_BACKWARD_DATA, stmt) \
cb(LOCAL_SHARE_BACKWARD_FILTER, stmt) \
cb(DEFORMABLE_CONV_FORWARD, stmt) \
cb(DEFORMABLE_CONV_BACKWARD_DATA, stmt) \
cb(DEFORMABLE_CONV_BACKWARD_FILTER, stmt) \
cb(BATCH_CONV_FORWARD, stmt) \
cb(CONVBIAS_FORWARD, stmt)

// clang-format on

#define _OPR_TYPE_CASE(_opr_type, _stmt) \
case Algorithm::OprType::_opr_type: { \
using _Opr = typename OprFromOprTypeTrait< \
Algorithm::OprType::_opr_type>::Opr; \
_stmt; \
break; \
}

#define FOREACH_OPR_TYPE_DISPATCH(_search_items, _stmt) \
for (size_t _item_idx = 0; _item_idx < _search_items.size(); \
_item_idx++) { \
auto&& _item = _search_items[_item_idx]; \
switch (_item.opr_type) { \
FOREACH_OPR_TYPE_WITH_STMT(_OPR_TYPE_CASE, _stmt) \
default: \
megdnn_throw("unknown opr_type"); \
} \
}

template <typename Opr, size_t arity = OprTrait<Opr>::arity, template <typename Opr, size_t arity = OprTrait<Opr>::arity,
bool has_workspace = OprTrait<Opr>::has_workspace, bool has_workspace = OprTrait<Opr>::has_workspace,
bool can_deduce_layout = OprTrait<Opr>::can_deduce_layout> bool can_deduce_layout = OprTrait<Opr>::can_deduce_layout>
@@ -130,10 +224,11 @@ struct OprProxy<SplitForward> : DeduceLayoutProxy<SplitForward, 0, false> {
}; };


//! OprProxy impl for ternary oprs with profiling support //! OprProxy impl for ternary oprs with profiling support
template <class Opr, int arity>
template <class Opr>
struct OprProxyProfilingBase struct OprProxyProfilingBase
: public DeduceLayoutProxy<Opr, arity,
: public DeduceLayoutProxy<Opr, OprTrait<Opr>::arity,
OprTrait<Opr>::can_deduce_layout> { OprTrait<Opr>::can_deduce_layout> {
static constexpr int arity = OprTrait<Opr>::arity;
size_t warmup_times = 10, exec_times = 100; size_t warmup_times = 10, exec_times = 100;


//! whether to enable profiling //! whether to enable profiling
@@ -142,7 +237,7 @@ struct OprProxyProfilingBase


//! target algo setup by profiler; it can also be directly specified by the //! target algo setup by profiler; it can also be directly specified by the
//! caller //! caller
typename Opr::AlgorithmInfo target_algo_info;
ExecutionPolicy target_execution_policy;


OprProxyProfilingBase(bool profile = false) { m_profiling = profile; } OprProxyProfilingBase(bool profile = false) { m_profiling = profile; }


@@ -168,6 +263,154 @@ struct OprProxyProfilingBase
return ret; return ret;
} }


/**
* flatten search space in postorder traversal
* The subopr search construct a search tree
*
*                A
*            /   |   \
*          B1    B2    C
*       /  |  |  \
*     D1  D2  D3   E
* We use postorder traverse the search tree.
* D1 -> D2 -> D3 -> E -> B1 -> B2 -> C -> A
*/
static std::vector<Algorithm::SearchItem> flatten_search_space(
const TensorLayoutArray layouts, const std::string& param,
Handle* handle) {
megdnn_assert(layouts.size() == arity);
auto opr = handle->create_operator<Opr>();
opr->param() =
Algorithm::deserialize_read_pod<typename Opr::Param>(param);

std::vector<Algorithm::SearchItem> ret;
for (auto algo_info : AlgoProxy<Opr, arity>::get_all_algorithms_info(
opr.get(), layouts)) {
Algorithm* algo = opr->get_algorithm_from_desc(algo_info.desc);
std::vector<Algorithm::SearchItem>&& sub_items =
algo->get_subopr_list(layouts, opr.get());

FOREACH_OPR_TYPE_DISPATCH(sub_items, {
auto space = OprProxyProfilingBase<_Opr>::flatten_search_space(
_item.layouts, _item.param, handle);
ret.insert(ret.end(), space.begin(), space.end());
});
}
ret.push_back({OprTypeFromOprTrait<Opr>::opr_type, param, layouts});
return ret;
}

static void construct_execution_policy(
const TensorLayoutArray& layouts, const std::string& param,
Handle* handle, FastRunCache& cache,
ExecutionPolicy& policy) {
megdnn_assert(layouts.size() == arity);
auto opr = handle->create_operator<Opr>();
opr->param() =
Algorithm::deserialize_read_pod<typename Opr::Param>(param);
if (!policy.algo.valid()) {
policy.algo = cache.get(Algorithm::SearchItem{
OprTypeFromOprTrait<Opr>::opr_type, param, layouts});
megdnn_assert(policy.algo.valid(),
"No cache found, maybe some error occured in "
"flatten_search_space or get_subopr_list");
}
policy.sub_policy.clear();
Algorithm* algo = opr->get_algorithm_from_desc(policy.algo);
std::vector<Algorithm::SearchItem>&& sub_items =
algo->get_subopr_list(layouts, opr.get());
FOREACH_OPR_TYPE_DISPATCH(sub_items, {
policy.sub_policy.push_back({});
OprProxyProfilingBase<_Opr>::construct_execution_policy(
_item.layouts, _item.param, handle, cache,
policy.sub_policy.back());
});
return;
}

/**
* \brief search and get the best execution_policy
*/
static void search(const TensorLayoutArray& layouts,
const std::string& param,
WorkspaceWrapper& workspace_wrapper, Handle* handle,
size_t warmup_times, size_t exec_times,
FastRunCache& cache) {
megdnn_assert(layouts.size() == arity);
auto opr = handle->create_operator<Opr>();
opr->param() =
Algorithm::deserialize_read_pod<typename Opr::Param>(param);
SmallVector<size_t> sizes_in_bytes;
for (const auto& layout : layouts) {
sizes_in_bytes.push_back(layout.span().dist_byte());
}

float min_time = std::numeric_limits<float>::max();
Algorithm::Info::Desc best_algo;

std::string log_info = "Profiling start: ";
for (auto&& layout : layouts) {
log_info += layout.to_string() + " ";
}
megdnn_log("%s", log_info.c_str());
best_algo = cache.get(Algorithm::SearchItem{
OprTypeFromOprTrait<Opr>::opr_type, param, layouts});

if (best_algo.valid()) {
auto&& algo = opr->get_algorithm_from_desc(best_algo);
MEGDNN_MARK_USED_VAR(algo);
megdnn_log("Find best algo %s in cache", algo->name());
return;
}
for (auto algo : AlgoProxy<Opr, arity>::get_all_algorithms_info(
opr.get(), layouts)) {
//! construct execution_policy
opr->execution_policy().algo = algo.desc;
construct_execution_policy(layouts, param, handle, cache,
opr->execution_policy());

auto workspace_size = AlgoProxy<Opr, arity>::get_workspace_in_bytes(
opr.get(), layouts);
sizes_in_bytes.push_back(workspace_size);

WorkspaceBundle wb(nullptr, sizes_in_bytes);
workspace_wrapper.update(wb.total_size_in_bytes());
wb.set(workspace_wrapper.workspace().raw_ptr);
TensorNDArray tensors;
for (size_t i = 0; i < arity; i++) {
tensors.push_back({wb.get(i), layouts[i]});
}

for (size_t times = 0; times < warmup_times; ++times) {
AlgoProxy<Opr, arity>::exec(opr.get(), tensors,
wb.get_workspace(arity));
}
megcoreSynchronize(opr->handle()->megcore_computing_handle());
Timer timer;
timer.start();
for (size_t times = 0; times < exec_times; ++times) {
AlgoProxy<Opr, arity>::exec(opr.get(), tensors,
wb.get_workspace(arity));
}
megcoreSynchronize(opr->handle()->megcore_computing_handle());
timer.stop();
megdnn_log("%.3fms %s", timer.get_time_in_us() / 1e3,
algo.name.c_str());
if (min_time > timer.get_time_in_us()) {
min_time = timer.get_time_in_us();
best_algo = algo.desc;
}

sizes_in_bytes.pop_back();
}
auto&& algo = opr->get_algorithm_from_desc(best_algo);
MEGDNN_MARK_USED_VAR(algo);
megdnn_log("Profiling end, got best algo: %s", algo->name());
cache.put(Algorithm::SearchItem{OprTypeFromOprTrait<Opr>::opr_type,
param, layouts},
best_algo);
}

void exec(Opr* opr, const TensorNDArray& tensors) { void exec(Opr* opr, const TensorNDArray& tensors) {
megdnn_assert(tensors.size() == arity); megdnn_assert(tensors.size() == arity);
if (!W.valid()) { if (!W.valid()) {
@@ -177,39 +420,26 @@ struct OprProxyProfilingBase
for (auto&& tensor : tensors) { for (auto&& tensor : tensors) {
layouts.push_back(tensor.layout); layouts.push_back(tensor.layout);
} }
if (m_profiling && !target_algo_info.valid()) {
size_t min_time = std::numeric_limits<size_t>::max();
for (auto algo :
AlgoProxy<Opr, arity>::get_all_algorithms_info(opr, layouts)) {
opr->execution_policy().algo = algo;
auto workspace_size =
AlgoProxy<Opr, arity>::get_workspace_in_bytes(opr,
layouts);
W.update(workspace_size);

for (size_t times = 0; times < warmup_times; ++times)
AlgoProxy<Opr, arity>::exec(opr, tensors, W.workspace());
megcoreSynchronize(opr->handle()->megcore_computing_handle());
Timer timer;
timer.start();
for (size_t times = 0; times < exec_times; ++times) {
AlgoProxy<Opr, arity>::exec(opr, tensors, W.workspace());
}
megcoreSynchronize(opr->handle()->megcore_computing_handle());
timer.stop();
printf("%.3fms %s\n", timer.get_time_in_us() / 1e3,
algo.name.c_str());
if (min_time > timer.get_time_in_us()) {
min_time = timer.get_time_in_us();
target_algo_info = algo;
}
}
opr->execution_policy().algo = target_algo_info;
if (m_profiling && !target_execution_policy.algo.valid()) {
FastRunCache cache;
std::string param_str;
Algorithm::serialize_write_pod(opr->param(), param_str);
auto&& search_items =
flatten_search_space(layouts, param_str, opr->handle());
FOREACH_OPR_TYPE_DISPATCH(search_items, {
OprProxyProfilingBase<_Opr>::search(_item.layouts, param_str, W,
opr->handle(), warmup_times,
exec_times, cache);
});

construct_execution_policy(layouts, param_str, opr->handle(), cache,
opr->execution_policy());
target_execution_policy = opr->execution_policy();
auto workspace_size = auto workspace_size =
AlgoProxy<Opr, arity>::get_workspace_in_bytes(opr, layouts); AlgoProxy<Opr, arity>::get_workspace_in_bytes(opr, layouts);
W.update(workspace_size); W.update(workspace_size);
} }
if (!target_algo_info.valid()) {
if (!target_execution_policy.algo.valid()) {
auto workspace_size = auto workspace_size =
AlgoProxy<Opr, arity>::get_workspace_in_bytes(opr, layouts); AlgoProxy<Opr, arity>::get_workspace_in_bytes(opr, layouts);
W.update(workspace_size); W.update(workspace_size);
@@ -218,30 +448,32 @@ struct OprProxyProfilingBase
} }
}; };


#define DEF_PROF(c, arity) \
template <> \
struct OprProxy<c> : public OprProxyProfilingBase<c, arity> { \
using OprProxyProfilingBase<c, arity>::OprProxyProfilingBase; \
#define DEF_PROF(c) \
template <> \
struct OprProxy<c> : public OprProxyProfilingBase<c> { \
using OprProxyProfilingBase<c>::OprProxyProfilingBase; \
} }


DEF_PROF(ConvolutionForward, 3);
DEF_PROF(ConvolutionBackwardData, 3);
DEF_PROF(ConvolutionBackwardFilter, 3);
DEF_PROF(LocalShareForward, 3);
DEF_PROF(LocalShareBackwardData, 3);
DEF_PROF(LocalShareBackwardFilter, 3);
DEF_PROF(MatrixMulForward);
DEF_PROF(ConvolutionForward);
DEF_PROF(ConvolutionBackwardData);
DEF_PROF(ConvolutionBackwardFilter);
DEF_PROF(LocalShareForward);
DEF_PROF(LocalShareBackwardData);
DEF_PROF(LocalShareBackwardFilter);


DEF_PROF(DeformableConvForward, 5);
DEF_PROF(DeformableConvBackwardFilter, 5);
DEF_PROF(BatchConvBiasForward, 5);
DEF_PROF(ConvBiasForward, 5);
DEF_PROF(DeformableConvForward);
DEF_PROF(DeformableConvBackwardFilter);
DEF_PROF(BatchConvBiasForward);
DEF_PROF(ConvBiasForward);


DEF_PROF(DeformableConvBackwardData, 8);
DEF_PROF(DeformableConvBackwardData);
#undef DEF_PROF #undef DEF_PROF


template <class Opr, int arity>
struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr, arity> {
using Base = OprProxyProfilingBase<Opr, arity>;
template <class Opr>
struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr> {
using Base = OprProxyProfilingBase<Opr>;
static constexpr int arity = OprTrait<Opr>::arity;
void exec(Opr* opr, const TensorNDArray& tensors) { void exec(Opr* opr, const TensorNDArray& tensors) {
megdnn_assert(tensors.size() == arity); megdnn_assert(tensors.size() == arity);
if (!Base::W.valid()) { if (!Base::W.valid()) {
@@ -252,11 +484,11 @@ struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr, arity> {
for (auto&& tensor : tensors) { for (auto&& tensor : tensors) {
layouts.push_back(tensor.layout); layouts.push_back(tensor.layout);
} }
if (Base::m_profiling && !Base::target_algo_info.desc.valid()) {
if (Base::m_profiling && !Base::target_execution_policy.algo.valid()) {
size_t min_time = std::numeric_limits<size_t>::max(); size_t min_time = std::numeric_limits<size_t>::max();
for (auto algo : for (auto algo :
AlgoProxy<Opr, arity>::get_all_algorithms_info(opr, layouts)) { AlgoProxy<Opr, arity>::get_all_algorithms_info(opr, layouts)) {
opr->execution_policy().algo = algo;
opr->execution_policy().algo = algo.desc;


auto preprocess_tensors = auto preprocess_tensors =
weight_prerocess(opr, tensors, algo.desc); weight_prerocess(opr, tensors, algo.desc);
@@ -288,12 +520,12 @@ struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr, arity> {
algo.name.c_str()); algo.name.c_str());
if (min_time > timer.get_time_in_us()) { if (min_time > timer.get_time_in_us()) {
min_time = timer.get_time_in_us(); min_time = timer.get_time_in_us();
Base::target_algo_info = algo;
Base::target_execution_policy.algo = algo.desc;
} }
} }
opr->execution_policy().algo = Base::target_algo_info;
auto preprocess_tensors =
weight_prerocess(opr, tensors, Base::target_algo_info.desc);
opr->execution_policy() = Base::target_execution_policy;
auto preprocess_tensors = weight_prerocess(
opr, tensors, Base::target_execution_policy.algo);
megcoreSynchronize(opr->handle()->megcore_computing_handle()); megcoreSynchronize(opr->handle()->megcore_computing_handle());
typename Opr::PreprocessedFilter preprocessed_filter{ typename Opr::PreprocessedFilter preprocessed_filter{
nullptr, *preprocess_tensors}; nullptr, *preprocess_tensors};
@@ -301,12 +533,12 @@ struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr, arity> {
opr, layouts, &preprocessed_filter); opr, layouts, &preprocessed_filter);
Base::W.update(workspace_size); Base::W.update(workspace_size);
} }
auto preprocess_tensors =
weight_prerocess(opr, tensors, Base::target_algo_info.desc);
auto preprocess_tensors = weight_prerocess(
opr, tensors, Base::target_execution_policy.algo);
megcoreSynchronize(opr->handle()->megcore_computing_handle()); megcoreSynchronize(opr->handle()->megcore_computing_handle());
typename Opr::PreprocessedFilter preprocessed_filter{ typename Opr::PreprocessedFilter preprocessed_filter{
nullptr, *preprocess_tensors}; nullptr, *preprocess_tensors};
if (!Base::target_algo_info.valid()) {
if (!Base::target_execution_policy.algo.valid()) {
auto workspace_size = AlgoProxy<Opr, arity>::get_workspace_in_bytes( auto workspace_size = AlgoProxy<Opr, arity>::get_workspace_in_bytes(
opr, layouts, &preprocessed_filter); opr, layouts, &preprocessed_filter);
Base::W.update(workspace_size); Base::W.update(workspace_size);
@@ -342,16 +574,15 @@ struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr, arity> {
} }
}; };


#define DEF_PROF(c, arity) \
template <> \
struct OprWeightPreprocessProxy<c> \
: public OprWeightPreprocessProxyImpl<c, arity> { \
using OprWeightPreprocessProxyImpl< \
c, arity>::OprWeightPreprocessProxyImpl; \
#define DEF_PROF(c) \
template <> \
struct OprWeightPreprocessProxy<c> \
: public OprWeightPreprocessProxyImpl<c> { \
using OprWeightPreprocessProxyImpl<c>::OprWeightPreprocessProxyImpl; \
} }


DEF_PROF(ConvolutionForward, 3);
DEF_PROF(ConvBias, 5);
DEF_PROF(ConvolutionForward);
DEF_PROF(ConvBias);
#undef DEF_PROF #undef DEF_PROF


} // namespace test } // namespace test


+ 1
- 1
dnn/test/cuda/batch_conv_bias.cpp View File

@@ -279,7 +279,7 @@ void benchmark_target_algo(Handle* handle, const std::vector<BenchArgs>& args,


benchmarker.set_param(bparam); benchmarker.set_param(bparam);
if (!algo) { if (!algo) {
benchmarker.proxy()->target_algo_info.reset();
benchmarker.proxy()->target_execution_policy.algo.reset();
} }
auto time_in_ms = auto time_in_ms =
benchmarker.execs( benchmarker.execs(


+ 6
- 6
dnn/test/cuda/chanwise_convolution.cpp View File

@@ -514,7 +514,7 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_FWD) {


auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH,
size_t FW) { size_t FW) {
checker.proxy()->target_algo_info.reset();
checker.proxy()->target_execution_policy.algo.reset();
checker.execs({{N, C, IH, IW}, {C, 1, 1, FH, FW}, {}}); checker.execs({{N, C, IH, IW}, {C, 1, 1, FH, FW}, {}});
}; };


@@ -538,7 +538,7 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_BWD_DATA) {


auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH,
size_t FW) { size_t FW) {
checker.proxy()->target_algo_info.reset();
checker.proxy()->target_execution_policy.algo.reset();
checker.execs({{C, 1, 1, FH, FW}, checker.execs({{C, 1, 1, FH, FW},
{N, C, IH - FH + 1, IW - FW + 1}, {N, C, IH - FH + 1, IW - FW + 1},
{N, C, IH, IW}}); {N, C, IH, IW}});
@@ -564,7 +564,7 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_BWD_FILTER) {


auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH,
size_t FW) { size_t FW) {
checker.proxy()->target_algo_info.reset();
checker.proxy()->target_execution_policy.algo.reset();
checker.execs({{N, C, IH, IW}, checker.execs({{N, C, IH, IW},
{N, C, IH - FH + 1, IW - FW + 1}, {N, C, IH - FH + 1, IW - FW + 1},
{C, 1, 1, FH, FW}}); {C, 1, 1, FH, FW}});
@@ -614,7 +614,7 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_ALL_ALGO_FORWARD) {
.set_dtype(2, dtype::Float32()) .set_dtype(2, dtype::Float32())
.set_rng(0, &rng) .set_rng(0, &rng)
.set_rng(1, &rng); .set_rng(1, &rng);
bencher.proxy()->target_algo_info.reset();
bencher.proxy()->target_execution_policy.algo.reset();
auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS; auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS;


bencher.set_param(param) bencher.set_param(param)
@@ -623,10 +623,10 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_ALL_ALGO_FORWARD) {
.set_dtype(2, dtype::Float16()) .set_dtype(2, dtype::Float16())
.set_rng(0, &rng) .set_rng(0, &rng)
.set_rng(1, &rng); .set_rng(1, &rng);
bencher.proxy()->target_algo_info.reset();
bencher.proxy()->target_execution_policy.algo.reset();
auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS; auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS;


bencher.proxy()->target_algo_info.reset();
bencher.proxy()->target_execution_policy.algo.reset();
param.compute_mode = param::Convolution::ComputeMode::FLOAT32; param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
bencher.set_param(param); bencher.set_param(param);
auto time_in_ms_pseudo_fp16 = bencher.execs({src, filter, {}}) / RUNS; auto time_in_ms_pseudo_fp16 = bencher.execs({src, filter, {}}) / RUNS;


+ 2
- 2
dnn/test/cuda/conv_bias_int8.cpp View File

@@ -168,7 +168,7 @@ void benchmark_target_algo(


benchmarker.set_param(param); benchmarker.set_param(param);
if (!algo) { if (!algo) {
benchmarker.proxy()->target_algo_info.reset();
benchmarker.proxy()->target_execution_policy.algo.reset();
} }
TensorShape src{arg.n, arg.ci, arg.hi, arg.wi}, TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1}, filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1},
@@ -327,7 +327,7 @@ void benchmark_target_algo_with_cudnn_tsc(


benchmarker.set_param(param); benchmarker.set_param(param);
if (!algo) { if (!algo) {
benchmarker.proxy()->target_algo_info.reset();
benchmarker.proxy()->target_execution_policy.algo.reset();
} }
TensorShape src{arg.n, arg.ci, arg.hi, arg.wi}, TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1}, filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1},


+ 95
- 30
dnn/test/cuda/convolution.cpp View File

@@ -8,6 +8,7 @@
* software distributed under the License is distributed on an * software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/ */
#include "megdnn/dtype.h"
#include "megdnn/oprs.h" #include "megdnn/oprs.h"
#include "megdnn/opr_param_defs.h" #include "megdnn/opr_param_defs.h"
#include "test/cuda/fixture.h" #include "test/cuda/fixture.h"
@@ -223,14 +224,19 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA)
.set_epsilon(1e-1) .set_epsilon(1e-1)
.set_param(arg.param) .set_param(arg.param)
.exec(TensorLayoutArray{filter, dst, src}); .exec(TensorLayoutArray{filter, dst, src});
src.dtype = dst.dtype = filter.dtype = dtype::BFloat16();
checker.
set_rng(0, &rng).
set_rng(1, &rng).
set_epsilon(1e-1).
set_param(arg.param).
exec(TensorLayoutArray{filter, dst, src});
} }
checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
ExecutionPolicyAlgoName{"CONVOLUTION_BACKWARD_DATD_BFLOAT16",
{{"MATMUL", {}}}}));
src.dtype = dst.dtype = filter.dtype = dtype::BFloat16();
arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
checker.set_rng(0, &rng)
.set_rng(1, &rng)
.set_epsilon(1e-1)
.set_param(arg.param)
.exec(TensorLayoutArray{filter, dst, src});
checker.reset_before_exec_callback();
checker.opr()->execution_policy() = {};
} }
} }


@@ -382,32 +388,35 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_1) {


#if MEGDNN_WITH_BENCHMARK #if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, CONV_FWD_BENCHMARK) { TEST_F(CUDA, CONV_FWD_BENCHMARK) {
auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t SH=1,
size_t SW=1, size_t FH=1, size_t FW=1, size_t PH=0, size_t PW=0, bool fp16io_c32=false) {
auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW,
size_t SH = 1, size_t SW = 1, size_t FH = 1, size_t FW = 1,
size_t PH = 0, size_t PW = 0, bool fp16io_c32 = false) {
auto benchmarker = Benchmarker<ConvolutionForward>(handle_cuda()); auto benchmarker = Benchmarker<ConvolutionForward>(handle_cuda());
benchmarker.set_dtype(0, dtype::Float16()) benchmarker.set_dtype(0, dtype::Float16())
.set_dtype(1, dtype::Float16())
.set_dtype(2, dtype::Float16());
.set_dtype(1, dtype::Float16())
.set_dtype(2, dtype::Float16());
ConvolutionForward::Param param; ConvolutionForward::Param param;
param.stride_h = SH; param.stride_h = SH;
param.stride_w = SW; param.stride_w = SW;
param.pad_h = PH; param.pad_h = PH;
param.pad_w = PW; param.pad_w = PW;
if (fp16io_c32) { if (fp16io_c32) {
param.compute_mode = ConvolutionForward::Param::ComputeMode::FLOAT32;
param.compute_mode =
ConvolutionForward::Param::ComputeMode::FLOAT32;
} }
benchmarker.set_param(param); benchmarker.set_param(param);
std::unique_ptr<OprProxy<ConvolutionForward>> proxy{new OprProxy<ConvolutionForward>{true}};
std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
new OprProxy<ConvolutionForward>{true}};
benchmarker.set_proxy(proxy); benchmarker.set_proxy(proxy);
size_t OH = (IH - FH + 2 * PH) / SH + 1; size_t OH = (IH - FH + 2 * PH) / SH + 1;
size_t OW = (IW - FW + 2 * PW) / SW + 1; size_t OW = (IW - FW + 2 * PW) / SW + 1;
auto time = benchmarker.execs({
{N, IC, IH, IW}, {OC, IC, FH, FW}, {N, OC, OH, OW}});
auto time = benchmarker.execs(
{{N, IC, IH, IW}, {OC, IC, FH, FW}, {N, OC, OH, OW}});
time /= 1000.0 * 10.0; time /= 1000.0 * 10.0;
auto flo = (double) N * OC * IC * OH * OW * FH * FW * 2;
auto flo = (double)N * OC * IC * OH * OW * FH * FW * 2;
auto flops = flo / time / 1e12; auto flops = flo / time / 1e12;
printf("comp_type %s: ", fp16io_c32 ? "32" : "16"); printf("comp_type %s: ", fp16io_c32 ? "32" : "16");
printf("%.3fG FLO, flops %.3fTFLOPS\n", flo/1e9, flops);
printf("%.3fG FLO, flops %.3fTFLOPS\n", flo / 1e9, flops);
}; };
run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, false); run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, false);
run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, true); run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, true);
@@ -415,7 +424,8 @@ TEST_F(CUDA, CONV_FWD_BENCHMARK) {


TEST_F(CUDA, CONVOLUTION_FWD_BENCHMARK) { TEST_F(CUDA, CONVOLUTION_FWD_BENCHMARK) {
CUBenchmarker<ConvolutionForward> bench{handle_cuda()}; CUBenchmarker<ConvolutionForward> bench{handle_cuda()};
std::unique_ptr<OprProxy<ConvolutionForward>> proxy{new OprProxy<ConvolutionForward>{true}};
std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
new OprProxy<ConvolutionForward>{true}};
size_t RUNS = 10; size_t RUNS = 10;
bench.set_proxy(proxy).set_times(RUNS); bench.set_proxy(proxy).set_times(RUNS);


@@ -429,7 +439,7 @@ TEST_F(CUDA, CONVOLUTION_FWD_BENCHMARK) {
param.pad_h = param.pad_w = PH; param.pad_h = param.pad_w = PH;
param.compute_mode = param::Convolution::ComputeMode::DEFAULT; param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
bench.set_param(param); bench.set_param(param);
bench.proxy()->target_algo_info.reset();
bench.proxy()->target_execution_policy.algo.reset();
TensorLayout src{{N, IC, IH, IW}, dtype::Float32()}, TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
filter{{OC, IC, FH, FH}, dtype::Float32()}; filter{{OC, IC, FH, FH}, dtype::Float32()};
TensorLayout dst; TensorLayout dst;
@@ -440,13 +450,13 @@ TEST_F(CUDA, CONVOLUTION_FWD_BENCHMARK) {
} }
auto time_ms_fp32 = bench.execl({src, filter, dst}) / RUNS; auto time_ms_fp32 = bench.execl({src, filter, dst}) / RUNS;
src.dtype = filter.dtype = dst.dtype = dtype::Float16(); src.dtype = filter.dtype = dst.dtype = dtype::Float16();
bench.proxy()->target_algo_info.reset();
bench.proxy()->target_execution_policy.algo.reset();
bench.set_dtype(0, dtype::Float16()) bench.set_dtype(0, dtype::Float16())
.set_dtype(1, dtype::Float16()) .set_dtype(1, dtype::Float16())
.set_dtype(2, dtype::Float16()); .set_dtype(2, dtype::Float16());
auto time_ms_true_fp16 = bench.execl({src, filter, dst}) / RUNS; auto time_ms_true_fp16 = bench.execl({src, filter, dst}) / RUNS;
param.compute_mode = param::Convolution::ComputeMode::FLOAT32; param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
bench.proxy()->target_algo_info.reset();
bench.proxy()->target_execution_policy.algo.reset();
bench.set_param(param); bench.set_param(param);
auto time_ms_pseudo_fp16 = bench.execl({src, filter, dst}) / RUNS; auto time_ms_pseudo_fp16 = bench.execl({src, filter, dst}) / RUNS;
float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH; float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
@@ -500,7 +510,7 @@ TEST_F(CUDA, CONVOLUTION_BWD_DATA_BENCHMARK) {
param.pad_h = param.pad_w = PH; param.pad_h = param.pad_w = PH;
param.compute_mode = param::Convolution::ComputeMode::DEFAULT; param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
bench.set_param(param); bench.set_param(param);
bench.proxy()->target_algo_info.reset();
bench.proxy()->target_execution_policy.algo.reset();
TensorLayout src{{N, IC, IH, IW}, dtype::Float32()}, TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
filter{{OC, IC, FH, FH}, dtype::Float32()}; filter{{OC, IC, FH, FH}, dtype::Float32()};
TensorLayout dst; TensorLayout dst;
@@ -511,13 +521,13 @@ TEST_F(CUDA, CONVOLUTION_BWD_DATA_BENCHMARK) {
} }
auto time_ms_fp32 = bench.execl({filter, dst, src}) / RUNS; auto time_ms_fp32 = bench.execl({filter, dst, src}) / RUNS;
src.dtype = filter.dtype = dst.dtype = dtype::Float16(); src.dtype = filter.dtype = dst.dtype = dtype::Float16();
bench.proxy()->target_algo_info.reset();
bench.proxy()->target_execution_policy.algo.reset();
bench.set_dtype(0, dtype::Float16()) bench.set_dtype(0, dtype::Float16())
.set_dtype(1, dtype::Float16()) .set_dtype(1, dtype::Float16())
.set_dtype(2, dtype::Float16()); .set_dtype(2, dtype::Float16());
auto time_ms_true_fp16 = bench.execl({filter, dst, src}) / RUNS; auto time_ms_true_fp16 = bench.execl({filter, dst, src}) / RUNS;
param.compute_mode = param::Convolution::ComputeMode::FLOAT32; param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
bench.proxy()->target_algo_info.reset();
bench.proxy()->target_execution_policy.algo.reset();
bench.set_param(param); bench.set_param(param);
auto time_ms_pseudo_fp16 = bench.execl({filter, dst, src}) / RUNS; auto time_ms_pseudo_fp16 = bench.execl({filter, dst, src}) / RUNS;
float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH; float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
@@ -554,6 +564,62 @@ TEST_F(CUDA, CONVOLUTION_BWD_DATA_BENCHMARK) {
run(32, 64, 64, 56, 56, 1, 1, 0); run(32, 64, 64, 56, 56, 1, 1, 0);
} }


TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_BF16) {
CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
new OprProxy<ConvolutionBackwardData>{true}};
size_t RUNS = 10;
bench.set_proxy(proxy).set_times(RUNS);

auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW,
size_t FH, size_t SH, size_t PH) {
bench.set_dtype(0, dtype::BFloat16())
.set_dtype(1, dtype::BFloat16())
.set_dtype(2, dtype::BFloat16());
param::Convolution param;
param.stride_h = param.stride_w = SH;
param.pad_h = param.pad_w = PH;
param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
bench.set_param(param);
bench.proxy()->target_execution_policy = {};
TensorLayout src{{N, IC, IH, IW}, dtype::BFloat16()},
filter{{OC, IC, FH, FH}, dtype::BFloat16()};
TensorLayout dst;
{
auto&& opr = handle_cuda()->create_operator<Convolution>();
opr->param() = param;
opr->deduce_layout(src, filter, dst);
}
auto used = bench.execl({filter, dst, src}) / RUNS;
float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
filter.to_string().c_str(), dst.to_string().c_str());
printf("time_fp32=%.2fms, flops=%.3fTFLOPS\n", used,
(flo / (used * 1e9)));
};
run(32, 64, 3, 224, 224, 7, 2, 3);
run(32, 128, 128, 28, 28, 3, 1, 1);
run(32, 256, 256, 14, 14, 3, 1, 1);
run(32, 512, 512, 7, 7, 3, 1, 1);
run(32, 64, 64, 56, 56, 3, 1, 1);
run(32, 512, 256, 56, 56, 1, 2, 0);
run(32, 1024, 512, 28, 28, 1, 2, 0);
run(32, 2048, 1024, 14, 14, 1, 2, 0);
run(32, 512, 128, 28, 28, 1, 1, 0);
run(32, 128, 512, 28, 28, 1, 1, 0);
run(32, 1024, 256, 14, 14, 1, 1, 0);
run(32, 256, 1024, 14, 14, 1, 1, 0);
run(32, 2048, 512, 7, 7, 1, 1, 0);
run(32, 512, 2048, 7, 7, 1, 1, 0);
run(32, 256, 64, 56, 56, 1, 1, 0);
run(32, 64, 256, 56, 56, 1, 1, 0);
run(32, 128, 256, 56, 56, 1, 2, 0);
run(32, 256, 512, 28, 28, 1, 2, 0);
run(32, 512, 1024, 14, 14, 1, 2, 0);
run(32, 64, 64, 56, 56, 1, 1, 0);
}


TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) { TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) {
CUBenchmarker<ConvolutionBackwardFilter> bench{handle_cuda()}; CUBenchmarker<ConvolutionBackwardFilter> bench{handle_cuda()};
std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{ std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
@@ -571,7 +637,7 @@ TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) {
param.pad_h = param.pad_w = PH; param.pad_h = param.pad_w = PH;
param.compute_mode = param::Convolution::ComputeMode::DEFAULT; param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
bench.set_param(param); bench.set_param(param);
bench.proxy()->target_algo_info.reset();
bench.proxy()->target_execution_policy.algo.reset();
TensorLayout src{{N, IC, IH, IW}, dtype::Float32()}, TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
filter{{OC, IC, FH, FH}, dtype::Float32()}; filter{{OC, IC, FH, FH}, dtype::Float32()};
TensorLayout dst; TensorLayout dst;
@@ -582,13 +648,13 @@ TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) {
} }
auto time_ms_fp32 = bench.execl({src, dst, filter}) / RUNS; auto time_ms_fp32 = bench.execl({src, dst, filter}) / RUNS;
src.dtype = filter.dtype = dst.dtype = dtype::Float16(); src.dtype = filter.dtype = dst.dtype = dtype::Float16();
bench.proxy()->target_algo_info.reset();
bench.proxy()->target_execution_policy.algo.reset();
bench.set_dtype(0, dtype::Float16()) bench.set_dtype(0, dtype::Float16())
.set_dtype(1, dtype::Float16()) .set_dtype(1, dtype::Float16())
.set_dtype(2, dtype::Float16()); .set_dtype(2, dtype::Float16());
auto time_ms_true_fp16 = bench.execl({src, dst, filter}) / RUNS; auto time_ms_true_fp16 = bench.execl({src, dst, filter}) / RUNS;
param.compute_mode = param::Convolution::ComputeMode::FLOAT32; param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
bench.proxy()->target_algo_info.reset();
bench.proxy()->target_execution_policy.algo.reset();
bench.set_param(param); bench.set_param(param);
auto time_ms_pseudo_fp16 = bench.execl({src, dst, filter}) / RUNS; auto time_ms_pseudo_fp16 = bench.execl({src, dst, filter}) / RUNS;
float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH; float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
@@ -630,8 +696,7 @@ TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) {
#undef V #undef V
#undef V1 #undef V1



} // namespace test
} // namespace megdnn
} // namespace test
} // namespace megdnn


// vim: syntax=cpp.doxygen // vim: syntax=cpp.doxygen

+ 9
- 9
dnn/test/cuda/local_share.cpp View File

@@ -778,7 +778,7 @@ TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_BWD_FILTER) {
.set_dtype(2, dtype::Float32()) .set_dtype(2, dtype::Float32())
.set_rng(0, &rng) .set_rng(0, &rng)
.set_rng(1, &rng); .set_rng(1, &rng);
bencher.proxy()->target_algo_info.reset();
bencher.proxy()->target_execution_policy.algo.reset();
auto time_in_ms = bencher.execs({src, diff, grad}) / RUNS; auto time_in_ms = bencher.execs({src, diff, grad}) / RUNS;


printf("src=%s, diff=%s, grad=%s, float32: %.2fms " printf("src=%s, diff=%s, grad=%s, float32: %.2fms "
@@ -856,7 +856,7 @@ TEST_F(CUDA, BENCHMARK_GROUP_LOCAL_SHARE_FORWARD) {
.set_dtype(2, dtype::Float32()) .set_dtype(2, dtype::Float32())
.set_rng(0, &rng) .set_rng(0, &rng)
.set_rng(1, &rng); .set_rng(1, &rng);
bencher.proxy()->target_algo_info.reset();
bencher.proxy()->target_execution_policy.algo.reset();
auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS; auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS;
; ;


@@ -915,7 +915,7 @@ TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_BWD_DATA) {
.set_dtype(2, dtype::Float32()) .set_dtype(2, dtype::Float32())
.set_rng(0, &rng) .set_rng(0, &rng)
.set_rng(1, &rng); .set_rng(1, &rng);
bencher.proxy()->target_algo_info.reset();
bencher.proxy()->target_execution_policy.algo.reset();
auto time_in_ms = bencher.execs({filter, diff, grad}) / RUNS; auto time_in_ms = bencher.execs({filter, diff, grad}) / RUNS;


printf("filter=%s, diff=%s, grad=%s, float32: %.2fms " printf("filter=%s, diff=%s, grad=%s, float32: %.2fms "
@@ -1002,11 +1002,11 @@ TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_FORWARD_BOTTLENECK) {
.set_dtype(2, dtype::Float32()) .set_dtype(2, dtype::Float32())
.set_rng(0, &rng) .set_rng(0, &rng)
.set_rng(1, &rng); .set_rng(1, &rng);
bencher.proxy()->target_algo_info.reset();
bencher.proxy()->target_execution_policy.algo.reset();
auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS; auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS;


bencher_conv.set_param(conv_param); bencher_conv.set_param(conv_param);
bencher_conv.proxy()->target_algo_info.reset();
bencher_conv.proxy()->target_execution_policy.algo.reset();
auto time_in_ms_conv = auto time_in_ms_conv =
bencher_conv.execs({src, {oc, ic, f, f}, {}}) / RUNS; bencher_conv.execs({src, {oc, ic, f, f}, {}}) / RUNS;


@@ -1094,11 +1094,11 @@ TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_FORWARD_FROM_RESEARCH) {
.set_dtype(2, dtype::Float32()) .set_dtype(2, dtype::Float32())
.set_rng(0, &rng) .set_rng(0, &rng)
.set_rng(1, &rng); .set_rng(1, &rng);
bencher.proxy()->target_algo_info.reset();
bencher.proxy()->target_execution_policy.algo.reset();
auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS; auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS;


bencher_conv.set_param(conv_param); bencher_conv.set_param(conv_param);
bencher_conv.proxy()->target_algo_info.reset();
bencher_conv.proxy()->target_execution_policy.algo.reset();
auto time_in_ms_conv = auto time_in_ms_conv =
bencher_conv.execs({src, {oc, ic, f, f}, {}}) / RUNS; bencher_conv.execs({src, {oc, ic, f, f}, {}}) / RUNS;


@@ -1177,11 +1177,11 @@ TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_FORWARD) {
.set_dtype(2, dtype::Float32()) .set_dtype(2, dtype::Float32())
.set_rng(0, &rng) .set_rng(0, &rng)
.set_rng(1, &rng); .set_rng(1, &rng);
bencher.proxy()->target_algo_info.reset();
bencher.proxy()->target_execution_policy.algo.reset();
auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS; auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS;


bencher_conv.set_param(conv_param); bencher_conv.set_param(conv_param);
bencher_conv.proxy()->target_algo_info.reset();
bencher_conv.proxy()->target_execution_policy.algo.reset();
auto time_in_ms_conv = auto time_in_ms_conv =
bencher_conv.execs({src, {oc, ic, f, f}, {}}) / RUNS; bencher_conv.execs({src, {oc, ic, f, f}, {}}) / RUNS;




+ 23
- 1
dnn/test/gtest_main.cpp View File

@@ -10,6 +10,7 @@
*/ */


#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "megdnn/basic_types.h"
#include "src/common/utils.h" #include "src/common/utils.h"
#include "test/common/random_state.h" #include "test/common/random_state.h"


@@ -21,9 +22,29 @@ class ResetSeedListener : public ::testing::EmptyTestEventListener {
} }
}; };


megdnn::LogLevel min_log_level;

void set_log_level() {
megdnn::LogLevel level = megdnn::LogLevel::INFO;
auto setting = std::getenv("MEGDNN_LOG_LEVEL");
if (setting) {
if (!strcmp(setting, "INFO")) {
level = megdnn::LogLevel::INFO;
} else if (!strcmp(setting, "DEBUG")) {
level = megdnn::LogLevel::DEBUG;
} else if (!strcmp(setting, "WARN")) {
level = megdnn::LogLevel::WARN;
} else {
megdnn_assert(!strcmp(setting, "ERROR"));
level = megdnn::LogLevel::ERROR;
}
}
min_log_level = level;
}

void log_handler(megdnn::LogLevel level, const char* file, const char* func, void log_handler(megdnn::LogLevel level, const char* file, const char* func,
int line, const char* fmt, va_list ap) { int line, const char* fmt, va_list ap) {
if (level < megdnn::LogLevel::ERROR) {
if (level < min_log_level) {
return; return;
} }
char msg[1024]; char msg[1024];
@@ -39,6 +60,7 @@ void log_handler(megdnn::LogLevel level, const char* file, const char* func,


extern "C" int gtest_main(int argc, char** argv) { extern "C" int gtest_main(int argc, char** argv) {
::megdnn::set_log_handler(log_handler); ::megdnn::set_log_handler(log_handler);
set_log_level();
ResetSeedListener listener; ResetSeedListener listener;
auto&& listeners = ::testing::UnitTest::GetInstance()->listeners(); auto&& listeners = ::testing::UnitTest::GetInstance()->listeners();
::testing::InitGoogleTest(&argc, argv); ::testing::InitGoogleTest(&argc, argv);


+ 1
- 0
dnn/test/x86/convolution.cpp View File

@@ -450,6 +450,7 @@ TEST_F(X86, BENCHMARK_CONVOLUTION_I8x8x32_MKLDNN) {
} }
} }
#endif #endif

#endif #endif


} // namespace test } // namespace test


+ 9
- 8
src/core/test/graph/misc.cpp View File

@@ -27,6 +27,7 @@
#include "megbrain/gopt/inference.h" #include "megbrain/gopt/inference.h"


#include "megbrain/test/helper.h" #include "megbrain/test/helper.h"
#include "megdnn/oprs/base.h"


#include <atomic> #include <atomic>
#include <chrono> #include <chrono>
@@ -1924,19 +1925,19 @@ TEST(TestGraph, NaiveRecord2NCHW44) {


namespace { namespace {
template <typename DnnOp, typename... Args> template <typename DnnOp, typename... Args>
typename DnnOp::AlgorithmInfo try_find_any_weight_preprocess_algo(
typename megdnn::ExecutionPolicy try_find_any_weight_preprocess_algo(
DnnOp* dnn_op, const char* mgb_info, Maybe<bool>& found, DnnOp* dnn_op, const char* mgb_info, Maybe<bool>& found,
Args&& ...args) { Args&& ...args) {
if (found.valid()) { if (found.valid()) {
if (found.val()) { if (found.val()) {
return dnn_op->execution_policy().algo;
return dnn_op->execution_policy();
} else { } else {
return {}; return {};
} }
} }
for (auto&& algo : dnn_op->get_all_algorithms_info( for (auto&& algo : dnn_op->get_all_algorithms_info(
std::forward<Args>(args)...)) { std::forward<Args>(args)...)) {
dnn_op->execution_policy().algo = algo;
dnn_op->execution_policy().algo = algo.desc;
auto layouts = dnn_op->deduce_preprocessed_filter_layout( auto layouts = dnn_op->deduce_preprocessed_filter_layout(
std::forward<Args>(args)...); std::forward<Args>(args)...);
if (layouts.empty()) continue; if (layouts.empty()) continue;
@@ -1949,7 +1950,7 @@ typename DnnOp::AlgorithmInfo try_find_any_weight_preprocess_algo(
} }
if (valid) { if (valid) {
found.emplace(true); found.emplace(true);
return algo;
return {algo.desc, {}};
} }
} }
found.emplace(false); found.emplace(false);
@@ -1958,19 +1959,19 @@ typename DnnOp::AlgorithmInfo try_find_any_weight_preprocess_algo(
} }


template <typename DnnOp, typename... Args> template <typename DnnOp, typename... Args>
typename DnnOp::AlgorithmInfo try_find_any_bias_preprocess_algo(
typename megdnn::ExecutionPolicy try_find_any_bias_preprocess_algo(
DnnOp* dnn_op, const char* mgb_info, Maybe<bool>& found, DnnOp* dnn_op, const char* mgb_info, Maybe<bool>& found,
Args&& ...args) { Args&& ...args) {
if (found.valid()) { if (found.valid()) {
if (found.val()) { if (found.val()) {
return dnn_op->execution_policy().algo;
return dnn_op->execution_policy();
} else { } else {
return {}; return {};
} }
} }
for (auto&& algo : dnn_op->get_all_algorithms_info( for (auto&& algo : dnn_op->get_all_algorithms_info(
std::forward<Args>(args)...)) { std::forward<Args>(args)...)) {
dnn_op->execution_policy().algo = algo;
dnn_op->execution_policy().algo = algo.desc;
auto layouts = dnn_op->deduce_preprocessed_filter_layout( auto layouts = dnn_op->deduce_preprocessed_filter_layout(
std::forward<Args>(args)...); std::forward<Args>(args)...);
if (layouts.size() <= 1) if (layouts.size() <= 1)
@@ -1981,7 +1982,7 @@ typename DnnOp::AlgorithmInfo try_find_any_bias_preprocess_algo(
} }
if (valid) { if (valid) {
found.emplace(true); found.emplace(true);
return algo;
return {algo.desc, {}};
} }
} }
found.emplace(false); found.emplace(false);


+ 395
- 142
src/opr/impl/search_policy/algo_chooser.cpp View File

@@ -11,6 +11,7 @@
*/ */


#include "megbrain/opr/search_policy/algo_chooser.h" #include "megbrain/opr/search_policy/algo_chooser.h"
#include "megbrain/opr/internal/megdnn_opr_wrapper.h"
#include "megbrain/opr/search_policy/algo_chooser_helper.h" #include "megbrain/opr/search_policy/algo_chooser_helper.h"
#include "megbrain/opr/search_policy/profiler.h" #include "megbrain/opr/search_policy/profiler.h"


@@ -21,6 +22,7 @@
//! TODO: here has to be know some megdnn::opr when there is produced midout.h //! TODO: here has to be know some megdnn::opr when there is produced midout.h
//! fix it if there is another graceful way. //! fix it if there is another graceful way.
#include "megdnn/oprs.h" #include "megdnn/oprs.h"
#include "megdnn/oprs/base.h"
#include "midout.h" #include "midout.h"
MIDOUT_DECL(megbrain_opr_algo_chooser) MIDOUT_DECL(megbrain_opr_algo_chooser)
#define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_algo_chooser, __VA_ARGS__) { #define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_algo_chooser, __VA_ARGS__) {
@@ -29,6 +31,8 @@ MIDOUT_DECL(megbrain_opr_algo_chooser)
MIDOUT_END(); MIDOUT_END();


using mgb::opr::intl::WorkspaceLimitGetter; using mgb::opr::intl::WorkspaceLimitGetter;
using namespace megdnn;
using namespace mgb;


#define APPLY(statement, ...) \ #define APPLY(statement, ...) \
mgb::apply([&](const auto&... args) { return statement; }, \ mgb::apply([&](const auto&... args) { return statement; }, \
@@ -37,7 +41,7 @@ using mgb::opr::intl::WorkspaceLimitGetter;
// timeout delta to be added with fastest known algorithm for new algos // timeout delta to be added with fastest known algorithm for new algos
constexpr double TIMEOUT_TOLERANCE = 2; constexpr double TIMEOUT_TOLERANCE = 2;


#define CACHE_KEY_VERSION "v3"
#define CACHE_KEY_VERSION "v4"


namespace { namespace {
template <typename Opr> template <typename Opr>
@@ -48,44 +52,191 @@ std::string profile_name(Opr* opr) {
ret.append(opr->get_algorithm_set_name()); ret.append(opr->get_algorithm_set_name());
return ret; return ret;
} }

template <typename Opr>
std::string format_fixlayouts(
const typename opr::AlgoChooser<Opr>::FixedTensorLayouts& layouts,
size_t arity_in, size_t arity_out) {
std::string ret;
ret.append(": tensor layouts(");
for (size_t i = 0; i < arity_in; ++i) {
if (i) {
ret.append(", ");
}
ret.append(layouts[i].to_string() + " ");
ret.append(layouts[i].dtype.name());
}
ret.append(") -> (");
for (size_t i = 0; i < arity_out; ++i) {
if (i) {
ret.append(", ");
}
ret.append(layouts[i + arity_in].to_string() + " ");
ret.append(layouts[i + arity_in].dtype.name());
}
return ret;
}

///////////////// OprTypeTrait /////////////////////////////
template <megdnn::Algorithm::OprType>
struct OprFromOprTypeTrait;

template <typename Opr>
struct OprTypeFromOprTrait;

#define cb(_opr_type, _opr) \
template <> \
struct OprFromOprTypeTrait<megdnn::Algorithm::OprType::_opr_type> { \
using Opr = megdnn::_opr; \
}; \
template <> \
struct OprTypeFromOprTrait<megdnn::_opr> { \
constexpr static megdnn::Algorithm::OprType opr_type = \
megdnn::Algorithm::OprType::_opr_type; \
}

cb(MATRIX_MUL_FORWARD, MatrixMulForward);
cb(BATCHED_MATRIX_MUL_FORWARD, BatchedMatrixMulForward);
cb(CONVOLUTION_FORWARD, ConvolutionForward);
cb(CONVOLUTION_BACKWARD_DATA, ConvolutionBackwardData);
cb(CONVOLUTION_BACKWARD_FILTER, ConvolutionBackwardFilter);
cb(CONVOLUTION3D_FORWARD, Convolution3DForward);
cb(CONVOLUTION3D_BACKWARD_DATA, Convolution3DBackwardData);
cb(CONVOLUTION3D_BACKWARD_FILTER, Convolution3DBackwardFilter);
cb(LOCAL_SHARE_FORWARD, LocalShareForward);
cb(LOCAL_SHARE_BACKWARD_DATA, LocalShareBackwardData);
cb(LOCAL_SHARE_BACKWARD_FILTER, LocalShareBackwardFilter);
cb(DEFORMABLE_CONV_FORWARD, DeformableConvForward);
cb(DEFORMABLE_CONV_BACKWARD_DATA, DeformableConvBackwardData);
cb(DEFORMABLE_CONV_BACKWARD_FILTER, DeformableConvBackwardFilter);
cb(BATCH_CONV_FORWARD, BatchConvBiasForward);
cb(CONVBIAS_FORWARD, ConvBiasForward);

#undef cb

// clang-format off
#define FOREACH_OPR_TYPE_WITH_STMT(cb, stmt) \
cb(MATRIX_MUL_FORWARD, stmt) \
cb(BATCHED_MATRIX_MUL_FORWARD, stmt) \
cb(CONVOLUTION_FORWARD, stmt) \
cb(CONVOLUTION_BACKWARD_DATA, stmt) \
cb(CONVOLUTION_BACKWARD_FILTER, stmt) \
cb(CONVOLUTION3D_FORWARD, stmt) \
cb(CONVOLUTION3D_BACKWARD_DATA, stmt) \
cb(CONVOLUTION3D_BACKWARD_FILTER, stmt) \
cb(LOCAL_SHARE_FORWARD, stmt) \
cb(LOCAL_SHARE_BACKWARD_DATA, stmt) \
cb(LOCAL_SHARE_BACKWARD_FILTER, stmt) \
cb(DEFORMABLE_CONV_FORWARD, stmt) \
cb(DEFORMABLE_CONV_BACKWARD_DATA, stmt) \
cb(DEFORMABLE_CONV_BACKWARD_FILTER, stmt) \
cb(BATCH_CONV_FORWARD, stmt) \
cb(CONVBIAS_FORWARD, stmt)
// clang-format on

#define _OPR_TYPE_CASE(_opr_type, _stmt) \
case Algorithm::OprType::_opr_type: { \
using _Opr = typename OprFromOprTypeTrait< \
Algorithm::OprType::_opr_type>::Opr; \
_stmt; \
break; \
}

#define FOREACH_OPR_TYPE_DISPATCH(_search_items, _stmt) \
for (size_t _item_idx = 0; _item_idx < _search_items.size(); \
_item_idx++) { \
auto&& _item = _search_items[_item_idx]; \
switch (_item.opr_type) { \
FOREACH_OPR_TYPE_WITH_STMT(_OPR_TYPE_CASE, _stmt) \
default: \
mgb_throw(MegBrainError, "unknown opr_type"); \
} \
}

template <typename Opr>
TensorLayoutArray to_layout_array(
const typename opr::AlgoChooser<Opr>::FixedTensorLayouts& layouts) {
TensorLayoutArray ret;
for (auto&& layout : layouts) {
ret.push_back(layout);
}
return ret;
} }


template <typename Opr>
typename opr::AlgoChooser<Opr>::FixedTensorLayouts to_fixed_layouts(
const TensorLayoutArray& layouts) {
typename opr::AlgoChooser<Opr>::FixedTensorLayouts ret;
mgb_assert(ret.size() == layouts.size());
size_t idx = 0;
for (auto&& layout : layouts) {
ret[idx++] = layout;
}
return ret;
}

} // namespace

namespace mgb { namespace mgb {
namespace opr { namespace opr {


template <typename Opr> template <typename Opr>
AlgoChooserProfileCache::Result AlgoChooser<Opr>::get_profile_result(
ExeContext& ctx, bool enable_update) {
AlgoChooserProfileCache cache(ctx.mgb_opr()->comp_node(),
profile_name(ctx.megdnn_opr()).c_str());

TensorLayoutArray origin_layouts = ctx.layouts();
typename Opr::Param origin_param = ctx.mgb_opr()->param();
AlgoChooserProfileCache::Key cache_key{origin_layouts.data(),
origin_layouts.size(), &origin_param,
sizeof(origin_param)};
{
auto&& rst = cache.get(cache_key);
if (rst.valid())
return rst.val();
std::vector<megdnn::Algorithm::SearchItem>
AlgoChooser<Opr>::flatten_search_space(const ExeContext& ctx) {
std::vector<megdnn::Algorithm::SearchItem> ret;
for (auto algo_info : ctx.get_all_candidates()) {
megdnn::Algorithm* algo = ctx.get_algorithm_from_desc(algo_info.desc);
mgb_assert(algo, "Unknown algo description");
std::vector<megdnn::Algorithm::SearchItem>&& sub_items =
algo->get_subopr_list(to_layout_array<Opr>(ctx.layouts()),
ctx.megdnn_opr());

FOREACH_OPR_TYPE_DISPATCH(sub_items, {
auto&& megdnn_opr = intl::create_megdnn_opr<_Opr>(ctx.comp_node());
megdnn_opr->param() =
Algorithm::deserialize_read_pod<typename _Opr::Param>(
_item.param);
typename AlgoChooser<_Opr>::ExeContext sub_ctx(
to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(),
_item.param, ctx.mgb_opr(), ctx.comp_node(),
ctx.execution_policy(), ctx.allow_weight_preprocess());
auto space = AlgoChooser<_Opr>::flatten_search_space(sub_ctx);
ret.insert(ret.end(), space.begin(), space.end());
});
} }
ret.push_back({OprTypeFromOprTrait<Opr>::opr_type, ctx.param(),
to_layout_array<Opr>(ctx.layouts())});
return ret;
}


template <typename Opr>
void AlgoChooser<Opr>::profile(ExeContext& ctx, bool require_reproducible) {
if (ctx.get_profile_result_from_cache(require_reproducible).valid())
return;
AlgoChooserProfileCache::Result prof_rst; AlgoChooserProfileCache::Result prof_rst;
if (!enable_update)
return prof_rst;


std::string str_on_inp_shape = ssprintf( std::string str_on_inp_shape = ssprintf(
"on input layouts (%s, %s)", ctx.layouts()[0].to_string().c_str(), "on input layouts (%s, %s)", ctx.layouts()[0].to_string().c_str(),
ctx.layouts()[1].to_string().c_str()); ctx.layouts()[1].to_string().c_str());
double cur_timeout = 0; double cur_timeout = 0;

auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit(
ctx.owner_graph(), ctx.comp_node(),
ctx.execution_policy().workspace_limit);
RealTimer timer; RealTimer timer;
for (auto algo : ctx.get_all_candidates_with_workspace_limit()) {
for (auto algo : ctx.get_all_candidates()) {
Maybe<AlgoChooserProfileCache::ResultEntry> cur_rst; Maybe<AlgoChooserProfileCache::ResultEntry> cur_rst;
std::string msg = ssprintf("profiling %s algorithm %s %s", std::string msg = ssprintf("profiling %s algorithm %s %s",
ctx.mgb_opr()->dyn_typeinfo()->name, ctx.mgb_opr()->dyn_typeinfo()->name,
algo.name.c_str(), str_on_inp_shape.c_str()); algo.name.c_str(), str_on_inp_shape.c_str());
ImplExecutionPolicy policy;
policy.algo = algo.desc;
ctx.construct_execution_policy_from_cache(require_reproducible, policy);
if (ctx.get_workspace_size_bytes(policy) >= workspace_limit)
continue;

timer.reset(); timer.reset();
MGB_TRY { cur_rst = ctx.profile_single_algo(algo, cur_timeout); }
MGB_TRY { cur_rst = ctx.profile_single_algo(policy, cur_timeout); }
MGB_CATCH(std::exception & exc, { MGB_CATCH(std::exception & exc, {
mgb_log_warn("caught exception during %s: %s", msg.c_str(), mgb_log_warn("caught exception during %s: %s", msg.c_str(),
exc.what()); exc.what());
@@ -114,120 +265,100 @@ AlgoChooserProfileCache::Result AlgoChooser<Opr>::get_profile_result(
mgb_assert(!prof_rst.empty(), "no usable convolution algorithm %s", mgb_assert(!prof_rst.empty(), "no usable convolution algorithm %s",
str_on_inp_shape.c_str()); str_on_inp_shape.c_str());


FixedTensorLayouts origin_layouts = ctx.layouts();
typename Opr::Param origin_param = ctx.megdnn_opr()->param();
AlgoChooserProfileCache::Key cache_key{origin_layouts.data(),
origin_layouts.size(), &origin_param,
sizeof(origin_param)};

AlgoChooserProfileCache cache(ctx.comp_node(),
profile_name(ctx.megdnn_opr()).c_str());
cache.put(cache_key, prof_rst); cache.put(cache_key, prof_rst);
return prof_rst;
} }


template <typename Opr> template <typename Opr>
typename AlgoChooser<Opr>::ImplAlgo AlgoChooser<Opr>::choose_by_profile(
ExeContext& ctx, bool require_reproducible, bool enable_update) {
typename AlgoChooser<Opr>::ImplExecutionPolicy
AlgoChooser<Opr>::choose_by_profile(ExeContext& ctx, bool require_reproducible,
bool enable_update) {
MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("AlgoChooser::choose_by_profile"))) MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("AlgoChooser::choose_by_profile")))
auto opr = ctx.mgb_opr();
if (opr->owner_graph()->options().no_profiling_on_shape_change) {
auto algo = ctx.megdnn_opr()->execution_policy().algo;
if (algo.valid())
return algo;
if (ctx.owner_graph()->options().no_profiling_on_shape_change) {
auto policy = ctx.megdnn_opr()->execution_policy();
if (policy.algo.valid())
return policy;
} }


std::unordered_map<std::string, ImplAlgo> algo_map;
for (auto i : ctx.get_all_candidates()) {
auto ins = algo_map.emplace(i.name.c_str(), i);
mgb_assert(ins.second, "duplicated algo name: %s", i.name.c_str());
if (enable_update) {
auto&& search_items = flatten_search_space(ctx);
FOREACH_OPR_TYPE_DISPATCH(search_items, {
auto&& megdnn_opr = intl::create_megdnn_opr<_Opr>(ctx.comp_node());
megdnn_opr->param() =
Algorithm::deserialize_read_pod<typename _Opr::Param>(
_item.param);
typename AlgoChooser<_Opr>::ExeContext sub_ctx(
to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(),
_item.param, ctx.mgb_opr(), ctx.comp_node(),
ctx.execution_policy(), ctx.allow_weight_preprocess());
AlgoChooser<_Opr>::profile(sub_ctx, require_reproducible);
});
} }

auto&& prof = get_profile_result(ctx, enable_update);
if (prof.empty())
return {};
for (auto&& i : prof) {
if ((!require_reproducible || i.reproducible)) {
auto iter = algo_map.find(i.algo);
mgb_assert(iter != algo_map.end(),
"algorithm %s exists in "
"profiling result but not in algo_map; please "
"report this "
"bug; opr: %s{%s}, shapes: %s %s %s",
i.algo.c_str(),
ctx.mgb_opr()->cname(),
ctx.mgb_opr()->dyn_typeinfo()->name,
ctx.layouts()[0].TensorShape::to_string().c_str(),
ctx.layouts()[1].TensorShape::to_string().c_str(),
ctx.layouts()[2].TensorShape::to_string().c_str());
return iter->second;
}
}

mgb_log_error(
"Workspace requirement (%zu) could not be satisfied. Abort now "
"to "
"avoid further problems",
WorkspaceLimitGetter::get_workspace_limit(
opr->owner_graph(), opr->comp_node(),
opr->execution_policy().workspace_limit));
mgb_trap();
typename AlgoChooser<Opr>::ImplExecutionPolicy policy;
ctx.construct_execution_policy_from_cache(require_reproducible, policy);
return policy;
MIDOUT_E MIDOUT_E
} }


template <typename Opr> template <typename Opr>
size_t AlgoChooser<Opr>::setup_algo(const TensorLayoutArray& layouts,
size_t AlgoChooser<Opr>::setup_algo(const FixedTensorLayouts& layouts,
Opr* megdnn_opr, const MGBOpr* mgb_opr, Opr* megdnn_opr, const MGBOpr* mgb_opr,
bool allow_weight_preprocess) { bool allow_weight_preprocess) {
if (WorkspaceLimitGetter::is_prealloc_run(mgb_opr->owner_graph())) { if (WorkspaceLimitGetter::is_prealloc_run(mgb_opr->owner_graph())) {
return 0; return 0;
} }


ImplAlgo algo = {};
ExeContext ctx(layouts, megdnn_opr, mgb_opr, allow_weight_preprocess);
std::string param_str;
Algorithm::serialize_write_pod(megdnn_opr->param(), param_str);
ExeContext ctx(layouts, megdnn_opr, param_str, mgb_opr,
mgb_opr->comp_node(), mgb_opr->execution_policy(),
allow_weight_preprocess);


ImplExecutionPolicy policy;
if (auto algo_choose_hook = mgb_opr->algo_chooser()) { if (auto algo_choose_hook = mgb_opr->algo_chooser()) {
algo = algo_choose_hook(mgb_opr);
policy = algo_choose_hook(mgb_opr);
} }
if (!algo.valid()) {
algo = get_algo(ctx);
if (!policy.algo.valid()) {
policy = get_policy(ctx);
} }
size_t workspace = ctx.get_workspace_size_bytes(algo);
size_t workspace = ctx.get_workspace_size_bytes(policy);


std::string ret; std::string ret;
ret.append(mgb_opr->dyn_typeinfo()->name); ret.append(mgb_opr->dyn_typeinfo()->name);
ret.append(": tensor layouts(");
for (size_t i = 0; i < arity_in; ++i) {
if (i) {
ret.append(", ");
}
ret.append(layouts[i].to_string() + " ");
ret.append(layouts[i].dtype.name());
}
ret.append(") -> (");
for (size_t i = 0; i < arity_out; ++i) {
if (i) {
ret.append(", ");
}
ret.append(layouts[i + arity_in].to_string() + " ");
ret.append(layouts[i + arity_in].dtype.name());
}
ret.append("): algo=" + algo.name);
ret += format_fixlayouts<Opr>(layouts, arity_in, arity_out);
Algorithm* palgo = megdnn_opr->get_algorithm_from_desc(policy.algo);
mgb_assert(palgo, "Unknown algo description");
ret.append("): algo=" + std::string(palgo->name()));
ret.append(ssprintf(" workspace=%.2fMiB reproducible=%d", ret.append(ssprintf(" workspace=%.2fMiB reproducible=%d",
workspace / (1024 * 1024.0), algo.is_reproducible));
workspace / (1024 * 1024.0), palgo->is_reproducible()));
mgb_log_debug("%s", ret.c_str()); mgb_log_debug("%s", ret.c_str());


megdnn_opr->execution_policy() = {algo};
megdnn_opr->execution_policy() = policy;
return workspace; return workspace;
} }


template <typename Opr> template <typename Opr>
typename AlgoChooser<Opr>::ImplAlgo AlgoChooser<Opr>::get_algo(
typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::get_policy(
ExeContext& ctx) { ExeContext& ctx) {
using S = mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; using S = mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
MGB_MARK_USED_VAR(TIMEOUT_TOLERANCE); MGB_MARK_USED_VAR(TIMEOUT_TOLERANCE);
switch (ctx.mgb_opr()->execution_policy().strategy) {
switch (ctx.execution_policy().strategy) {
case S::HEURISTIC: case S::HEURISTIC:
return ctx.choose_by_heuristic(); return ctx.choose_by_heuristic();
case S::HEURISTIC_REPRODUCIBLE: case S::HEURISTIC_REPRODUCIBLE:
return ctx.choose_by_heuristic(true); return ctx.choose_by_heuristic(true);
case S::PROFILE_HEURISTIC: { case S::PROFILE_HEURISTIC: {
ImplAlgo algo = choose_by_profile(ctx, false, false);
if (!algo.valid())
algo = ctx.choose_by_heuristic();
return algo;
ImplExecutionPolicy policy = choose_by_profile(ctx, false, false);
if (!policy.algo.valid())
policy = ctx.choose_by_heuristic();
return policy;
} }
#if MGB_ENABLE_FASTRUN #if MGB_ENABLE_FASTRUN
case S::PROFILE: case S::PROFILE:
@@ -241,16 +372,17 @@ typename AlgoChooser<Opr>::ImplAlgo AlgoChooser<Opr>::get_algo(
} }


#define INST(Opr) \ #define INST(Opr) \
template AlgoChooser<megdnn::Opr>::ImplAlgo \
AlgoChooser<megdnn::Opr>::get_algo(ExeContext& ctx); \
template AlgoChooserProfileCache::Result \
AlgoChooser<megdnn::Opr>::get_profile_result(ExeContext& ctx, \
bool enable_update); \
template AlgoChooser<megdnn::Opr>::ImplAlgo \
template AlgoChooser<megdnn::Opr>::ImplExecutionPolicy \
AlgoChooser<megdnn::Opr>::get_policy(ExeContext& ctx); \
template void AlgoChooser<megdnn::Opr>::profile( \
ExeContext& ctx, bool require_reproducible); \
template std::vector<megdnn::Algorithm::SearchItem> \
AlgoChooser<megdnn::Opr>::flatten_search_space(const ExeContext& ctx); \
template AlgoChooser<megdnn::Opr>::ImplExecutionPolicy \
AlgoChooser<megdnn::Opr>::choose_by_profile( \ AlgoChooser<megdnn::Opr>::choose_by_profile( \
ExeContext& ctx, bool require_reproducible, bool enable_update); \ ExeContext& ctx, bool require_reproducible, bool enable_update); \
template size_t AlgoChooser<megdnn::Opr>::setup_algo( \ template size_t AlgoChooser<megdnn::Opr>::setup_algo( \
const TensorLayoutArray& layouts, megdnn::Opr* megdnn_opr, \
const FixedTensorLayouts& layouts, megdnn::Opr* megdnn_opr, \
const MGBOpr* mgb_opr, bool allow_weight_preprocess); const MGBOpr* mgb_opr, bool allow_weight_preprocess);


MGB_FOREACH_FASTRUN_OPR(INST) MGB_FOREACH_FASTRUN_OPR(INST)
@@ -258,17 +390,109 @@ MGB_FOREACH_FASTRUN_OPR(INST)
#undef INST #undef INST


//////////////////////////////// ExeContext ///////////////////////////// //////////////////////////////// ExeContext /////////////////////////////
template <typename Opr>
AlgoChooser<Opr>::ExeContext::ExeContext(
const FixedTensorLayouts& layouts, Opr* megdnn_opr,
const std::string& param_str, const cg::OperatorNodeBase* mgb_opr,
const CompNode& cn,
const megdnn::param::ExecutionPolicy& execution_policy,
bool allow_weight_preprocess)
: m_layouts{layouts},
m_megdnn_opr{megdnn_opr},
m_param{param_str},
m_base_mgb_opr{mgb_opr},
m_cn{cn},
m_execution_policy{execution_policy},
m_allow_weight_preprocess{allow_weight_preprocess} {
mgb_assert(m_layouts.size() == layouts.size());
static_assert(std::tuple_size<FixedTensorLayouts>::value == 3 ||
std::tuple_size<FixedTensorLayouts>::value == 5 ||
std::tuple_size<FixedTensorLayouts>::value == 8,
"Convolution AlgoChooser assumes arity = 3 , 5 or 8 (for "
"deformable conv)");
}


template <typename Opr> template <typename Opr>
typename AlgoChooser<Opr>::ImplAlgo typename AlgoChooser<Opr>::ImplAlgo
AlgoChooser<Opr>::ExeContext::get_profile_result_from_cache(
bool require_reproducible) const {
MIDOUT_B(Opr,
midout_iv(MGB_HASH_STR(
"AlgoChooser::ExeContext::get_profile_result_from_cache")))
AlgoChooserProfileCache cache(m_cn,
profile_name(m_megdnn_opr).c_str());

typename Opr::Param origin_param = m_megdnn_opr->param();
AlgoChooserProfileCache::Key cache_key{m_layouts.data(), m_layouts.size(),
&origin_param, sizeof(origin_param)};
auto&& rst = cache.get(cache_key);
if (!rst.valid())
return {};

auto&& prof = rst.val();
std::unordered_map<std::string, ImplAlgo> algo_map;
for (auto i : get_all_candidates()) {
auto ins = algo_map.emplace(i.name.c_str(), i);
mgb_assert(ins.second, "duplicated algo name: %s", i.name.c_str());
}

if (prof.empty())
return {};
for (auto&& i : prof) {
if ((!require_reproducible || i.reproducible)) {
auto iter = algo_map.find(i.algo);
mgb_assert(iter != algo_map.end(),
"algorithm %s exists in "
"profiling result but not in algo_map; please "
"report this "
"bug; opr: %s{%s}, layouts: %s ",
i.algo.c_str(), m_base_mgb_opr->cname(),
m_base_mgb_opr->dyn_typeinfo()->name,
format_fixlayouts<Opr>(m_layouts, arity_in, arity_out)
.c_str());
return iter->second;
}
}

mgb_log_error(
"Workspace requirement (%zu) could not be satisfied. Abort now "
"to "
"avoid further problems",
WorkspaceLimitGetter::get_workspace_limit(
m_base_mgb_opr->owner_graph(), m_cn,
m_execution_policy.workspace_limit));
mgb_trap();
MIDOUT_E
}

template <typename Opr>
typename AlgoChooser<Opr>::ImplExecutionPolicy
AlgoChooser<Opr>::ExeContext::choose_by_heuristic(bool reproducible) const { AlgoChooser<Opr>::ExeContext::choose_by_heuristic(bool reproducible) const {
auto opr = m_mgb_opr;
auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit( auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit(
opr->owner_graph(), opr->comp_node(),
opr->execution_policy().workspace_limit);
return APPLY(m_megdnn_opr->get_algorithm_info_heuristic(
args..., workspace_limit, reproducible),
m_layouts);
owner_graph(), m_cn, m_execution_policy.workspace_limit);
ImplExecutionPolicy policy;
policy.algo = APPLY(m_megdnn_opr->get_algorithm_info_heuristic(
args..., workspace_limit, reproducible),
m_layouts).desc;

Algorithm* algo = m_megdnn_opr->get_algorithm_from_desc(policy.algo);
mgb_assert(algo, "Unknown algo description");
std::vector<Algorithm::SearchItem>&& sub_items = algo->get_subopr_list(
to_layout_array<Opr>(m_layouts), m_megdnn_opr);

FOREACH_OPR_TYPE_DISPATCH(sub_items, {
auto&& megdnn_opr = intl::create_megdnn_opr<_Opr>(m_cn);
megdnn_opr->param() =
Algorithm::deserialize_read_pod<typename _Opr::Param>(
_item.param);
typename AlgoChooser<_Opr>::ExeContext sub_ctx(
to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(),
_item.param, m_base_mgb_opr, m_cn, m_execution_policy,
m_allow_weight_preprocess);
policy.sub_policy.push_back(sub_ctx.choose_by_heuristic(reproducible));
});

return policy;
} }


template <typename Opr> template <typename Opr>
@@ -279,40 +503,58 @@ AlgoChooser<Opr>::ExeContext::get_all_candidates() const {
APPLY(m_megdnn_opr->get_all_algorithms_info(args...), m_layouts); APPLY(m_megdnn_opr->get_all_algorithms_info(args...), m_layouts);
bool found = false; bool found = false;
for (size_t i = 0; i < ret.size(); ++i) { for (size_t i = 0; i < ret.size(); ++i) {
if (ret[i] == heu) {
if (ret[i].desc == heu.algo) {
found = true; found = true;
std::swap(ret[i], ret[0]); std::swap(ret[i], ret[0]);
break; break;
} }
} }

Algorithm* palgo = m_megdnn_opr->get_algorithm_from_desc(heu.algo);
mgb_assert(palgo, "Unknown algo description");
mgb_assert(found, mgb_assert(found,
"algo %s got by heuristic not found in " "algo %s got by heuristic not found in "
"candidate list", "candidate list",
heu.name.c_str());
palgo->name());
return std::move(ret); return std::move(ret);
} }


template <typename Opr> template <typename Opr>
std::vector<typename AlgoChooser<Opr>::ImplAlgo>
AlgoChooser<Opr>::ExeContext::get_all_candidates_with_workspace_limit() const {
auto&& all_algos = get_all_candidates();
auto opr = m_mgb_opr;
auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit(
opr->owner_graph(), opr->comp_node(),
opr->execution_policy().workspace_limit);
std::vector<ImplAlgo> ret;
for (auto&& algo : all_algos) {
if (get_workspace_size_bytes(algo) <= workspace_limit) {
ret.push_back(algo);
}
void AlgoChooser<Opr>::ExeContext::construct_execution_policy_from_cache(
bool require_reproducible,
typename AlgoChooser<Opr>::ImplExecutionPolicy& policy) const {
if (!policy.algo.valid()) {
policy.algo = get_profile_result_from_cache(require_reproducible).desc;
mgb_assert(policy.algo.valid(),
"No cache found, maybe some error occured");
} }
return ret;

Algorithm* algo = m_megdnn_opr->get_algorithm_from_desc(policy.algo);
mgb_assert(algo, "Unknown algo description");
std::vector<Algorithm::SearchItem>&& sub_items = algo->get_subopr_list(
to_layout_array<Opr>(m_layouts), m_megdnn_opr);

FOREACH_OPR_TYPE_DISPATCH(sub_items, {
auto&& megdnn_opr = intl::create_megdnn_opr<_Opr>(m_cn);
megdnn_opr->param() =
Algorithm::deserialize_read_pod<typename _Opr::Param>(
_item.param);
typename AlgoChooser<_Opr>::ExeContext sub_ctx(
to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(),
_item.param, m_base_mgb_opr, m_cn, m_execution_policy,
m_allow_weight_preprocess);
policy.sub_policy.push_back({});
sub_ctx.construct_execution_policy_from_cache(require_reproducible,
policy.sub_policy.back());
});

return;
} }


template <typename Opr> template <typename Opr>
size_t AlgoChooser<Opr>::ExeContext::get_workspace_size_bytes( size_t AlgoChooser<Opr>::ExeContext::get_workspace_size_bytes(
ImplAlgo algo) const {
m_megdnn_opr->execution_policy() = {algo};
const ImplExecutionPolicy& policy) const {
m_megdnn_opr->execution_policy() = policy;
size_t result; size_t result;
if_constexpr<opr_supports_preprocess<Opr>()>( if_constexpr<opr_supports_preprocess<Opr>()>(
[&](auto _) { [&](auto _) {
@@ -336,17 +578,13 @@ size_t AlgoChooser<Opr>::ExeContext::get_workspace_size_bytes(


template <typename Opr> template <typename Opr>
Maybe<AlgoChooserProfileCache::ResultEntry> Maybe<AlgoChooserProfileCache::ResultEntry>
AlgoChooser<Opr>::ExeContext::profile_single_algo(ImplAlgo algo,
double& timeout) const {
AlgoChooser<Opr>::ExeContext::profile_single_algo(
const ImplExecutionPolicy& policy, double& timeout) const {
typename TimedProfiler<Opr>::Param param; typename TimedProfiler<Opr>::Param param;
auto name = algo.name.c_str();
// force check copy size <= dest len-1 from gcc8 for safe // force check copy size <= dest len-1 from gcc8 for safe
auto len = sizeof(param.algo_name);
strncpy(param.algo_name, name, len - 1);
param.algo_name[len - 1] = '\0';
mgb_assert(!param.algo_name[sizeof(param.algo_name) - 2],
"algo name too long: %s; len=%zu", name, strlen(name));
param.workspace = get_workspace_size_bytes(algo);
param.execution_policy =
TimedProfiler<Opr>::Param::ExecutionPolicyBlob::serialize(policy);
param.workspace = get_workspace_size_bytes(policy);
for (int i = 0; i < arity; ++i) { for (int i = 0; i < arity; ++i) {
auto&& src = m_layouts[i]; auto&& src = m_layouts[i];
mgb_assert(src.format.is_default() && mgb_assert(src.format.is_default() &&
@@ -357,23 +595,25 @@ AlgoChooser<Opr>::ExeContext::profile_single_algo(ImplAlgo algo,
src.to_string().c_str()); src.to_string().c_str());
param.dtypes[i] = src.dtype.enumv(); param.dtypes[i] = src.dtype.enumv();
} }
param.comp_node_loc = m_mgb_opr->output(0)->comp_node().locator();
param.comp_node_loc = m_cn.locator();
mgb_assert(param.shapes.size() == m_layouts.size()); mgb_assert(param.shapes.size() == m_layouts.size());
for (size_t i = 0; i < param.shapes.size(); ++i) for (size_t i = 0; i < param.shapes.size(); ++i)
param.shapes[i] = m_layouts[i]; param.shapes[i] = m_layouts[i];
param.opr_param = m_megdnn_opr->param(); param.opr_param = m_megdnn_opr->param();
param.allow_weight_preprocess = m_allow_weight_preprocess; param.allow_weight_preprocess = m_allow_weight_preprocess;


Algorithm* palgo = m_megdnn_opr->get_algorithm_from_desc(policy.algo);
mgb_assert(palgo, "Unknown algo description");
auto rst = TimedProfiler<Opr>::profile(param, timeout); auto rst = TimedProfiler<Opr>::profile(param, timeout);
// MIOpen conv profiles all available algos when a specfic shape is // MIOpen conv profiles all available algos when a specfic shape is
// provided for the first time, which probably adds to the result time. // provided for the first time, which probably adds to the result time.
// Therefore, a second profile execution is needed. // Therefore, a second profile execution is needed.
if (strncmp(name, "MIOpen", 6) == 0)
if (strncmp(palgo->name(), "MIOpen", 6) == 0)
rst = TimedProfiler<Opr>::profile(param, timeout); rst = TimedProfiler<Opr>::profile(param, timeout);
if (!rst.valid()) if (!rst.valid())
return None; return None;
return AlgoChooserProfileCache::ResultEntry{ return AlgoChooserProfileCache::ResultEntry{
algo.name.c_str(), algo.is_reproducible, rst.val().time,
palgo->name(), palgo->is_reproducible(), rst.val().time,
param.workspace}; param.workspace};
} }


@@ -414,21 +654,34 @@ AlgoChooser<Opr>::ExeContext::construct_fake_preprocess_filter() const {
} }


#define INST(Opr) \ #define INST(Opr) \
template typename AlgoChooser<megdnn::Opr>::ImplAlgo \
template AlgoChooser<megdnn::Opr>::ExeContext::ExeContext( \
const FixedTensorLayouts& layouts, megdnn::Opr* megdnn_opr, \
const std::string& param_str, const cg::OperatorNodeBase* mgb_opr, \
const CompNode& cn, \
const megdnn::param::ExecutionPolicy& execution_policy, \
bool allow_weight_preprocess); \
template typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy \
AlgoChooser<megdnn::Opr>::ExeContext::choose_by_heuristic( \ AlgoChooser<megdnn::Opr>::ExeContext::choose_by_heuristic( \
bool reproducible) const; \ bool reproducible) const; \
template typename AlgoChooser<megdnn::Opr>::ImplAlgo \
AlgoChooser<megdnn::Opr>::ExeContext::get_profile_result_from_cache( \
bool require_reproducible) const; \
template std::vector<typename AlgoChooser<megdnn::Opr>::ImplAlgo> \ template std::vector<typename AlgoChooser<megdnn::Opr>::ImplAlgo> \
AlgoChooser<megdnn::Opr>::ExeContext::get_all_candidates() const; \ AlgoChooser<megdnn::Opr>::ExeContext::get_all_candidates() const; \
template std::vector<typename AlgoChooser<megdnn::Opr>::ImplAlgo> \
AlgoChooser<megdnn::Opr>::ExeContext:: \
get_all_candidates_with_workspace_limit() const; \
template size_t \ template size_t \
AlgoChooser<megdnn::Opr>::ExeContext::get_workspace_size_bytes( \ AlgoChooser<megdnn::Opr>::ExeContext::get_workspace_size_bytes( \
typename AlgoChooser<megdnn::Opr>::ImplAlgo algo) const; \
const typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy& \
policy) const; \
template void AlgoChooser<megdnn::Opr>::ExeContext:: \
construct_execution_policy_from_cache( \
bool require_reproducible, \
typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy& \
policy) const; \
template Maybe<AlgoChooserProfileCache::ResultEntry> \ template Maybe<AlgoChooserProfileCache::ResultEntry> \
AlgoChooser<megdnn::Opr>::ExeContext::profile_single_algo( \ AlgoChooser<megdnn::Opr>::ExeContext::profile_single_algo( \
typename AlgoChooser<megdnn::Opr>::ImplAlgo algo, double& timeout) \
const; \
const typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy& \
policy, \
double& timeout) const;


MGB_FOREACH_FASTRUN_OPR(INST) MGB_FOREACH_FASTRUN_OPR(INST)




+ 91
- 13
src/opr/impl/search_policy/profiler.cpp View File

@@ -14,6 +14,8 @@


#include "../internal/invoke.h" #include "../internal/invoke.h"
#include "../internal/megdnn_opr_wrapper.inl" #include "../internal/megdnn_opr_wrapper.inl"
#include "megdnn/handle.h"
#include "megdnn/oprs/base.h"


#if MGB_ROCM #if MGB_ROCM
#include "hcc_detail/hcc_defs_prologue.h" #include "hcc_detail/hcc_defs_prologue.h"
@@ -32,12 +34,96 @@ MIDOUT_DECL(megbrain_opr_profile)
} \ } \
MIDOUT_END(); MIDOUT_END();


namespace {
std::string serialize_policy(const megdnn::ExecutionPolicy& policy) {
std::string ret;
//! serialize AlgorithmDesc
megdnn::Algorithm::serialize_write_pod(policy.algo.handle_type, ret);
megdnn::Algorithm::serialize_write_pod(policy.algo.type, ret);
uint32_t param_size = policy.algo.param.size();
megdnn::Algorithm::serialize_write_pod<uint32_t>(param_size, ret);
ret += policy.algo.param;

//! serialize sub_policy
uint32_t size = policy.sub_policy.size();
megdnn::Algorithm::serialize_write_pod(size, ret);
for (auto&& sub : policy.sub_policy) {
ret += serialize_policy(sub);
}
return ret;
}

megdnn::ExecutionPolicy deserialize_policy(const char* buf, uint32_t size,
uint32_t& offset) {
megdnn::ExecutionPolicy ret;
#define cb(_val, _type) \
_val = megdnn::Algorithm::deserialize_read_pod<_type>(buf, offset); \
offset += sizeof(_val)

cb(ret.algo.handle_type, megdnn::Handle::HandleType);
cb(ret.algo.type, uint32_t);

uint32_t param_size = 0;
cb(param_size, uint32_t);
if (param_size > 0) {
ret.algo.param = std::string(buf + offset, param_size);
offset += param_size;
}

uint32_t nr_policy = 0;
cb(nr_policy, uint32_t);
#undef cb

for (uint32_t i = 0; i < nr_policy; i++) {
ret.sub_policy.push_back(deserialize_policy(buf, size, offset));
}
return ret;
}
}

namespace mgb { namespace mgb {
namespace opr { namespace opr {
#define APPLY(statement, ...) \ #define APPLY(statement, ...) \
mgb::apply([&](const auto&... args) { return statement; }, \ mgb::apply([&](const auto&... args) { return statement; }, \
std::tuple_cat(__VA_ARGS__)) std::tuple_cat(__VA_ARGS__))


////////////// TimedProfiler::Param::ExecutionPolicyBlob //////////////////////

template <typename Opr>
typename TimedProfiler<Opr>::Param::ExecutionPolicyBlob
TimedProfiler<Opr>::Param::ExecutionPolicyBlob::serialize(
const megdnn::ExecutionPolicy& policy) {
ExecutionPolicyBlob ret;
std::string serialize_bin = serialize_policy(policy);
mgb_assert(serialize_bin.size() < MAX_SIZE_IN_BYTES);
memcpy(ret.data, serialize_bin.data(), serialize_bin.size());
ret.size = serialize_bin.size();
return ret;
}

template <typename Opr>
megdnn::ExecutionPolicy
TimedProfiler<Opr>::Param::ExecutionPolicyBlob::deserialize() const {
uint32_t offset = 0;
auto&& ret = deserialize_policy(data, size, offset);
mgb_assert(offset == size);
return std::move(ret);
}

#define INST(Opr) \
template typename TimedProfiler<megdnn::Opr>::Param::ExecutionPolicyBlob \
TimedProfiler<megdnn::Opr>::Param::ExecutionPolicyBlob::serialize( \
const megdnn::ExecutionPolicy& policy); \
template megdnn::ExecutionPolicy \
TimedProfiler<megdnn::Opr>::Param::ExecutionPolicyBlob::deserialize() \
const;

MGB_FOREACH_FASTRUN_OPR(INST)
#undef INST


////////////////// TimedProfiler //////////////////////////////

template <typename Opr> template <typename Opr>
const double TimedProfiler<Opr>::timeout_setting = const double TimedProfiler<Opr>::timeout_setting =
TimedProfiler<Opr>::init_timeout_setting(); TimedProfiler<Opr>::init_timeout_setting();
@@ -99,18 +185,7 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
} }


megdnn_opr->param() = param.opr_param; megdnn_opr->param() = param.opr_param;
{
typename Opr::AlgorithmInfo algo;
for (auto i :
APPLY(megdnn_opr->get_all_algorithms_info(args...), layouts)) {
if (!strcmp(i.name.c_str(), param.algo_name)) {
algo = i;
break;
}
}
mgb_assert(algo.valid(), "algorithm %s not found", param.algo_name);
megdnn_opr->execution_policy() = {algo};
}
megdnn_opr->execution_policy() = param.execution_policy.deserialize();


// Allocate preprocessed weight buffers. // Allocate preprocessed weight buffers.
TensorLayoutArray preprocessed_layout; TensorLayoutArray preprocessed_layout;
@@ -222,13 +297,16 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
}); });
ev_end->record(); ev_end->record();


megdnn::Algorithm* algo = megdnn_opr->get_algorithm_from_desc(
megdnn_opr->execution_policy().algo);
mgb_assert(algo);
double next_report_time = 0.5; double next_report_time = 0.5;
while (!ev_end->finished()) { while (!ev_end->finished()) {
if (timer.get_secs() >= next_report_time) { if (timer.get_secs() >= next_report_time) {
mgb_log_warn( mgb_log_warn(
"profiling conv algo %s already took %.3f/%.3f secs" "profiling conv algo %s already took %.3f/%.3f secs"
" (limit can be set by MGB_CONV_PROFILING_TIMEOUT) ", " (limit can be set by MGB_CONV_PROFILING_TIMEOUT) ",
param.algo_name, timer.get_secs(), param.actual_timeout);
algo->name(), timer.get_secs(), param.actual_timeout);
next_report_time = timer.get_secs() + 1; next_report_time = timer.get_secs() + 1;
} }
using namespace std::literals; using namespace std::literals;


+ 2
- 2
src/opr/include/megbrain/opr/blas.h View File

@@ -46,7 +46,7 @@ private:
static bool check_layout(const TensorLayout& layout, int transpose); static bool check_layout(const TensorLayout& layout, int transpose);


//! store the policy of all transpose situations //! store the policy of all transpose situations
megdnn::MatrixMul::ExecutionPolicy m_cadidate_execution_policies[4];
megdnn::ExecutionPolicy m_cadidate_execution_policies[4];
}; };


/*! /*!
@@ -76,7 +76,7 @@ private:


static bool check_layout(const TensorLayout& layout, bool transpose); static bool check_layout(const TensorLayout& layout, bool transpose);
//! store the policy of all transpose situations //! store the policy of all transpose situations
megdnn::BatchedMatrixMul::ExecutionPolicy m_cadidate_execution_policies[4];
megdnn::ExecutionPolicy m_cadidate_execution_policies[4];
}; };


/*! /*!


+ 85
- 37
src/opr/include/megbrain/opr/search_policy/algo_chooser.h View File

@@ -12,9 +12,14 @@


#pragma once #pragma once


#include <memory>
#include "megbrain/graph/cg.h"
#include "megbrain/graph/operator_node.h"
#include "megbrain/opr/search_policy/algo_chooser_helper.h"
#include "megbrain/opr/search_policy/profiler.h" #include "megbrain/opr/search_policy/profiler.h"
#include "megbrain/opr/dnn/convolution.h" #include "megbrain/opr/dnn/convolution.h"
#include "megbrain/opr/blas.h" #include "megbrain/opr/blas.h"
#include "megdnn/oprs/base.h"


template <class MegDNNOpr> template <class MegDNNOpr>
struct MegDNNOpr2MGBOpr; struct MegDNNOpr2MGBOpr;
@@ -49,52 +54,64 @@ class AlgoChooser {
static constexpr int arity = OprArityTrait<Opr>::arity; static constexpr int arity = OprArityTrait<Opr>::arity;


using ImplAlgo = typename Opr::AlgorithmInfo; using ImplAlgo = typename Opr::AlgorithmInfo;
using ImplExecutionPolicy = megdnn::ExecutionPolicy;
using MGBOpr = typename MegDNNOpr2MGBOpr<Opr>::MGBOpr; using MGBOpr = typename MegDNNOpr2MGBOpr<Opr>::MGBOpr;
using TensorLayoutArray = std::array<TensorLayout, arity>;


public:
using FixedTensorLayouts = std::array<TensorLayout, arity>;
class ExeContext { class ExeContext {
const TensorLayoutArray& m_layouts;
FixedTensorLayouts m_layouts;
Opr* m_megdnn_opr; Opr* m_megdnn_opr;
const MGBOpr* m_mgb_opr;
std::string m_param;
const cg::OperatorNodeBase* m_base_mgb_opr;
CompNode m_cn;
megdnn::param::ExecutionPolicy m_execution_policy;
bool m_allow_weight_preprocess; bool m_allow_weight_preprocess;


public: public:
ExeContext(const TensorLayoutArray& layouts, Opr* megdnn_opr,
const MGBOpr* mgb_opr, bool allow_weight_preprocess)
: m_layouts{layouts},
m_megdnn_opr{megdnn_opr},
m_mgb_opr{mgb_opr},
m_allow_weight_preprocess{allow_weight_preprocess} {
mgb_assert(m_layouts.size() == layouts.size());
static_assert(
std::tuple_size<TensorLayoutArray>::value == 3 ||
std::tuple_size<TensorLayoutArray>::value == 5 ||
std::tuple_size<TensorLayoutArray>::value == 8,
"Convolution AlgoChooser assumes arity = 3 , 5 or 8 (for "
"deformable conv)");
}
ExeContext(const FixedTensorLayouts& layouts, Opr* megdnn_opr,
const std::string& param_str,
const cg::OperatorNodeBase* mgb_opr, const CompNode& cn,
const megdnn::param::ExecutionPolicy& execution_policy,
bool allow_weight_preprocess);


Opr* megdnn_opr() const { return m_megdnn_opr; } Opr* megdnn_opr() const { return m_megdnn_opr; }


const MGBOpr* mgb_opr() const { return m_mgb_opr; }

const TensorLayout& inp_layout(size_t idx) const { const TensorLayout& inp_layout(size_t idx) const {
return m_layouts[idx]; return m_layouts[idx];
} }


const TensorLayoutArray& layouts() const { return m_layouts; }
cg::ComputingGraph* owner_graph() const {
return m_base_mgb_opr->owner_graph();
}
const cg::OperatorNodeBase* mgb_opr() const { return m_base_mgb_opr; }
const megdnn::param::ExecutionPolicy& execution_policy() const {
return m_execution_policy;
}
CompNode comp_node() const { return m_cn; }
const std::string& param() const { return m_param; }

bool allow_weight_preprocess() const {
return m_allow_weight_preprocess;
}

megdnn::Algorithm* get_algorithm_from_desc(
const megdnn::Algorithm::Info::Desc& desc) const {
return m_megdnn_opr->get_algorithm_from_desc(desc);
}

const FixedTensorLayouts& layouts() const { return m_layouts; }


ImplAlgo choose_by_heuristic(bool reproducible = false) const;
ImplExecutionPolicy choose_by_heuristic(
bool reproducible = false) const;


//! get all candidate algos, and the one choose_by_heuristic() is //! get all candidate algos, and the one choose_by_heuristic() is
//! put first //! put first
std::vector<ImplAlgo> get_all_candidates() const; std::vector<ImplAlgo> get_all_candidates() const;


//! get candidate algos with workspace limit.
std::vector<ImplAlgo> get_all_candidates_with_workspace_limit() const;

//! get workspace size required for specific algo
size_t get_workspace_size_bytes(ImplAlgo algo) const;
//! get workspace size required for specific execution policy
size_t get_workspace_size_bytes(
const ImplExecutionPolicy& policy) const;


/*! /*!
* \brief profile a single algorithm * \brief profile a single algorithm
@@ -106,28 +123,59 @@ class AlgoChooser {
* timeout used during profiling * timeout used during profiling
*/ */
Maybe<AlgoChooserProfileCache::ResultEntry> profile_single_algo( Maybe<AlgoChooserProfileCache::ResultEntry> profile_single_algo(
ImplAlgo algo, double& timeout) const;
const ImplExecutionPolicy& policy, double& timeout) const;

//! get all profile algorithm from cache, return invalid if not exists
ImplAlgo get_profile_result_from_cache(bool require_reproducible) const;

/**
* \brief construct execution policy from cache.
*
* \param require_reproducible select algo which is reproducible
* \param policy execution policy
*/
void construct_execution_policy_from_cache(
bool require_reproducible, ImplExecutionPolicy& policy) const;


private: private:
Maybe<PreprocessFilter<Opr>> construct_fake_preprocess_filter() const; Maybe<PreprocessFilter<Opr>> construct_fake_preprocess_filter() const;
}; };


//! entrance for getting algorithm according to execution strategy
static ImplAlgo get_algo(ExeContext& ctx);

//! get all profile result, either by retrieving cache or profiling
static AlgoChooserProfileCache::Result get_profile_result(
ExeContext& ctx, bool enable_update);
template<typename U>
friend class AlgoChooser;


static ImplAlgo choose_by_profile(ExeContext& ctx,
bool require_reproducible,
bool enable_update = true);
private:
//! entrance for getting algorithm according to execution strategy
static ImplExecutionPolicy get_policy(ExeContext& ctx);


//! profile and save to cache
static void profile(ExeContext& ctx, bool require_reproducible);

static ImplExecutionPolicy choose_by_profile(ExeContext& ctx,
bool require_reproducible,
bool enable_update = true);

/**
* flatten search space in postorder traversal
* The subopr search construct a search tree
*
* A
* / \
* B1B2 C
* / \
* D1D2D3 E
* We use postorder traverse the search tree.
* D1 -> D2 -> D3 -> E -> B1 -> B2 -> C -> A
*/
static std::vector<megdnn::Algorithm::SearchItem> flatten_search_space(
const ExeContext& ctx);


public: public:
/*! /*!
* \brief setup algorithm and return workspace size * \brief setup algorithm and return workspace size
*/ */
static size_t setup_algo(const TensorLayoutArray& layouts, Opr* megdnn_opr,
static size_t setup_algo(const FixedTensorLayouts& layouts, Opr* megdnn_opr,
const MGBOpr* mgb_opr, const MGBOpr* mgb_opr,
bool allow_weight_preprocess = false); bool allow_weight_preprocess = false);
}; };


+ 2
- 2
src/opr/include/megbrain/opr/search_policy/algo_chooser_helper.h View File

@@ -28,9 +28,9 @@ namespace mixin {
class AlgoChooserHelper : cg::OperatorNodeMixinBase { class AlgoChooserHelper : cg::OperatorNodeMixinBase {
public: public:
using ExecutionPolicy = megdnn::param::ExecutionPolicy; using ExecutionPolicy = megdnn::param::ExecutionPolicy;
using AlgorithmInfo = megdnn::detail::Algorithm::Info;
using AlgorithmPolicy = megdnn::ExecutionPolicy;
using AlgoChooserHook = using AlgoChooserHook =
std::function<AlgorithmInfo(const cg::OperatorNodeBase*)>;
std::function<AlgorithmPolicy(const cg::OperatorNodeBase*)>;


const ExecutionPolicy& execution_policy() const { const ExecutionPolicy& execution_policy() const {
if (!m_policy_accessed) { if (!m_policy_accessed) {


+ 12
- 1
src/opr/include/megbrain/opr/search_policy/profiler.h View File

@@ -18,6 +18,7 @@
#include "megbrain/comp_node.h" #include "megbrain/comp_node.h"


#include "megdnn/basic_types.h" #include "megdnn/basic_types.h"
#include "megdnn/oprs/base.h"
#include "megdnn/oprs/linalg.h" #include "megdnn/oprs/linalg.h"
#include "megdnn/oprs/nn.h" #include "megdnn/oprs/nn.h"


@@ -139,7 +140,17 @@ class TimedProfiler {


public: public:
struct Param { struct Param {
char algo_name[128];
struct ExecutionPolicyBlob {
//! enlarge the max size if needed
constexpr static size_t MAX_SIZE_IN_BYTES = 10240;
char data[MAX_SIZE_IN_BYTES];
uint32_t size;

static ExecutionPolicyBlob serialize(
const megdnn::ExecutionPolicy& policy);
megdnn::ExecutionPolicy deserialize() const;
};
ExecutionPolicyBlob execution_policy;
size_t workspace; size_t workspace;
megdnn::DTypeEnum dtypes[arity]; megdnn::DTypeEnum dtypes[arity];
CompNode::Locator comp_node_loc; CompNode::Locator comp_node_loc;


+ 190
- 65
src/opr/test/dnn/convolution.cpp View File

@@ -20,11 +20,13 @@
#include "megbrain/opr/basic_arith.h" #include "megbrain/opr/basic_arith.h"
#include "megbrain/gopt/inference.h" #include "megbrain/gopt/inference.h"
#include "megbrain/opr/tensor_manip.h" #include "megbrain/opr/tensor_manip.h"
#include "megdnn/dtype.h"
#include "megdnn/oprs/base.h" #include "megdnn/oprs/base.h"


#include <gmock/gmock.h> #include <gmock/gmock.h>


#include <cmath> #include <cmath>
#include <memory>
#include <random> #include <random>


using namespace mgb; using namespace mgb;
@@ -37,6 +39,73 @@ using Mode = Param::Mode;


Mode modes_to_check[] = {Mode::CONVOLUTION, Mode::CROSS_CORRELATION}; Mode modes_to_check[] = {Mode::CONVOLUTION, Mode::CROSS_CORRELATION};


void conv_bwd_data_brute(const std::vector<std::shared_ptr<HostTensorND>>& inps,
std::shared_ptr<HostTensorND>& dest,
const opr::ConvolutionBackwardData::Param& param) {
mgb_assert(param.format == Param::Format::NCHW);
auto &&data = *inps[0], &&filter = *inps[1];
size_t N = data.shape(0), IH = data.shape(2), IW = data.shape(3);
size_t GROUP, ICPG, OCPG, FH, FW;

if (param.sparse == Param::Sparse::DENSE) {
GROUP = 1, ICPG = filter.shape(0), OCPG = filter.shape(1),
FH = filter.shape(2), FW = filter.shape(3);
} else {
mgb_assert(param.sparse == Param::Sparse::GROUP);
GROUP = filter.shape(0), ICPG = filter.shape(1), OCPG = filter.shape(2),
FH = filter.shape(3), FW = filter.shape(4);
}
auto get_shp = [](size_t inp, size_t filter, size_t stride, size_t pad,
size_t dilate) {
return (inp - 1) * stride + (filter - 1) * dilate + 1 - pad * 2;
};
size_t OH = get_shp(IH, FH, param.stride_h, param.pad_h, param.dilate_h),
OW = get_shp(IW, FW, param.stride_w, param.pad_w, param.dilate_w);
dest = std::make_shared<HostTensorND>(CompNode::load("xpu0"),
TensorShape{N, OCPG * GROUP, OH, OW});
auto&& out = *dest;
auto fptr = filter.ptr<float>(), dptr = data.ptr<float>(),
optr = out.ptr<float>();
memset(optr, 0, sizeof(float) * out.shape().total_nr_elems());
auto ol = out.layout(), fl = filter.layout();

#define FOR2(a, A, b, B) \
for (size_t a = 0; a < A; ++a) \
for (size_t b = 0; b < B; ++b)
#define FOR3(a, A, b, B, c, C) \
FOR2(a, A, b, B) \
for (size_t c = 0; c < C; ++c)

FOR3(n, N, group, GROUP, icg, ICPG)
FOR2(ih, IH, iw, IW) {
float scale = *(dptr++);

FOR3(ocg, OCPG, fh, FH, fw, FW) {
auto oc_tot = group * OCPG + ocg;
int oh = int(ih * param.stride_h + fh * param.dilate_h) -
int(param.pad_h),
ow = int(iw * param.stride_w + fw * param.dilate_w) -
int(param.pad_w);
if (oh >= 0 && ow >= 0 && oh < static_cast<int>(OH) &&
ow < static_cast<int>(OW)) {
auto out_off = n * ol.stride[0] + oc_tot * ol.stride[1] +
oh * ol.stride[2] + ow;
size_t flt_off = 0;
if (param.sparse == Param::Convolution::Sparse::DENSE) {
flt_off = icg * fl.stride[0] +
ocg * fl.stride[1] + fh * fl.stride[2] + fw;
} else {
flt_off = group * fl.stride[0] + icg * fl.stride[1] +
ocg * fl.stride[2] + fh * fl.stride[3] + fw;
}
optr[out_off] += scale * fptr[flt_off];
}
}
}
#undef FOR3
#undef FOR2
}

void conv_bwd_flt_brute(const std::vector<std::shared_ptr<HostTensorND>>& inps, void conv_bwd_flt_brute(const std::vector<std::shared_ptr<HostTensorND>>& inps,
std::shared_ptr<HostTensorND>& out, std::shared_ptr<HostTensorND>& out,
const opr::ConvolutionBackwardFilter::Param& param) { const opr::ConvolutionBackwardFilter::Param& param) {
@@ -370,7 +439,8 @@ TEST(TestOprDNN, ConvolutionExePolicy) {
PersistentCacheHook cache_hook{on_get}; PersistentCacheHook cache_hook{on_get};


#if MGB_ENABLE_FASTRUN #if MGB_ENABLE_FASTRUN
for (auto strategy: {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE, S::PROFILE_HEURISTIC}) {
for (auto strategy : {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE,
S::PROFILE_HEURISTIC}) {
#else #else
for (auto strategy: {S:HEURISTIC, S::PROFILE_HEURISTIC}) { for (auto strategy: {S:HEURISTIC, S::PROFILE_HEURISTIC}) {
#endif #endif
@@ -406,6 +476,95 @@ TEST(TestOprDNN, ConvolutionExePolicy) {
} }
} }


TEST(TestOprDNN, ConvolutionBackwardDataBfloat16ExePolicy) {
REQUIRE_GPU(1);
Param param{Mode::CROSS_CORRELATION, 1, 1, 1, 1};
param.compute_mode = Param::ComputeMode::FLOAT32;
using Policy = opr::Convolution::ExecutionPolicy;
using S = Policy::Strategy;

auto gen_bfp16 = [](HostTensorND& dest) {
RNGxorshf rng{next_rand_seed()};
auto rand_real = [&rng]() {
std::uniform_real_distribution<float> dist(-1, 1);
return dist(rng);
};
auto ptr = dest.ptr<dt_bfloat16>();
size_t elems = dest.shape().total_nr_elems();
for (size_t i = 0; i < elems; i++) {
ptr[i] = dt_bfloat16(rand_real());
}
};

auto f32_to_bf16 = [](const std::shared_ptr<HostTensorND>& src)
-> std::shared_ptr<HostTensorND> {
auto ret = std::make_shared<HostTensorND>(
src->comp_node(), src->shape(), dtype::BFloat16{});
for (size_t i = 0; i < src->layout().total_nr_elems(); i++) {
ret->ptr<dt_bfloat16>()[i] = src->ptr<dt_float32>()[i];
}
return ret;
};

auto bf16_to_f32 = [](const std::shared_ptr<HostTensorND>& src)
-> std::shared_ptr<HostTensorND> {
auto ret = std::make_shared<HostTensorND>(
src->comp_node(), src->shape(), dtype::Float32{});
for (size_t i = 0; i < src->layout().total_nr_elems(); i++) {
ret->ptr<dt_float32>()[i] = src->ptr<dt_bfloat16>()[i];
}
return ret;
};

int nr_get = 0;
auto on_get = [&nr_get](const std::string&, const void*, size_t,
const void*, size_t) { ++nr_get; };
PersistentCacheHook cache_hook{on_get};

#if MGB_ENABLE_FASTRUN
for (auto strategy : {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE,
S::PROFILE_HEURISTIC}) {
#else
for (auto strategy: {S:HEURISTIC, S::PROFILE_HEURISTIC}) {
#endif
using Checker = AutoOprChecker<2, 1>;

auto make_graph = [&](const Checker::SymInpArray& inputs)
-> Checker::SymOutArray {
Policy policy;
policy.strategy = strategy;
return {opr::ConvolutionBackwardData::make_deconv(
inputs[0], inputs[1], param, policy)};
};

auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
std::shared_ptr<HostTensorND> out;
conv_bwd_data_brute(
{bf16_to_f32(inp[0]), bf16_to_f32(inp[1])}, out,
param);
dest[0] = *f32_to_bf16(out);
};

Checker::RunOptions opt;
opt.outputs_max_err = 1e-3;
nr_get = 0;
Checker(make_graph, fwd)
.disable_grad_check()
.set_input_dtype(0, dtype::BFloat16{})
.set_input_dtype(1, dtype::BFloat16{})
.set_input_generator(0, gen_bfp16)
.set_input_generator(1, gen_bfp16)
.run({TensorShape{3, 4, 10, 6}, {4, 2, 3, 3}}, opt)
.run({TensorShape{2, 2, 4, 3}, {2, 2, 3, 3}}, opt)
.run({TensorShape{1, 3, 10, 6}, {3, 2, 3, 3}}, opt);
if (strategy == S::HEURISTIC) {
ASSERT_EQ(0, nr_get);
} else {
ASSERT_LT(0, nr_get);
}
}
}

TEST(TestOprDNN, Deconvolution) { TEST(TestOprDNN, Deconvolution) {
// dilated grouped deconv // dilated grouped deconv
using Checker = AutoOprChecker<2, 1>; using Checker = AutoOprChecker<2, 1>;
@@ -420,55 +579,9 @@ TEST(TestOprDNN, Deconvolution) {
}; };


auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) { auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
auto &&data = *inp[0], &&filter = *inp[1];
size_t N = data.shape(0), IH = data.shape(2), IW = data.shape(3);
size_t GROUP = filter.shape(0), ICPG = filter.shape(1),
OCPG = filter.shape(2), FH = filter.shape(3),
FW = filter.shape(4);
auto get_shp = [](size_t inp, size_t filter, size_t stride, size_t pad,
size_t dilate) {
return (inp - 1) * stride + (filter - 1) * dilate + 1 - pad * 2;
};
auto &&out = dest[0];
size_t OH = get_shp(IH, FH, param.stride_h, param.pad_h,
param.dilate_h),
OW = get_shp(IW, FW, param.stride_w, param.pad_w,
param.dilate_w);
out.resize({N, OCPG * GROUP, OH, OW});
auto fptr = filter.ptr<float>(), dptr = data.ptr<float>(),
optr = out.ptr<float>();
memset(optr, 0, sizeof(float) * out.shape().total_nr_elems());
auto ol = out.layout(), fl = filter.layout();

#define FOR2(a, A, b, B) \
for (size_t a = 0; a < A; ++a) \
for (size_t b = 0; b < B; ++b)
#define FOR3(a, A, b, B, c, C) \
FOR2(a, A, b, B) \
for (size_t c = 0; c < C; ++c)

FOR3(n, N, group, GROUP, icg, ICPG)
FOR2(ih, IH, iw, IW) {
float scale = *(dptr++);

FOR3(ocg, OCPG, fh, FH, fw, FW) {
auto oc_tot = group * OCPG + ocg;
int oh = int(ih * param.stride_h + fh * param.dilate_h) -
int(param.pad_h),
ow = int(iw * param.stride_w + fw * param.dilate_w) -
int(param.pad_w);
if (oh >= 0 && ow >= 0 && oh < static_cast<int>(OH) &&
ow < static_cast<int>(OW)) {
auto out_off = n * ol.stride[0] + oc_tot * ol.stride[1] +
oh * ol.stride[2] + ow,
flt_off = group * fl.stride[0] + icg * fl.stride[1] +
ocg * fl.stride[2] + fh * fl.stride[3] + fw;
optr[out_off] += scale * fptr[flt_off];
}
}
}
#undef FOR3
#undef FOR2
std::shared_ptr<HostTensorND> out;
conv_bwd_data_brute({inp[0], inp[1]}, out, param);
dest[0] = *out;
}; };


Checker::RunOptions opt; Checker::RunOptions opt;
@@ -1547,7 +1660,8 @@ TEST(TestOprDNN, LocalShareForwardExecPolicy) {
PersistentCacheHook cache_hook{on_get}; PersistentCacheHook cache_hook{on_get};


#if MGB_ENABLE_FASTRUN #if MGB_ENABLE_FASTRUN
for (auto strategy: {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE, S::PROFILE_HEURISTIC}) {
for (auto strategy : {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE,
S::PROFILE_HEURISTIC}) {
#else #else
for (auto strategy: {S:HEURISTIC, S::PROFILE_HEURISTIC}) { for (auto strategy: {S:HEURISTIC, S::PROFILE_HEURISTIC}) {
#endif #endif
@@ -2004,29 +2118,34 @@ TEST(TestOprDNN, HeuristicReproducible) {
.run(inp_tensor(1, 5, 3, 7, 9, 3, 3), opt) .run(inp_tensor(1, 5, 3, 7, 9, 3, 3), opt)
.run(inp_tensor(3, 4, 4, 9, 9, 3, 3), opt); .run(inp_tensor(3, 4, 4, 9, 9, 3, 3), opt);


auto algo = static_cast<megdnn::ConvolutionBackwardFilter*>(
auto&& megdnn_opr = static_cast<megdnn::ConvolutionBackwardFilter*>(
static_cast<opr::ConvolutionBackwardFilter*>( static_cast<opr::ConvolutionBackwardFilter*>(
bwd_flt->owner_opr()) bwd_flt->owner_opr())
->megdnn_opr())
->execution_policy()
.algo;
->megdnn_opr());
auto&& algo = megdnn_opr->execution_policy().algo;
megdnn::Algorithm* palgo =
megdnn_opr->get_algorithm_from_desc(algo);
mgb_assert(palgo, "Unknown algo description");
if (strategy == S::HEURISTIC_REPRODUCIBLE) { if (strategy == S::HEURISTIC_REPRODUCIBLE) {
EXPECT_TRUE(algo.is_reproducible);
EXPECT_TRUE(palgo->is_reproducible());
} }
algo_name0 = algo.name.c_str();
algo_name0 = palgo->name();
} }
{ {
Checker checker(make_graph, fwd); Checker checker(make_graph, fwd);
checker.run(inp_tensor(2, 3, 4, 9, 8, 3, 3), opt) checker.run(inp_tensor(2, 3, 4, 9, 8, 3, 3), opt)
.run(inp_tensor(1, 5, 3, 7, 9, 3, 3), opt) .run(inp_tensor(1, 5, 3, 7, 9, 3, 3), opt)
.run(inp_tensor(3, 4, 4, 9, 9, 3, 3), opt); .run(inp_tensor(3, 4, 4, 9, 9, 3, 3), opt);
auto algo = static_cast<megdnn::ConvolutionBackwardFilter*>(
static_cast<opr::ConvolutionBackwardFilter*>(
bwd_flt->owner_opr())
->megdnn_opr())
->execution_policy()
.algo;
algo_name1 = algo.name.c_str();
auto&& megdnn_opr = static_cast<megdnn::ConvolutionBackwardFilter*>(
static_cast<opr::ConvolutionBackwardFilter*>(
bwd_flt->owner_opr())
->megdnn_opr());
auto&& algo = megdnn_opr->execution_policy().algo;
megdnn::Algorithm* palgo =
megdnn_opr->get_algorithm_from_desc(algo);
mgb_assert(palgo, "Unknown algo description");

algo_name1 = palgo->name();
} }
EXPECT_TRUE(algo_name0 == algo_name1); EXPECT_TRUE(algo_name0 == algo_name1);
} }
@@ -2286,6 +2405,8 @@ TEST_F(TestWeightPreprocess, NoPreprocessNeeded) {
MockAlgorithm algo; MockAlgorithm algo;
EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _)) EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _))
.WillRepeatedly(Return(&algo)); .WillRepeatedly(Return(&algo));
EXPECT_CALL(mock, get_algorithm_from_desc(_))
.WillRepeatedly(Return(&algo));
EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _)) EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _))
.WillRepeatedly(Return(0)); .WillRepeatedly(Return(0));
EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _)) EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _))
@@ -2318,6 +2439,9 @@ TEST_F(TestWeightPreprocess, PreprocessCalledOnlyOnce) {
EXPECT_CALL(mock, deduce_preprocessed_filter_layout(_, _, _)) EXPECT_CALL(mock, deduce_preprocessed_filter_layout(_, _, _))
.WillRepeatedly(Return(filter_layout)); .WillRepeatedly(Return(filter_layout));


EXPECT_CALL(mock, get_algorithm_from_desc(_))
.WillRepeatedly(Return(&algo));

Expectation algo_call = Expectation algo_call =
EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _)) EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _))
.WillOnce(Return(&algo)); .WillOnce(Return(&algo));
@@ -2349,7 +2473,6 @@ TEST_F(TestWeightPreprocess, PreprocessCalledOnlyOnce) {
pf->tensors[0].ptr<float>()[0] = 114.514f; pf->tensors[0].ptr<float>()[0] = 114.514f;
pf->tensors[1].ptr<float>()[0] = 1926.0817f; pf->tensors[1].ptr<float>()[0] = 1926.0817f;
})); }));

// Run the graph multiple times. // Run the graph multiple times.
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
if (i > 0) { if (i > 0) {
@@ -2381,6 +2504,8 @@ TEST_F(TestNoWeightPreprocess, NoPreprocess) {
MockAlgorithm algo; MockAlgorithm algo;
EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _)) EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _))
.WillRepeatedly(Return(&algo)); .WillRepeatedly(Return(&algo));
EXPECT_CALL(mock, get_algorithm_from_desc(_))
.WillRepeatedly(Return(&algo));
EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _)) EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _))
.WillRepeatedly(Return(0)); .WillRepeatedly(Return(0));
EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _)) EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _))


+ 49
- 49
src/plugin/test/opr_io_dump_text_out.h View File

@@ -16,157 +16,157 @@ namespace {
const char* EXPECTED_TEXT_OUT_REC[3] = { const char* EXPECTED_TEXT_OUT_REC[3] = {
// rec level 0 // rec level 0
R"OUTPUT( R"OUTPUT(
var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2
var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2
deps: deps:
val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s
var1 produced: name=var1 layout={2(3),3(1)} owner_opr=opr0{Host2DeviceCopy} opr0
var1 produced: name=var1 layout={2(3),3(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0
deps: deps:
val: [2.352, 0.1114, -0.2721, 0.7569, -0.2438, ...]min=-0.272 max=2.35 mean=0.471 l2=1.02 sd=0.994 s val: [2.352, 0.1114, -0.2721, 0.7569, -0.2438, ...]min=-0.272 max=2.35 mean=0.471 l2=1.02 sd=0.994 s
var17 produced: name=var17 layout={2(3),3(1)} owner_opr=opr16{Elemwise} opr16
var17 produced: name=var17 layout={2(3),3(1) Float32} owner_opr=opr16{Elemwise} opr16
deps: deps:
[i0]var1: [2.352, 0.1114, -0.2721, 0.7569, -0.2438, ...] s [i0]var1: [2.352, 0.1114, -0.2721, 0.7569, -0.2438, ...] s
val: [2.352, 0.1114, 0, 0.7569, 0, ...]min=0 max=2.35 mean=0.557 l2=1.01 sd=0.924 s val: [2.352, 0.1114, 0, 0.7569, 0, ...]min=0 max=2.35 mean=0.557 l2=1.01 sd=0.924 s
var11 produced: name=var11 layout={1(3),3(1)} owner_opr=opr10{Subtensor} opr10
var11 produced: name=var11 layout={1(3),3(1) Float32} owner_opr=opr10{Subtensor} opr10
deps: deps:
[i0]var1: [2.352, 0.1114, -0.2721, 0.7569, -0.2438, ...] s [i0]var1: [2.352, 0.1114, -0.2721, 0.7569, -0.2438, ...] s
[i1]var5: <host value[s]> [0] s [i1]var5: <host value[s]> [0] s
[i2]var7: <host value[s]> [1] s [i2]var7: <host value[s]> [1] s
val: [2.352, 0.1114, -0.2721]min=-0.272 max=2.35 mean=0.731 l2=1.37 sd=1.42 s val: [2.352, 0.1114, -0.2721]min=-0.272 max=2.35 mean=0.731 l2=1.37 sd=1.42 s
var13 produced: name=var13 layout={2(0),3(1)} owner_opr=opr12{Broadcast} opr12
var13 produced: name=var13 layout={2(0),3(1) Float32} owner_opr=opr12{Broadcast} opr12
deps: deps:
[i0]var11: [2.352, 0.1114, -0.2721] s [i0]var11: [2.352, 0.1114, -0.2721] s
[i1]var9: <host value[s]> [2, 3] s [i1]var9: <host value[s]> [2, 3] s
val: [2.352, 0.1114, -0.2721, 2.352, 0.1114, ...]min=-0.272 max=2.35 mean=0.731 l2=1.37 sd=1.27 s val: [2.352, 0.1114, -0.2721, 2.352, 0.1114, ...]min=-0.272 max=2.35 mean=0.731 l2=1.37 sd=1.27 s
var15 produced: name=var15 layout={2(3),3(1)} owner_opr=opr14{Elemwise} opr14
var15 produced: name=var15 layout={2(3),3(1) Float32} owner_opr=opr14{Elemwise} opr14
deps: deps:
[i0]var3: [2] s [i0]var3: [2] s
[i1]var13: [2.352, 0.1114, -0.2721, 2.352, 0.1114, ...] s [i1]var13: [2.352, 0.1114, -0.2721, 2.352, 0.1114, ...] s
val: [4.352, 2.111, 1.728, 4.352, 2.111, ...]min=1.73 max=4.35 mean=2.73 l2=2.97 sd=1.27 s val: [4.352, 2.111, 1.728, 4.352, 2.111, ...]min=1.73 max=4.35 mean=2.73 l2=2.97 sd=1.27 s
var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18
var19 produced: name=var19 layout={2(3),3(1) Float32} owner_opr=opr18{Elemwise} opr18
deps: deps:
[i0]var15: [10.24, 0.2352, 0, 3.294, 0, ...] s [i0]var15: [10.24, 0.2352, 0, 3.294, 0, ...] s
[i1]var17: [2.352, 0.1114, 0, 0.7569, 0, ...] s [i1]var17: [2.352, 0.1114, 0, 0.7569, 0, ...] s
val: [10.24, 0.2352, 0, 3.294, 0, ...]min=0 max=10.2 mean=2.33 l2=4.39 sd=4.08 s val: [10.24, 0.2352, 0, 3.294, 0, ...]min=0 max=10.2 mean=2.33 l2=4.39 sd=4.08 s
var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2
var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2
deps: deps:
val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s
var1 produced: name=var1 layout={2(3),3(1)} owner_opr=opr0{Host2DeviceCopy} opr0
var1 produced: name=var1 layout={2(3),3(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0
deps: deps:
val: [0.05521, 0.724, 1.134, -0.2697, -1.545, ...]min=-1.54 max=1.13 mean=-0.105 l2=0.895 sd=0.974 s val: [0.05521, 0.724, 1.134, -0.2697, -1.545, ...]min=-1.54 max=1.13 mean=-0.105 l2=0.895 sd=0.974 s
var17 produced: name=var17 layout={2(3),3(1)} owner_opr=opr16{Elemwise} opr16
var17 produced: name=var17 layout={2(3),3(1) Float32} owner_opr=opr16{Elemwise} opr16
deps: deps:
[i0]var1: [0.05521, 0.724, 1.134, -0.2697, -1.545, ...] s [i0]var1: [0.05521, 0.724, 1.134, -0.2697, -1.545, ...] s
val: [0.05521, 0.724, 1.134, 0, 0, ...]min=0 max=1.13 mean=0.319 l2=0.55 sd=0.491 s val: [0.05521, 0.724, 1.134, 0, 0, ...]min=0 max=1.13 mean=0.319 l2=0.55 sd=0.491 s
var11 produced: name=var11 layout={1(3),3(1)} owner_opr=opr10{Subtensor} opr10
var11 produced: name=var11 layout={1(3),3(1) Float32} owner_opr=opr10{Subtensor} opr10
deps: deps:
[i0]var1: [0.05521, 0.724, 1.134, -0.2697, -1.545, ...] s [i0]var1: [0.05521, 0.724, 1.134, -0.2697, -1.545, ...] s
[i1]var5: <host value[s]> [0] s [i1]var5: <host value[s]> [0] s
[i2]var7: <host value[s]> [1] s [i2]var7: <host value[s]> [1] s
val: [0.05521, 0.724, 1.134]min=0.0552 max=1.13 mean=0.638 l2=0.778 sd=0.545 s val: [0.05521, 0.724, 1.134]min=0.0552 max=1.13 mean=0.638 l2=0.778 sd=0.545 s
var13 produced: name=var13 layout={2(0),3(1)} owner_opr=opr12{Broadcast} opr12
var13 produced: name=var13 layout={2(0),3(1) Float32} owner_opr=opr12{Broadcast} opr12
deps: deps:
[i0]var11: [0.05521, 0.724, 1.134] s [i0]var11: [0.05521, 0.724, 1.134] s
[i1]var9: <host value[s]> [2, 3] s [i1]var9: <host value[s]> [2, 3] s
val: [0.05521, 0.724, 1.134, 0.05521, 0.724, ...]min=0.0552 max=1.13 mean=0.638 l2=0.778 sd=0.487 s val: [0.05521, 0.724, 1.134, 0.05521, 0.724, ...]min=0.0552 max=1.13 mean=0.638 l2=0.778 sd=0.487 s
var15 produced: name=var15 layout={2(3),3(1)} owner_opr=opr14{Elemwise} opr14
var15 produced: name=var15 layout={2(3),3(1) Float32} owner_opr=opr14{Elemwise} opr14
deps: deps:
[i0]var3: [2] s [i0]var3: [2] s
[i1]var13: [0.05521, 0.724, 1.134, 0.05521, 0.724, ...] s [i1]var13: [0.05521, 0.724, 1.134, 0.05521, 0.724, ...] s
val: [2.055, 2.724, 3.134, 2.055, 2.724, ...]min=2.06 max=3.13 mean=2.64 l2=2.68 sd=0.487 s val: [2.055, 2.724, 3.134, 2.055, 2.724, ...]min=2.06 max=3.13 mean=2.64 l2=2.68 sd=0.487 s
var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18
var19 produced: name=var19 layout={2(3),3(1) Float32} owner_opr=opr18{Elemwise} opr18
deps: deps:
[i0]var15: [0.1135, 1.972, 3.556, 0, 0, ...] s [i0]var15: [0.1135, 1.972, 3.556, 0, 0, ...] s
[i1]var17: [0.05521, 0.724, 1.134, 0, 0, ...] s [i1]var17: [0.05521, 0.724, 1.134, 0, 0, ...] s
val: [0.1135, 1.972, 3.556, 0, 0, ...]min=0 max=3.56 mean=0.94 l2=1.66 sd=1.5 s val: [0.1135, 1.972, 3.556, 0, 0, ...]min=0 max=3.56 mean=0.94 l2=1.66 sd=1.5 s
var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2
var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2
deps: deps:
val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s
var1 produced: name=var1 layout={2(3),3(1)} owner_opr=opr0{Host2DeviceCopy} opr0
var1 produced: name=var1 layout={2(3),3(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0
deps: deps:
val: [-0.5069, 0.4525, 0.1695, -0.02793, -0.1907, ...]min=-0.507 max=1.32 mean=0.203 l2=0.616 sd=0.637 s val: [-0.5069, 0.4525, 0.1695, -0.02793, -0.1907, ...]min=-0.507 max=1.32 mean=0.203 l2=0.616 sd=0.637 s
var17 produced: name=var17 layout={2(3),3(1)} owner_opr=opr16{Elemwise} opr16
var17 produced: name=var17 layout={2(3),3(1) Float32} owner_opr=opr16{Elemwise} opr16
deps: deps:
[i0]var1: [-0.5069, 0.4525, 0.1695, -0.02793, -0.1907, ...] s [i0]var1: [-0.5069, 0.4525, 0.1695, -0.02793, -0.1907, ...] s
val: [0, 0.4525, 0.1695, 0, 0, ...]min=0 max=1.32 mean=0.324 l2=0.574 sd=0.52 s val: [0, 0.4525, 0.1695, 0, 0, ...]min=0 max=1.32 mean=0.324 l2=0.574 sd=0.52 s
var11 produced: name=var11 layout={1(3),3(1)} owner_opr=opr10{Subtensor} opr10
var11 produced: name=var11 layout={1(3),3(1) Float32} owner_opr=opr10{Subtensor} opr10
deps: deps:
[i0]var1: [-0.5069, 0.4525, 0.1695, -0.02793, -0.1907, ...] s [i0]var1: [-0.5069, 0.4525, 0.1695, -0.02793, -0.1907, ...] s
[i1]var5: <host value[s]> [0] s [i1]var5: <host value[s]> [0] s
[i2]var7: <host value[s]> [1] s [i2]var7: <host value[s]> [1] s
val: [-0.5069, 0.4525, 0.1695]min=-0.507 max=0.453 mean=0.0384 l2=0.404 sd=0.493 s val: [-0.5069, 0.4525, 0.1695]min=-0.507 max=0.453 mean=0.0384 l2=0.404 sd=0.493 s
var13 produced: name=var13 layout={2(0),3(1)} owner_opr=opr12{Broadcast} opr12
var13 produced: name=var13 layout={2(0),3(1) Float32} owner_opr=opr12{Broadcast} opr12
deps: deps:
[i0]var11: [-0.5069, 0.4525, 0.1695] s [i0]var11: [-0.5069, 0.4525, 0.1695] s
[i1]var9: <host value[s]> [2, 3] s [i1]var9: <host value[s]> [2, 3] s
val: [-0.5069, 0.4525, 0.1695, -0.5069, 0.4525, ...]min=-0.507 max=0.453 mean=0.0384 l2=0.404 sd=0.441 s val: [-0.5069, 0.4525, 0.1695, -0.5069, 0.4525, ...]min=-0.507 max=0.453 mean=0.0384 l2=0.404 sd=0.441 s
var15 produced: name=var15 layout={2(3),3(1)} owner_opr=opr14{Elemwise} opr14
var15 produced: name=var15 layout={2(3),3(1) Float32} owner_opr=opr14{Elemwise} opr14
deps: deps:
[i0]var3: [2] s [i0]var3: [2] s
[i1]var13: [-0.5069, 0.4525, 0.1695, -0.5069, 0.4525, ...] s [i1]var13: [-0.5069, 0.4525, 0.1695, -0.5069, 0.4525, ...] s
val: [1.493, 2.453, 2.17, 1.493, 2.453, ...]min=1.49 max=2.45 mean=2.04 l2=2.08 sd=0.441 s val: [1.493, 2.453, 2.17, 1.493, 2.453, ...]min=1.49 max=2.45 mean=2.04 l2=2.08 sd=0.441 s
var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18
var19 produced: name=var19 layout={2(3),3(1) Float32} owner_opr=opr18{Elemwise} opr18
deps: deps:
[i0]var15: [0, 1.11, 0.3678, 0, 0, ...] s [i0]var15: [0, 1.11, 0.3678, 0, 0, ...] s
[i1]var17: [0, 0.4525, 0.1695, 0, 0, ...] s [i1]var17: [0, 0.4525, 0.1695, 0, 0, ...] s
val: [0, 1.11, 0.3678, 0, 0, ...]min=0 max=2.87 mean=0.724 l2=1.26 sd=1.13 s val: [0, 1.11, 0.3678, 0, 0, ...]min=0 max=2.87 mean=0.724 l2=1.26 sd=1.13 s
var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2
var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2
deps: deps:
val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s
var1 produced: name=var1 layout={2(3),3(1)} owner_opr=opr0{Host2DeviceCopy} opr0
var1 produced: name=var1 layout={2(3),3(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0
deps: deps:
val: [-0.03637, 2.111, 0.3236, -0.4861, -2.071, ...]min=-2.07 max=2.11 mean=0.0589 l2=1.25 sd=1.37 s val: [-0.03637, 2.111, 0.3236, -0.4861, -2.071, ...]min=-2.07 max=2.11 mean=0.0589 l2=1.25 sd=1.37 s
var17 produced: name=var17 layout={2(3),3(1)} owner_opr=opr16{Elemwise} opr16
var17 produced: name=var17 layout={2(3),3(1) Float32} owner_opr=opr16{Elemwise} opr16
deps: deps:
[i0]var1: [-0.03637, 2.111, 0.3236, -0.4861, -2.071, ...] s [i0]var1: [-0.03637, 2.111, 0.3236, -0.4861, -2.071, ...] s
val: [0, 2.111, 0.3236, 0, 0, ...]min=0 max=2.11 mean=0.491 l2=0.897 sd=0.822 s val: [0, 2.111, 0.3236, 0, 0, ...]min=0 max=2.11 mean=0.491 l2=0.897 sd=0.822 s
var11 produced: name=var11 layout={1(3),3(1)} owner_opr=opr10{Subtensor} opr10
var11 produced: name=var11 layout={1(3),3(1) Float32} owner_opr=opr10{Subtensor} opr10
deps: deps:
[i0]var1: [-0.03637, 2.111, 0.3236, -0.4861, -2.071, ...] s [i0]var1: [-0.03637, 2.111, 0.3236, -0.4861, -2.071, ...] s
[i1]var5: <host value[s]> [0] s [i1]var5: <host value[s]> [0] s
[i2]var7: <host value[s]> [1] s [i2]var7: <host value[s]> [1] s
val: [-0.03637, 2.111, 0.3236]min=-0.0364 max=2.11 mean=0.799 l2=1.23 sd=1.15 s val: [-0.03637, 2.111, 0.3236]min=-0.0364 max=2.11 mean=0.799 l2=1.23 sd=1.15 s
var13 produced: name=var13 layout={2(0),3(1)} owner_opr=opr12{Broadcast} opr12
var13 produced: name=var13 layout={2(0),3(1) Float32} owner_opr=opr12{Broadcast} opr12
deps: deps:
[i0]var11: [-0.03637, 2.111, 0.3236] s [i0]var11: [-0.03637, 2.111, 0.3236] s
[i1]var9: <host value[s]> [2, 3] s [i1]var9: <host value[s]> [2, 3] s
val: [-0.03637, 2.111, 0.3236, -0.03637, 2.111, ...]min=-0.0364 max=2.11 mean=0.799 l2=1.23 sd=1.03 s val: [-0.03637, 2.111, 0.3236, -0.03637, 2.111, ...]min=-0.0364 max=2.11 mean=0.799 l2=1.23 sd=1.03 s
var15 produced: name=var15 layout={2(3),3(1)} owner_opr=opr14{Elemwise} opr14
var15 produced: name=var15 layout={2(3),3(1) Float32} owner_opr=opr14{Elemwise} opr14
deps: deps:
[i0]var3: [2] s [i0]var3: [2] s
[i1]var13: [-0.03637, 2.111, 0.3236, -0.03637, 2.111, ...] s [i1]var13: [-0.03637, 2.111, 0.3236, -0.03637, 2.111, ...] s
val: [1.964, 4.111, 2.324, 1.964, 4.111, ...]min=1.96 max=4.11 mean=2.8 l2=2.95 sd=1.03 s val: [1.964, 4.111, 2.324, 1.964, 4.111, ...]min=1.96 max=4.11 mean=2.8 l2=2.95 sd=1.03 s
var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18
var19 produced: name=var19 layout={2(3),3(1) Float32} owner_opr=opr18{Elemwise} opr18
deps: deps:
[i0]var15: [0, 8.675, 0.7518, 0, 0, ...] s [i0]var15: [0, 8.675, 0.7518, 0, 0, ...] s
[i1]var17: [0, 2.111, 0.3236, 0, 0, ...] s [i1]var17: [0, 2.111, 0.3236, 0, 0, ...] s
val: [0, 8.675, 0.7518, 0, 0, ...]min=0 max=8.68 mean=1.77 l2=3.59 sd=3.42 s val: [0, 8.675, 0.7518, 0, 0, ...]min=0 max=8.68 mean=1.77 l2=3.59 sd=3.42 s
var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2
var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2
deps: deps:
val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s
var1 produced: name=var1 layout={5(4),4(1)} owner_opr=opr0{Host2DeviceCopy} opr0
var1 produced: name=var1 layout={5(4),4(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0
deps: deps:
val: [-1.199, -1.02, 1.098, -1.472, -0.3848, ...]min=-2.24 max=1.25 mean=-0.347 l2=1.04 sd=1.01 s val: [-1.199, -1.02, 1.098, -1.472, -0.3848, ...]min=-2.24 max=1.25 mean=-0.347 l2=1.04 sd=1.01 s
var17 produced: name=var17 layout={5(4),4(1)} owner_opr=opr16{Elemwise} opr16
var17 produced: name=var17 layout={5(4),4(1) Float32} owner_opr=opr16{Elemwise} opr16
deps: deps:
[i0]var1: [-1.199, -1.02, 1.098, -1.472, -0.3848, ...] s [i0]var1: [-1.199, -1.02, 1.098, -1.472, -0.3848, ...] s
val: [0, 0, 1.098, 0, 0, ...]min=0 max=1.25 mean=0.262 l2=0.471 sd=0.402 s val: [0, 0, 1.098, 0, 0, ...]min=0 max=1.25 mean=0.262 l2=0.471 sd=0.402 s
var11 produced: name=var11 layout={1(4),4(1)} owner_opr=opr10{Subtensor} opr10
var11 produced: name=var11 layout={1(4),4(1) Float32} owner_opr=opr10{Subtensor} opr10
deps: deps:
[i0]var1: [-1.199, -1.02, 1.098, -1.472, -0.3848, ...] s [i0]var1: [-1.199, -1.02, 1.098, -1.472, -0.3848, ...] s
[i1]var5: <host value[s]> [0] s [i1]var5: <host value[s]> [0] s
[i2]var7: <host value[s]> [1] s [i2]var7: <host value[s]> [1] s
val: [-1.199, -1.02, 1.098, -1.472]min=-1.47 max=1.1 mean=-0.648 l2=1.21 sd=1.18 s val: [-1.199, -1.02, 1.098, -1.472]min=-1.47 max=1.1 mean=-0.648 l2=1.21 sd=1.18 s
var13 produced: name=var13 layout={5(0),4(1)} owner_opr=opr12{Broadcast} opr12
var13 produced: name=var13 layout={5(0),4(1) Float32} owner_opr=opr12{Broadcast} opr12
deps: deps:
[i0]var11: [-1.199, -1.02, 1.098, -1.472] s [i0]var11: [-1.199, -1.02, 1.098, -1.472] s
[i1]var9: <host value[s]> [5, 4] s [i1]var9: <host value[s]> [5, 4] s
val: [-1.199, -1.02, 1.098, -1.472, -1.199, ...]min=-1.47 max=1.1 mean=-0.648 l2=1.21 sd=1.05 s val: [-1.199, -1.02, 1.098, -1.472, -1.199, ...]min=-1.47 max=1.1 mean=-0.648 l2=1.21 sd=1.05 s
var15 produced: name=var15 layout={5(4),4(1)} owner_opr=opr14{Elemwise} opr14
var15 produced: name=var15 layout={5(4),4(1) Float32} owner_opr=opr14{Elemwise} opr14
deps: deps:
[i0]var3: [2] s [i0]var3: [2] s
[i1]var13: [-1.199, -1.02, 1.098, -1.472, -1.199, ...] s [i1]var13: [-1.199, -1.02, 1.098, -1.472, -1.199, ...] s
val: [0.8006, 0.9802, 3.098, 0.5279, 0.8006, ...]min=0.528 max=3.1 mean=1.35 l2=1.69 sd=1.05 s val: [0.8006, 0.9802, 3.098, 0.5279, 0.8006, ...]min=0.528 max=3.1 mean=1.35 l2=1.69 sd=1.05 s
var19 produced: name=var19 layout={5(4),4(1)} owner_opr=opr18{Elemwise} opr18
var19 produced: name=var19 layout={5(4),4(1) Float32} owner_opr=opr18{Elemwise} opr18
deps: deps:
[i0]var15: [0, 0, 3.401, 0, 0, ...] s [i0]var15: [0, 0, 3.401, 0, 0, ...] s
[i1]var17: [0, 0, 1.098, 0, 0, ...] s [i1]var17: [0, 0, 1.098, 0, 0, ...] s
@@ -176,33 +176,33 @@ var19 produced: name=var19 layout={5(4),4(1)} owner_opr=opr18{Elemwise} opr18
// rec level 1 // rec level 1
R"OUTPUT( R"OUTPUT(
==== begin lazy value recording ==== begin lazy value recording
var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2
var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2
deps: deps:
val: <see lazy value below> s val: <see lazy value below> s
var1 produced: name=var1 layout={2(3),3(1)} owner_opr=opr0{Host2DeviceCopy} opr0
var1 produced: name=var1 layout={2(3),3(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0
deps: deps:
val: <see lazy value below> s val: <see lazy value below> s
var17 produced: name=var17 layout={2(3),3(1)} owner_opr=opr16{Elemwise} opr16
var17 produced: name=var17 layout={2(3),3(1) Float32} owner_opr=opr16{Elemwise} opr16
deps: deps:
[i0]var1: <see lazy value below> s [i0]var1: <see lazy value below> s
val: <see lazy value below> s val: <see lazy value below> s
var11 produced: name=var11 layout={1(3),3(1)} owner_opr=opr10{Subtensor} opr10
var11 produced: name=var11 layout={1(3),3(1) Float32} owner_opr=opr10{Subtensor} opr10
deps: deps:
[i0]var1: <see lazy value below> s [i0]var1: <see lazy value below> s
[i1]var5: <host value[s]> [0] s [i1]var5: <host value[s]> [0] s
[i2]var7: <host value[s]> [1] s [i2]var7: <host value[s]> [1] s
val: <see lazy value below> s val: <see lazy value below> s
var13 produced: name=var13 layout={2(0),3(1)} owner_opr=opr12{Broadcast} opr12
var13 produced: name=var13 layout={2(0),3(1) Float32} owner_opr=opr12{Broadcast} opr12
deps: deps:
[i0]var11: <see lazy value below> s [i0]var11: <see lazy value below> s
[i1]var9: <host value[s]> [2, 3] s [i1]var9: <host value[s]> [2, 3] s
val: <see lazy value below> s val: <see lazy value below> s
var15 produced: name=var15 layout={2(3),3(1)} owner_opr=opr14{Elemwise} opr14
var15 produced: name=var15 layout={2(3),3(1) Float32} owner_opr=opr14{Elemwise} opr14
deps: deps:
[i0]var3: <see lazy value below> s [i0]var3: <see lazy value below> s
[i1]var13: <see lazy value below> s [i1]var13: <see lazy value below> s
val: <see lazy value below> s val: <see lazy value below> s
var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18
var19 produced: name=var19 layout={2(3),3(1) Float32} owner_opr=opr18{Elemwise} opr18
deps: deps:
[i0]var15: <see lazy value below> s [i0]var15: <see lazy value below> s
[i1]var17: <see lazy value below> s [i1]var17: <see lazy value below> s
@@ -242,33 +242,33 @@ var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18
// rec level 2 // rec level 2
R"OUTPUT( R"OUTPUT(
==== begin lazy value recording ==== begin lazy value recording
var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2
var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2
deps: deps:
val: <see lazy value below> s val: <see lazy value below> s
var1 produced: name=var1 layout={2(3),3(1)} owner_opr=opr0{Host2DeviceCopy} opr0
var1 produced: name=var1 layout={2(3),3(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0
deps: deps:
val: <see lazy value below> s val: <see lazy value below> s
var17 produced: name=var17 layout={2(3),3(1)} owner_opr=opr16{Elemwise} opr16
var17 produced: name=var17 layout={2(3),3(1) Float32} owner_opr=opr16{Elemwise} opr16
deps: deps:
[i0]var1: <see lazy value below> s [i0]var1: <see lazy value below> s
val: <see lazy value below> s val: <see lazy value below> s
var11 produced: name=var11 layout={1(3),3(1)} owner_opr=opr10{Subtensor} opr10
var11 produced: name=var11 layout={1(3),3(1) Float32} owner_opr=opr10{Subtensor} opr10
deps: deps:
[i0]var1: <see lazy value below> s [i0]var1: <see lazy value below> s
[i1]var5: <host value[s]> [0] s [i1]var5: <host value[s]> [0] s
[i2]var7: <host value[s]> [1] s [i2]var7: <host value[s]> [1] s
val: <see lazy value below> s val: <see lazy value below> s
var13 produced: name=var13 layout={2(0),3(1)} owner_opr=opr12{Broadcast} opr12
var13 produced: name=var13 layout={2(0),3(1) Float32} owner_opr=opr12{Broadcast} opr12
deps: deps:
[i0]var11: <see lazy value below> s [i0]var11: <see lazy value below> s
[i1]var9: <host value[s]> [2, 3] s [i1]var9: <host value[s]> [2, 3] s
val: <see lazy value below> s val: <see lazy value below> s
var15 produced: name=var15 layout={2(3),3(1)} owner_opr=opr14{Elemwise} opr14
var15 produced: name=var15 layout={2(3),3(1) Float32} owner_opr=opr14{Elemwise} opr14
deps: deps:
[i0]var3: <see lazy value below> s [i0]var3: <see lazy value below> s
[i1]var13: <see lazy value below> s [i1]var13: <see lazy value below> s
val: <see lazy value below> s val: <see lazy value below> s
var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18
var19 produced: name=var19 layout={2(3),3(1) Float32} owner_opr=opr18{Elemwise} opr18
deps: deps:
[i0]var15: <see lazy value below> s [i0]var15: <see lazy value below> s
[i1]var17: <see lazy value below> s [i1]var17: <see lazy value below> s


Loading…
Cancel
Save