GitOrigin-RevId: 94e333ec80
release-1.5
@@ -91,6 +91,11 @@ class MaxTensorDiff : public OperatorBase { | |||||
void check_exec(const TensorLayout& layout1, | void check_exec(const TensorLayout& layout1, | ||||
const TensorLayout& layout2, size_t workspace_in_bytes); | const TensorLayout& layout2, size_t workspace_in_bytes); | ||||
}; | }; | ||||
bool check_bias_share_in_channel(const TensorLayout& bias, | |||||
const param::ConvBias::Format format); | |||||
} // namespace megdnn | } // namespace megdnn | ||||
#include "megdnn/internal/opr_header_epilogue.h" | #include "megdnn/internal/opr_header_epilogue.h" | ||||
@@ -318,36 +318,6 @@ void handle_bias_and_nonlinear(Handle* handle, param::ConvBias args, | |||||
megdnn_assert(false); | megdnn_assert(false); | ||||
} | } | ||||
} | } | ||||
bool check_bias_share_in_channel(const TensorLayout& bias, | |||||
const param::ConvBias::Format format) { | |||||
bool share_in_channel = false; | |||||
if (format == param::ConvBias::Format::NCHW || | |||||
format == param::ConvBias::Format::NCHW4_NCHW) { | |||||
share_in_channel = (bias.ndim == 4 && bias[0] == 1 && bias[2] == 1 && | |||||
bias[3] == 1); | |||||
} else if (format == param::ConvBias::Format::NHWC) { | |||||
share_in_channel = (bias.ndim == 4 && bias[0] == 1 && bias[1] == 1 && | |||||
bias[2] == 1); | |||||
} else if (format == param::ConvBias::Format::NCHW4 || | |||||
format == param::ConvBias::Format::NCHW8 || | |||||
format == param::ConvBias::Format::NCHW32 || | |||||
format == param::ConvBias::Format::NCHW64 || | |||||
format == param::ConvBias::Format::NCHW4_NCHW32 || | |||||
format == param::ConvBias::Format::NCHW32_NCHW4) { | |||||
share_in_channel = (bias.ndim == 5 && bias[0] == 1 && bias[2] == 1 && | |||||
bias[3] == 1); | |||||
} else if (format == param::ConvBias::Format::NHWCD4) { | |||||
share_in_channel = (bias.ndim == 5 && bias[0] == 1 && bias[1] == 1 && | |||||
bias[3] == 1); | |||||
} else { | |||||
megdnn_assert(format == param::ConvBias::Format::CHWN4); | |||||
share_in_channel = (bias.ndim == 5 && bias[1] == 1 && bias[2] == 1 && | |||||
bias[3] == 1); | |||||
} | |||||
return share_in_channel; | |||||
} | |||||
} // namespace megdnn | } // namespace megdnn | ||||
// vim: syntax=cpp.doxygen | // vim: syntax=cpp.doxygen |
@@ -22,8 +22,6 @@ void handle_bias_and_nonlinear(Handle* handle, param::ConvBias args, | |||||
const TensorND* dst_tensor, | const TensorND* dst_tensor, | ||||
const TensorND* bias_tensor); | const TensorND* bias_tensor); | ||||
bool check_bias_share_in_channel(const TensorLayout& bias, | |||||
const param::ConvBias::Format format); | |||||
} // namespace megdnn | } // namespace megdnn | ||||
// vim: syntax=cpp.doxygen | // vim: syntax=cpp.doxygen |
@@ -10,6 +10,7 @@ | |||||
*/ | */ | ||||
#include "src/common/utils.h" | #include "src/common/utils.h" | ||||
#include "megdnn/oprs/utils.h" | |||||
#include "megdnn/handle.h" | #include "megdnn/handle.h" | ||||
#include <cstdarg> | #include <cstdarg> | ||||
@@ -344,4 +345,33 @@ size_t& CpuNDRange::operator[](size_t idx) { | |||||
return m_dim[idx]; | return m_dim[idx]; | ||||
} | } | ||||
bool megdnn::check_bias_share_in_channel(const TensorLayout& bias, | |||||
const param::ConvBias::Format format) { | |||||
bool share_in_channel = false; | |||||
if (format == param::ConvBias::Format::NCHW || | |||||
format == param::ConvBias::Format::NCHW4_NCHW) { | |||||
share_in_channel = (bias.ndim == 4 && bias[0] == 1 && bias[2] == 1 && | |||||
bias[3] == 1); | |||||
} else if (format == param::ConvBias::Format::NHWC) { | |||||
share_in_channel = (bias.ndim == 4 && bias[0] == 1 && bias[1] == 1 && | |||||
bias[2] == 1); | |||||
} else if (format == param::ConvBias::Format::NCHW4 || | |||||
format == param::ConvBias::Format::NCHW8 || | |||||
format == param::ConvBias::Format::NCHW32 || | |||||
format == param::ConvBias::Format::NCHW64 || | |||||
format == param::ConvBias::Format::NCHW4_NCHW32 || | |||||
format == param::ConvBias::Format::NCHW32_NCHW4) { | |||||
share_in_channel = (bias.ndim == 5 && bias[0] == 1 && bias[2] == 1 && | |||||
bias[3] == 1); | |||||
} else if (format == param::ConvBias::Format::NHWCD4) { | |||||
share_in_channel = (bias.ndim == 5 && bias[0] == 1 && bias[1] == 1 && | |||||
bias[3] == 1); | |||||
} else { | |||||
megdnn_assert(format == param::ConvBias::Format::CHWN4); | |||||
share_in_channel = (bias.ndim == 5 && bias[1] == 1 && bias[2] == 1 && | |||||
bias[3] == 1); | |||||
} | |||||
return share_in_channel; | |||||
} | |||||
// vim: syntax=cpp.doxygen | // vim: syntax=cpp.doxygen |
@@ -158,6 +158,11 @@ R"__usage__( | |||||
R"__usage__( | R"__usage__( | ||||
--fast-run-algo-policy <path> | --fast-run-algo-policy <path> | ||||
It will read the cache file before profile, and save new fastrun in cache file. | It will read the cache file before profile, and save new fastrun in cache file. | ||||
--fast-run-shared-batch-size | |||||
Set the batch size used during fastrun, Note that it may not be the same as the actual running batch size | |||||
--binary-equal-between-batch | |||||
Each batch of output is promised binary equal if each batch of input is binary equal. | |||||
Note that if this option is turned on, `--reproducible` will also be turned on. | |||||
--reproducible | --reproducible | ||||
Enable choose algo which is reproducible. It mainly used for cudnn algos. | Enable choose algo which is reproducible. It mainly used for cudnn algos. | ||||
See https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#reproducibility | See https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#reproducibility | ||||
@@ -1356,6 +1361,20 @@ Args Args::from_argv(int argc, char **argv) { | |||||
ret.fast_run_cache_path = argv[i]; | ret.fast_run_cache_path = argv[i]; | ||||
continue; | continue; | ||||
} | } | ||||
if (!strcmp(argv[i], "--fast-run-shared-batch-size")) { | |||||
++i; | |||||
mgb_assert(i < argc, | |||||
"value not given for --fast-run-shared-batch-size"); | |||||
int32_t batch_size = std::stoi(argv[i]); | |||||
mgb_assert(batch_size >= 0); | |||||
graph_opt.fast_run_config.shared_batch_size = batch_size; | |||||
continue; | |||||
} | |||||
if (!strcmp(argv[i], "--binary-equal-between-batch")) { | |||||
graph_opt.fast_run_config.binary_equal_between_batch = true; | |||||
ret.reproducible = true; | |||||
continue; | |||||
} | |||||
if (!strcmp(argv[i], "--reproducible")) { | if (!strcmp(argv[i], "--reproducible")) { | ||||
ret.reproducible = true; | ret.reproducible = true; | ||||
continue; | continue; | ||||
@@ -1452,6 +1471,14 @@ Args Args::from_argv(int argc, char **argv) { | |||||
return ret; | return ret; | ||||
} | } | ||||
#if MGB_ENABLE_FASTRUN | |||||
if (graph_opt.fast_run_config.shared_batch_size) { | |||||
mgb_assert(ret.use_fast_run || ret.use_full_run || | |||||
!ret.fast_run_cache_path.empty(), | |||||
"--fast-run-shared-batch-size should be used with " | |||||
"--fast-run/--full-run/--fast-run-algo-policy"); | |||||
} | |||||
#endif | |||||
return ret; | return ret; | ||||
} | } | ||||
@@ -502,7 +502,28 @@ class ComputingGraph : public std::enable_shared_from_this<ComputingGraph>, | |||||
//! contains any user data associated with this graph | //! contains any user data associated with this graph | ||||
UserDataContainer user_data; | UserDataContainer user_data; | ||||
}; // Options | |||||
//! Control parameter for fast run | |||||
struct FastRunConfig { | |||||
/*! | |||||
* the batch size used by fastrun | |||||
* | |||||
* Non-zero value means that fastrun use this batch size | |||||
* regardless of the batch size of the model | |||||
* | |||||
* Zero means fastrun use batch size of the model | |||||
*/ | |||||
uint32_t shared_batch_size = 0; | |||||
/*! | |||||
* \brief if the content of each input batch is binary equal, | |||||
* whether the content of each output batch is promised to be | |||||
* equal | |||||
*/ | |||||
bool binary_equal_between_batch = false; | |||||
} fast_run_config; | |||||
}; // Options | |||||
Options& options() { | Options& options() { | ||||
return m_options; | return m_options; | ||||
@@ -60,21 +60,31 @@ std::string profile_name(Opr* opr) { | |||||
template <typename Opr> | template <typename Opr> | ||||
std::string format_fixlayouts( | std::string format_fixlayouts( | ||||
const typename opr::AlgoChooser<Opr>::FixedTensorLayouts& layouts, | const typename opr::AlgoChooser<Opr>::FixedTensorLayouts& layouts, | ||||
size_t arity_in, size_t arity_out) { | |||||
size_t arity_in, size_t arity_out, | |||||
const std::string& delimiter = " -> ") { | |||||
std::string ret; | std::string ret; | ||||
ret.append(": tensor layouts("); | |||||
for (size_t i = 0; i < arity_in; ++i) { | |||||
if (i) { | |||||
ret.append(", "); | |||||
if (arity_in) { | |||||
ret.append("("); | |||||
for (size_t i = 0; i < arity_in; ++i) { | |||||
if (i) { | |||||
ret.append(", "); | |||||
} | |||||
ret.append(layouts[i].to_string() + " "); | |||||
} | } | ||||
ret.append(layouts[i].to_string() + " "); | |||||
ret.append(")"); | |||||
} | |||||
if (arity_in && arity_out) { | |||||
ret.append(delimiter); | |||||
} | } | ||||
ret.append(") -> ("); | |||||
for (size_t i = 0; i < arity_out; ++i) { | |||||
if (i) { | |||||
ret.append(", "); | |||||
if (arity_out) { | |||||
ret.append("("); | |||||
for (size_t i = 0; i < arity_out; ++i) { | |||||
if (i) { | |||||
ret.append(", "); | |||||
} | |||||
ret.append(layouts[i + arity_in].to_string() + " "); | |||||
} | } | ||||
ret.append(layouts[i + arity_in].to_string() + " "); | |||||
ret.append(")"); | |||||
} | } | ||||
return ret; | return ret; | ||||
} | } | ||||
@@ -247,7 +257,7 @@ std::vector<megdnn::Algorithm::SearchItem> flatten_search_space( | |||||
CircularDepsChecker& checker) { | CircularDepsChecker& checker) { | ||||
auto&& search_item = megdnn::Algorithm::SearchItem{ | auto&& search_item = megdnn::Algorithm::SearchItem{ | ||||
OprTypeFromOprTrait<Opr>::opr_type, helper.param(), | OprTypeFromOprTrait<Opr>::opr_type, helper.param(), | ||||
to_layout_array<Opr>(helper.layouts())}; | |||||
to_layout_array<Opr>(helper.fastrun_layouts())}; | |||||
checker.put(search_item); | checker.put(search_item); | ||||
std::vector<megdnn::Algorithm::SearchItem> ret; | std::vector<megdnn::Algorithm::SearchItem> ret; | ||||
for (auto algo_info : helper.get_all_candidates()) { | for (auto algo_info : helper.get_all_candidates()) { | ||||
@@ -255,8 +265,9 @@ std::vector<megdnn::Algorithm::SearchItem> flatten_search_space( | |||||
helper.get_algorithm_from_desc(algo_info.desc); | helper.get_algorithm_from_desc(algo_info.desc); | ||||
mgb_assert(algo, "Unknown algo description"); | mgb_assert(algo, "Unknown algo description"); | ||||
std::vector<megdnn::Algorithm::SearchItem>&& sub_items = | std::vector<megdnn::Algorithm::SearchItem>&& sub_items = | ||||
algo->get_subopr_list(to_layout_array<Opr>(helper.layouts()), | |||||
helper.megdnn_opr()); | |||||
algo->get_subopr_list( | |||||
to_layout_array<Opr>(helper.fastrun_layouts()), | |||||
helper.megdnn_opr()); | |||||
FOREACH_OPR_TYPE_DISPATCH(sub_items, { | FOREACH_OPR_TYPE_DISPATCH(sub_items, { | ||||
auto&& megdnn_opr = | auto&& megdnn_opr = | ||||
@@ -323,6 +334,166 @@ static Algorithm::Info::Desc deserialize_read_pod(const std::string& data, | |||||
namespace mgb { | namespace mgb { | ||||
namespace opr { | namespace opr { | ||||
template <class Opr> | |||||
class LayoutsModifier { | |||||
using FixedTensorLayouts = typename AlgoChooser<Opr>::FixedTensorLayouts; | |||||
public: | |||||
static void on(FixedTensorLayouts&, const typename Opr::Param&, size_t) {} | |||||
private: | |||||
//! index of batch in tensor, 3 for CHWN4 e.g. | |||||
static size_t index_of_batch(const typename Opr::Param&) { return 0; } | |||||
//! indices contain batch in inputs and outputs, src(0) dst(2) for conv e.g. | |||||
static std::vector<size_t> sm_indices_contain_batch; | |||||
}; | |||||
template <class Opr> | |||||
std::vector<size_t> LayoutsModifier<Opr>::sm_indices_contain_batch = {}; | |||||
#define DEFAULT_OPR_WITHOUT_INPUT_BROADCAST(opr, idxs) \ | |||||
template <> \ | |||||
class LayoutsModifier<opr> { \ | |||||
public: \ | |||||
using FixedTensorLayouts = \ | |||||
typename AlgoChooser<opr>::FixedTensorLayouts; \ | |||||
static void on(FixedTensorLayouts& layouts, const opr::Param& param, \ | |||||
size_t new_batch_size) { \ | |||||
size_t batch_index = index_of_batch(param); \ | |||||
for (size_t index : sm_indices_contain_batch) { \ | |||||
layouts.at(index)[batch_index] = new_batch_size; \ | |||||
} \ | |||||
} \ | |||||
\ | |||||
private: \ | |||||
static size_t index_of_batch(const opr::Param&) { return 0; } \ | |||||
static std::vector<size_t> sm_indices_contain_batch; \ | |||||
}; \ | |||||
std::vector<size_t> LayoutsModifier<opr>::sm_indices_contain_batch = idxs; | |||||
DEFAULT_OPR_WITHOUT_INPUT_BROADCAST(megdnn::Convolution3DForward, | |||||
(std::initializer_list<size_t>{0, 2})) | |||||
DEFAULT_OPR_WITHOUT_INPUT_BROADCAST(megdnn::Convolution3DBackwardData, | |||||
(std::initializer_list<size_t>{1, 2})) | |||||
DEFAULT_OPR_WITHOUT_INPUT_BROADCAST(megdnn::Convolution3DBackwardFilter, | |||||
(std::initializer_list<size_t>{0, 1})) | |||||
DEFAULT_OPR_WITHOUT_INPUT_BROADCAST(megdnn::BatchedMatrixMul, | |||||
(std::initializer_list<size_t>{0, 1, 2})) | |||||
#undef DEFAULT_OPR_WITHOUT_INPUT_BROADCAST | |||||
#define CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(opr, idxs) \ | |||||
template <> \ | |||||
class LayoutsModifier<opr> { \ | |||||
public: \ | |||||
using FixedTensorLayouts = \ | |||||
typename AlgoChooser<opr>::FixedTensorLayouts; \ | |||||
static void on(FixedTensorLayouts& layouts, const opr::Param& param, \ | |||||
size_t new_batch_size) { \ | |||||
size_t batch_index = index_of_batch(param); \ | |||||
for (size_t index : sm_indices_contain_batch) { \ | |||||
layouts.at(index)[batch_index] = new_batch_size; \ | |||||
} \ | |||||
} \ | |||||
\ | |||||
private: \ | |||||
static size_t index_of_batch(const opr::Param& param) { \ | |||||
if (param.format == opr::Param::Format::CHWN4) { \ | |||||
return 3; \ | |||||
} \ | |||||
return 0; \ | |||||
} \ | |||||
static std::vector<size_t> sm_indices_contain_batch; \ | |||||
}; \ | |||||
std::vector<size_t> LayoutsModifier<opr>::sm_indices_contain_batch = idxs; | |||||
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(megdnn::ConvolutionForward, | |||||
(std::initializer_list<size_t>{0, 2})) | |||||
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(megdnn::ConvolutionBackwardData, | |||||
(std::initializer_list<size_t>{1, 2})) | |||||
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(megdnn::ConvolutionBackwardFilter, | |||||
(std::initializer_list<size_t>{0, 1})) | |||||
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(megdnn::LocalShareForward, | |||||
(std::initializer_list<size_t>{0, 2})) | |||||
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(megdnn::LocalShareBackwardData, | |||||
(std::initializer_list<size_t>{1, 2})) | |||||
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(megdnn::LocalShareBackwardFilter, | |||||
(std::initializer_list<size_t>{0, 1})) | |||||
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(megdnn::DeformableConvForward, | |||||
(std::initializer_list<size_t>{0, 2, 3, | |||||
4})) | |||||
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(megdnn::DeformableConvBackwardData, | |||||
(std::initializer_list<size_t>{0, 2, 3, 4, | |||||
5, 6, 7})) | |||||
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(megdnn::DeformableConvBackwardFilter, | |||||
(std::initializer_list<size_t>{0, 1, 2, | |||||
3})) | |||||
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(megdnn::BatchConvBiasForward, | |||||
(std::initializer_list<size_t>{0, 1, 2, 3, | |||||
4})) | |||||
#undef CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST | |||||
template <> | |||||
class LayoutsModifier<megdnn::ConvBiasForward> { | |||||
public: | |||||
using FixedTensorLayouts = | |||||
typename AlgoChooser<megdnn::ConvBiasForward>::FixedTensorLayouts; | |||||
static void on(FixedTensorLayouts& layouts, | |||||
const megdnn::ConvBiasForward::Param& param, | |||||
size_t new_batch_size) { | |||||
size_t batch_index = index_of_batch(param); | |||||
for (size_t index : sm_indices_contain_batch) { | |||||
layouts.at(index)[batch_index] = new_batch_size; | |||||
} | |||||
for (size_t index : sm_indices_contain_batch_broadcast) { | |||||
if (!check_bias_share_in_channel(layouts.at(index), param.format)) { | |||||
layouts.at(index)[batch_index] = new_batch_size; | |||||
} | |||||
} | |||||
} | |||||
private: | |||||
static std::vector<size_t> sm_indices_contain_batch; | |||||
static std::vector<size_t> sm_indices_contain_batch_broadcast; | |||||
static size_t index_of_batch(const megdnn::ConvBiasForward::Param& param) { | |||||
if (param.format == megdnn::ConvBiasForward::Param::Format::CHWN4) { | |||||
return 3; | |||||
} | |||||
return 0; | |||||
} | |||||
}; | |||||
std::vector<size_t> | |||||
LayoutsModifier<megdnn::ConvBiasForward>::sm_indices_contain_batch = { | |||||
0, 3, 4}; | |||||
std::vector<size_t> LayoutsModifier< | |||||
megdnn::ConvBiasForward>::sm_indices_contain_batch_broadcast = {2}; | |||||
template <> | |||||
class LayoutsModifier<megdnn::MatrixMul> { | |||||
public: | |||||
using FixedTensorLayouts= | |||||
typename AlgoChooser<megdnn::MatrixMul>::FixedTensorLayouts; | |||||
static void on(FixedTensorLayouts& layouts, | |||||
const megdnn::MatrixMul::Param& param, | |||||
size_t new_batch_size) { | |||||
//! Because we do not know whether the batch size is in the dimension m | |||||
//! or the dimension n, we just ignore both m and n here. | |||||
// FIXME Find a way to make mgb obtain batch size information from R or | |||||
// automatically | |||||
layouts.at(2)[0] = new_batch_size; | |||||
layouts.at(2)[1] = new_batch_size; | |||||
if (param.transposeA) { | |||||
layouts.at(0)[1] = new_batch_size; | |||||
} else { | |||||
layouts.at(0)[0] = new_batch_size; | |||||
} | |||||
if (param.transposeB) { | |||||
layouts.at(1)[0] = new_batch_size; | |||||
} else { | |||||
layouts.at(1)[1] = new_batch_size; | |||||
} | |||||
} | |||||
}; | |||||
///////////////////////////// AlgoChooserHelper ////////////////////////// | ///////////////////////////// AlgoChooserHelper ////////////////////////// | ||||
template <typename Opr> | template <typename Opr> | ||||
AlgoChooser<Opr>::AlgoChooserHelper::AlgoChooserHelper( | AlgoChooser<Opr>::AlgoChooserHelper::AlgoChooserHelper( | ||||
@@ -331,14 +502,25 @@ AlgoChooser<Opr>::AlgoChooserHelper::AlgoChooserHelper( | |||||
const CompNode& cn, | const CompNode& cn, | ||||
const megdnn::param::ExecutionPolicy& execution_policy, | const megdnn::param::ExecutionPolicy& execution_policy, | ||||
bool allow_weight_preprocess) | bool allow_weight_preprocess) | ||||
: m_layouts{layouts}, | |||||
: m_fastrun_layouts{layouts}, | |||||
m_incache_layouts{layouts}, | |||||
m_dnn_opr{megdnn_opr}, | m_dnn_opr{megdnn_opr}, | ||||
m_param{param_str}, | m_param{param_str}, | ||||
m_base_mgb_opr{mgb_opr}, | m_base_mgb_opr{mgb_opr}, | ||||
m_cn{cn}, | m_cn{cn}, | ||||
m_execution_policy{execution_policy}, | m_execution_policy{execution_policy}, | ||||
m_allow_weight_preprocess{allow_weight_preprocess} { | m_allow_weight_preprocess{allow_weight_preprocess} { | ||||
mgb_assert(m_layouts.size() == layouts.size()); | |||||
auto fastrun_batch_size = | |||||
owner_graph()->options().fast_run_config.shared_batch_size; | |||||
if (fastrun_batch_size) { | |||||
LayoutsModifier<Opr>::on(m_incache_layouts, m_dnn_opr->param(), 0); | |||||
LayoutsModifier<Opr>::on(m_fastrun_layouts, m_dnn_opr->param(), | |||||
fastrun_batch_size); | |||||
} | |||||
mgb_assert(m_fastrun_layouts.size() == layouts.size()); | |||||
static_assert(std::tuple_size<FixedTensorLayouts>::value == 3 || | static_assert(std::tuple_size<FixedTensorLayouts>::value == 3 || | ||||
std::tuple_size<FixedTensorLayouts>::value == 5 || | std::tuple_size<FixedTensorLayouts>::value == 5 || | ||||
std::tuple_size<FixedTensorLayouts>::value == 8, | std::tuple_size<FixedTensorLayouts>::value == 8, | ||||
@@ -358,13 +540,13 @@ AlgoChooser<Opr>::AlgoChooserHelper::choose_by_heuristic( | |||||
policy.algo = | policy.algo = | ||||
APPLY(m_dnn_opr->get_algorithm_info_heuristic( | APPLY(m_dnn_opr->get_algorithm_info_heuristic( | ||||
args..., workspace_limit, attr.first, attr.second), | args..., workspace_limit, attr.first, attr.second), | ||||
m_layouts) | |||||
m_fastrun_layouts) | |||||
.desc; | .desc; | ||||
Algorithm* algo = m_dnn_opr->get_algorithm_from_desc(policy.algo); | Algorithm* algo = m_dnn_opr->get_algorithm_from_desc(policy.algo); | ||||
mgb_assert(algo, "Unknown algo description"); | mgb_assert(algo, "Unknown algo description"); | ||||
std::vector<Algorithm::SearchItem>&& sub_items = algo->get_subopr_list( | std::vector<Algorithm::SearchItem>&& sub_items = algo->get_subopr_list( | ||||
to_layout_array<Opr>(m_layouts), m_dnn_opr); | |||||
to_layout_array<Opr>(m_fastrun_layouts), m_dnn_opr); | |||||
FOREACH_OPR_TYPE_DISPATCH(sub_items, { | FOREACH_OPR_TYPE_DISPATCH(sub_items, { | ||||
auto&& megdnn_opr = intl::create_megdnn_opr<_Opr>(m_cn); | auto&& megdnn_opr = intl::create_megdnn_opr<_Opr>(m_cn); | ||||
@@ -393,7 +575,7 @@ AlgoChooser<Opr>::AlgoChooserHelper::choose_by_profile( | |||||
if (policy.algo.valid()) { | if (policy.algo.valid()) { | ||||
return policy; | return policy; | ||||
} | } | ||||
if (!algo_usable_on_shape_change<Opr>()) { | |||||
if (is_matmul<Opr>()) { | |||||
mgb_log_warn( | mgb_log_warn( | ||||
"choose algo by heuristic, which may cause performance " | "choose algo by heuristic, which may cause performance " | ||||
"regression."); | "regression."); | ||||
@@ -442,7 +624,8 @@ AlgoChooser<Opr>::AlgoChooserHelper::get_profile_result_from_cache( | |||||
AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str()); | AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str()); | ||||
typename Opr::Param origin_param = m_dnn_opr->param(); | typename Opr::Param origin_param = m_dnn_opr->param(); | ||||
AlgoChooserProfileCache::Key cache_key{m_layouts.data(), m_layouts.size(), | |||||
AlgoChooserProfileCache::Key cache_key{m_incache_layouts.data(), | |||||
m_incache_layouts.size(), | |||||
&origin_param, sizeof(origin_param)}; | &origin_param, sizeof(origin_param)}; | ||||
auto&& rst = cache.get(cache_key); | auto&& rst = cache.get(cache_key); | ||||
if (!rst.valid()) | if (!rst.valid()) | ||||
@@ -472,21 +655,21 @@ AlgoChooser<Opr>::AlgoChooserHelper::get_profile_result_from_cache( | |||||
} | } | ||||
std::string layouts_str = | std::string layouts_str = | ||||
format_fixlayouts<Opr>(m_layouts, arity_in, arity_out); | |||||
format_fixlayouts<Opr>(m_fastrun_layouts, arity_in, arity_out); | |||||
if (skip_by_negative) { | if (skip_by_negative) { | ||||
mgb_log_error( | mgb_log_error( | ||||
"opr: %s, layouts: %s, No usable algo. There are available algos match " | |||||
"opr: %s, layouts: %s, No usable algo. There are available " | |||||
"algos match " | |||||
"positive strategy(%s), but filtered by negative stategy(%s).", | "positive strategy(%s), but filtered by negative stategy(%s).", | ||||
m_base_mgb_opr->dyn_typeinfo()->name, | |||||
layouts_str.c_str(), | |||||
m_base_mgb_opr->dyn_typeinfo()->name, layouts_str.c_str(), | |||||
Algorithm::attribute_str(target_attr.first).c_str(), | Algorithm::attribute_str(target_attr.first).c_str(), | ||||
Algorithm::attribute_str(target_attr.second).c_str()); | Algorithm::attribute_str(target_attr.second).c_str()); | ||||
} else { | } else { | ||||
mgb_log_error( | mgb_log_error( | ||||
"opr: %s, layouts: %s, No usable algo. algos read from cache could not " | |||||
"opr: %s, layouts: %s, No usable algo. algos read from cache " | |||||
"could not " | |||||
"satisfy positive strategy(%s)", | "satisfy positive strategy(%s)", | ||||
m_base_mgb_opr->dyn_typeinfo()->name, | |||||
layouts_str.c_str(), | |||||
m_base_mgb_opr->dyn_typeinfo()->name, layouts_str.c_str(), | |||||
Algorithm::attribute_str(target_attr.first).c_str()); | Algorithm::attribute_str(target_attr.first).c_str()); | ||||
} | } | ||||
@@ -508,7 +691,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::construct_execution_policy( | |||||
auto target_attr = | auto target_attr = | ||||
extract_algo_attribute(selected_strategy); | extract_algo_attribute(selected_strategy); | ||||
std::string layouts_str = format_fixlayouts<Opr>( | std::string layouts_str = format_fixlayouts<Opr>( | ||||
m_layouts, arity_in, arity_out); | |||||
m_fastrun_layouts, arity_in, arity_out); | |||||
std::string msg = ssprintf( | std::string msg = ssprintf( | ||||
"(opr : %s, layouts %s, with attribute(%s) and " | "(opr : %s, layouts %s, with attribute(%s) and " | ||||
"without attribute(%s)", | "without attribute(%s)", | ||||
@@ -535,7 +718,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::construct_execution_policy( | |||||
policy.algo = APPLY(m_dnn_opr->get_algorithm_info_heuristic( | policy.algo = APPLY(m_dnn_opr->get_algorithm_info_heuristic( | ||||
args..., workspace_limit, attr.first, | args..., workspace_limit, attr.first, | ||||
attr.second), | attr.second), | ||||
m_layouts) | |||||
m_fastrun_layouts) | |||||
.desc; | .desc; | ||||
mgb_assert(policy.algo.valid(), | mgb_assert(policy.algo.valid(), | ||||
"No algo found from heuristic with strategy %u and " | "No algo found from heuristic with strategy %u and " | ||||
@@ -548,7 +731,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::construct_execution_policy( | |||||
Algorithm* algo = m_dnn_opr->get_algorithm_from_desc(policy.algo); | Algorithm* algo = m_dnn_opr->get_algorithm_from_desc(policy.algo); | ||||
mgb_assert(algo, "Unknown algo description"); | mgb_assert(algo, "Unknown algo description"); | ||||
std::vector<Algorithm::SearchItem>&& sub_items = algo->get_subopr_list( | std::vector<Algorithm::SearchItem>&& sub_items = algo->get_subopr_list( | ||||
to_layout_array<Opr>(m_layouts), m_dnn_opr); | |||||
to_layout_array<Opr>(m_fastrun_layouts), m_dnn_opr); | |||||
FOREACH_OPR_TYPE_DISPATCH(sub_items, { | FOREACH_OPR_TYPE_DISPATCH(sub_items, { | ||||
auto&& megdnn_opr = intl::create_megdnn_opr<_Opr>(m_cn); | auto&& megdnn_opr = intl::create_megdnn_opr<_Opr>(m_cn); | ||||
@@ -575,26 +758,32 @@ void AlgoChooser<Opr>::AlgoChooserHelper::construct_execution_policy( | |||||
template <typename Opr> | template <typename Opr> | ||||
size_t AlgoChooser<Opr>::AlgoChooserHelper::get_workspace_size_bytes( | size_t AlgoChooser<Opr>::AlgoChooserHelper::get_workspace_size_bytes( | ||||
const ImplExecutionPolicy& policy) const { | |||||
const ImplExecutionPolicy& policy, | |||||
const FixedTensorLayouts& layouts) const { | |||||
MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("get_workspace_size_bytes"))) | MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("get_workspace_size_bytes"))) | ||||
m_dnn_opr->execution_policy() = policy; | m_dnn_opr->execution_policy() = policy; | ||||
size_t result; | size_t result; | ||||
const FixedTensorLayouts* layouts_ptr = &m_fastrun_layouts; | |||||
if (layouts.at(0).ndim) { | |||||
layouts_ptr = &layouts; | |||||
} | |||||
if_constexpr<opr_supports_preprocess<Opr>()>( | if_constexpr<opr_supports_preprocess<Opr>()>( | ||||
[&](auto _) { | [&](auto _) { | ||||
auto&& opr = _(m_dnn_opr); | auto&& opr = _(m_dnn_opr); | ||||
auto prep = this->construct_fake_preprocess_filter(); | |||||
auto prep = | |||||
this->construct_fake_preprocess_filter(*layouts_ptr); | |||||
PreprocessFilter<Opr>* prep_ptr = | PreprocessFilter<Opr>* prep_ptr = | ||||
prep.valid() ? &prep.val() : nullptr; | prep.valid() ? &prep.val() : nullptr; | ||||
result = std::max( | result = std::max( | ||||
APPLY(opr->get_preprocess_workspace_in_bytes(args...), | APPLY(opr->get_preprocess_workspace_in_bytes(args...), | ||||
m_layouts), | |||||
*layouts_ptr), | |||||
APPLY(opr->get_workspace_in_bytes(args..., prep_ptr), | APPLY(opr->get_workspace_in_bytes(args..., prep_ptr), | ||||
m_layouts)); | |||||
*layouts_ptr)); | |||||
}, | }, | ||||
/* else */ | /* else */ | ||||
[&](auto _) { | [&](auto _) { | ||||
result = APPLY(_(m_dnn_opr)->get_workspace_in_bytes(args...), | result = APPLY(_(m_dnn_opr)->get_workspace_in_bytes(args...), | ||||
m_layouts); | |||||
*layouts_ptr); | |||||
}); | }); | ||||
return result; | return result; | ||||
MIDOUT_E | MIDOUT_E | ||||
@@ -605,8 +794,8 @@ std::vector<typename AlgoChooser<Opr>::ImplAlgo> | |||||
AlgoChooser<Opr>::AlgoChooserHelper::get_all_candidates() const { | AlgoChooser<Opr>::AlgoChooserHelper::get_all_candidates() const { | ||||
MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("get_all_candidates"))) | MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("get_all_candidates"))) | ||||
auto heu = choose_by_heuristic(m_execution_policy.strategy); | auto heu = choose_by_heuristic(m_execution_policy.strategy); | ||||
auto&& ret = | |||||
APPLY(m_dnn_opr->get_all_algorithms_info(args...), m_layouts); | |||||
auto&& ret = APPLY(m_dnn_opr->get_all_algorithms_info(args...), | |||||
m_fastrun_layouts); | |||||
bool found = false; | bool found = false; | ||||
for (size_t i = 0; i < ret.size(); ++i) { | for (size_t i = 0; i < ret.size(); ++i) { | ||||
if (ret[i].desc == heu.algo) { | if (ret[i].desc == heu.algo) { | ||||
@@ -637,7 +826,7 @@ AlgoChooser<Opr>::AlgoChooserHelper::profile_single_algo( | |||||
TimedProfiler<Opr>::Param::ExecutionPolicyBlob::serialize(policy); | TimedProfiler<Opr>::Param::ExecutionPolicyBlob::serialize(policy); | ||||
param.workspace = get_workspace_size_bytes(policy); | param.workspace = get_workspace_size_bytes(policy); | ||||
for (int i = 0; i < arity; ++i) { | for (int i = 0; i < arity; ++i) { | ||||
auto&& src = m_layouts[i]; | |||||
auto&& src = m_fastrun_layouts[i]; | |||||
bool cond_normal = src.format.is_default() && | bool cond_normal = src.format.is_default() && | ||||
(src.dtype.category() == DTypeCategory::FLOAT || | (src.dtype.category() == DTypeCategory::FLOAT || | ||||
src.dtype.category() == DTypeCategory::INT || | src.dtype.category() == DTypeCategory::INT || | ||||
@@ -655,9 +844,9 @@ AlgoChooser<Opr>::AlgoChooserHelper::profile_single_algo( | |||||
param.dtypes[i] = src.dtype.enumv(); | param.dtypes[i] = src.dtype.enumv(); | ||||
} | } | ||||
param.comp_node_loc = m_cn.locator(); | param.comp_node_loc = m_cn.locator(); | ||||
mgb_assert(param.shapes.size() == m_layouts.size()); | |||||
mgb_assert(param.shapes.size() == m_fastrun_layouts.size()); | |||||
for (size_t i = 0; i < param.shapes.size(); ++i) | for (size_t i = 0; i < param.shapes.size(); ++i) | ||||
param.shapes[i] = m_layouts[i]; | |||||
param.shapes[i] = m_fastrun_layouts[i]; | |||||
param.opr_param = m_dnn_opr->param(); | param.opr_param = m_dnn_opr->param(); | ||||
param.allow_weight_preprocess = m_allow_weight_preprocess; | param.allow_weight_preprocess = m_allow_weight_preprocess; | ||||
@@ -692,7 +881,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile( | |||||
auto target_attr = extract_algo_attribute(selected_strategy); | auto target_attr = extract_algo_attribute(selected_strategy); | ||||
std::string layouts_str = | std::string layouts_str = | ||||
format_fixlayouts<Opr>(m_layouts, arity_in, arity_out); | |||||
format_fixlayouts<Opr>(m_fastrun_layouts, arity_in, arity_out); | |||||
double cur_timeout = 0; | double cur_timeout = 0; | ||||
auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit( | auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit( | ||||
@@ -761,10 +950,10 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile( | |||||
workspace_limit); | workspace_limit); | ||||
mgb_assert(!prof_rst.empty(), "%s", msg.c_str()); | mgb_assert(!prof_rst.empty(), "%s", msg.c_str()); | ||||
FixedTensorLayouts origin_layouts = m_layouts; | |||||
FixedTensorLayouts incache_layouts = m_incache_layouts; | |||||
typename Opr::Param origin_param = m_dnn_opr->param(); | typename Opr::Param origin_param = m_dnn_opr->param(); | ||||
AlgoChooserProfileCache::Key cache_key{origin_layouts.data(), | |||||
origin_layouts.size(), &origin_param, | |||||
AlgoChooserProfileCache::Key cache_key{incache_layouts.data(), | |||||
incache_layouts.size(), &origin_param, | |||||
sizeof(origin_param)}; | sizeof(origin_param)}; | ||||
AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str()); | AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str()); | ||||
@@ -774,15 +963,20 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile( | |||||
template <typename Opr> | template <typename Opr> | ||||
Maybe<PreprocessFilter<Opr>> | Maybe<PreprocessFilter<Opr>> | ||||
AlgoChooser<Opr>::AlgoChooserHelper::construct_fake_preprocess_filter() const { | |||||
AlgoChooser<Opr>::AlgoChooserHelper::construct_fake_preprocess_filter( | |||||
const FixedTensorLayouts& layouts) const { | |||||
MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("construct_fake_preprocess_filter"))) | MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("construct_fake_preprocess_filter"))) | ||||
Maybe<PreprocessFilter<Opr>> result = None; | Maybe<PreprocessFilter<Opr>> result = None; | ||||
const FixedTensorLayouts* layouts_ptr = &m_fastrun_layouts; | |||||
if (layouts.at(0).ndim) { | |||||
layouts_ptr = &layouts; | |||||
} | |||||
if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) { | if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) { | ||||
if (!m_allow_weight_preprocess) | if (!m_allow_weight_preprocess) | ||||
return; | return; | ||||
auto opr = _(m_dnn_opr); | auto opr = _(m_dnn_opr); | ||||
auto layouts = APPLY(opr->deduce_preprocessed_filter_layout(args...), | auto layouts = APPLY(opr->deduce_preprocessed_filter_layout(args...), | ||||
m_layouts); | |||||
*layouts_ptr); | |||||
//! No preprocess layout means no need weight preprocess | //! No preprocess layout means no need weight preprocess | ||||
if (layouts.empty()) { | if (layouts.empty()) { | ||||
return; | return; | ||||
@@ -825,6 +1019,16 @@ AlgoChooser<Opr>::AlgoChooserHelper::extract_algo_attribute( | |||||
ret.second |= AlgoAttribute::NAIVE; | ret.second |= AlgoAttribute::NAIVE; | ||||
} | } | ||||
//! from graph option | |||||
if (owner_graph()->options().fast_run_config.shared_batch_size) { | |||||
ret.second |= AlgoAttribute::USABLE_DEPEND_ON_SHAPE; | |||||
} | |||||
if (owner_graph()->options().fast_run_config.binary_equal_between_batch) { | |||||
ret.first |= AlgoAttribute::REPRODUCIBLE; | |||||
ret.second |= AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||||
} | |||||
return ret; | return ret; | ||||
} | } | ||||
@@ -854,7 +1058,8 @@ AlgoChooser<Opr>::AlgoChooserHelper::extract_algo_attribute( | |||||
template size_t \ | template size_t \ | ||||
AlgoChooser<megdnn::Opr>::AlgoChooserHelper::get_workspace_size_bytes( \ | AlgoChooser<megdnn::Opr>::AlgoChooserHelper::get_workspace_size_bytes( \ | ||||
const typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy& \ | const typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy& \ | ||||
policy) const; \ | |||||
policy, \ | |||||
const FixedTensorLayouts& layouts) const; \ | |||||
template std::vector<typename AlgoChooser<megdnn::Opr>::ImplAlgo> \ | template std::vector<typename AlgoChooser<megdnn::Opr>::ImplAlgo> \ | ||||
AlgoChooser<megdnn::Opr>::AlgoChooserHelper::get_all_candidates() const; \ | AlgoChooser<megdnn::Opr>::AlgoChooserHelper::get_all_candidates() const; \ | ||||
template Maybe<AlgoChooserProfileCache::ResultEntry> \ | template Maybe<AlgoChooserProfileCache::ResultEntry> \ | ||||
@@ -942,10 +1147,11 @@ size_t AlgoChooser<Opr>::setup_algo(const FixedTensorLayouts& layouts, | |||||
if (!policy.algo.valid()) { | if (!policy.algo.valid()) { | ||||
policy = get_policy(helper); | policy = get_policy(helper); | ||||
} | } | ||||
size_t workspace = helper.get_workspace_size_bytes(policy); | |||||
size_t workspace = helper.get_workspace_size_bytes(policy, layouts); | |||||
std::string ret; | std::string ret; | ||||
ret.append(mgb_opr->dyn_typeinfo()->name); | ret.append(mgb_opr->dyn_typeinfo()->name); | ||||
ret.append(": tensor layouts"); | |||||
ret += format_fixlayouts<Opr>(layouts, arity_in, arity_out); | ret += format_fixlayouts<Opr>(layouts, arity_in, arity_out); | ||||
Algorithm* palgo = megdnn_opr->get_algorithm_from_desc(policy.algo); | Algorithm* palgo = megdnn_opr->get_algorithm_from_desc(policy.algo); | ||||
mgb_assert(palgo, "Unknown algo description"); | mgb_assert(palgo, "Unknown algo description"); | ||||
@@ -68,7 +68,10 @@ class AlgoChooser { | |||||
public: | public: | ||||
using FixedTensorLayouts = std::array<TensorLayout, arity>; | using FixedTensorLayouts = std::array<TensorLayout, arity>; | ||||
class AlgoChooserHelper { | class AlgoChooserHelper { | ||||
FixedTensorLayouts m_layouts; | |||||
//! fastrun layouts | |||||
FixedTensorLayouts m_fastrun_layouts; | |||||
//! layouts used when get and set cache item | |||||
FixedTensorLayouts m_incache_layouts; | |||||
Opr* m_dnn_opr; | Opr* m_dnn_opr; | ||||
std::string m_param; | std::string m_param; | ||||
const cg::OperatorNodeBase* m_base_mgb_opr; | const cg::OperatorNodeBase* m_base_mgb_opr; | ||||
@@ -89,7 +92,7 @@ public: | |||||
const cg::OperatorNodeBase* mgb_opr() const { return m_base_mgb_opr; } | const cg::OperatorNodeBase* mgb_opr() const { return m_base_mgb_opr; } | ||||
const TensorLayout& inp_layout(size_t idx) const { | const TensorLayout& inp_layout(size_t idx) const { | ||||
return m_layouts[idx]; | |||||
return m_fastrun_layouts[idx]; | |||||
} | } | ||||
cg::ComputingGraph* owner_graph() const { | cg::ComputingGraph* owner_graph() const { | ||||
return m_base_mgb_opr->owner_graph(); | return m_base_mgb_opr->owner_graph(); | ||||
@@ -109,7 +112,13 @@ public: | |||||
return m_dnn_opr->get_algorithm_from_desc(desc); | return m_dnn_opr->get_algorithm_from_desc(desc); | ||||
} | } | ||||
const FixedTensorLayouts& layouts() const { return m_layouts; } | |||||
const FixedTensorLayouts& fastrun_layouts() const { | |||||
return m_fastrun_layouts; | |||||
} | |||||
const FixedTensorLayouts& incache_layouts() const { | |||||
return m_incache_layouts; | |||||
} | |||||
//! construct algo chain by heuristic | //! construct algo chain by heuristic | ||||
ImplExecutionPolicy choose_by_heuristic( | ImplExecutionPolicy choose_by_heuristic( | ||||
@@ -141,7 +150,8 @@ public: | |||||
//! get workspace size required for specific execution policy | //! get workspace size required for specific execution policy | ||||
size_t get_workspace_size_bytes( | size_t get_workspace_size_bytes( | ||||
const ImplExecutionPolicy& policy) const; | |||||
const ImplExecutionPolicy& policy, | |||||
const FixedTensorLayouts& layouts = {}) const; | |||||
//! get all candidate algos, and the one choose_by_heuristic() is | //! get all candidate algos, and the one choose_by_heuristic() is | ||||
//! put first | //! put first | ||||
@@ -173,7 +183,8 @@ public: | |||||
const ExecutionStrategy& strategy) const; | const ExecutionStrategy& strategy) const; | ||||
private: | private: | ||||
Maybe<PreprocessFilter<Opr>> construct_fake_preprocess_filter() const; | |||||
Maybe<PreprocessFilter<Opr>> construct_fake_preprocess_filter( | |||||
const FixedTensorLayouts& layouts = {}) const; | |||||
}; | }; | ||||
template <typename U> | template <typename U> | ||||
@@ -54,11 +54,11 @@ constexpr bool opr_contain_bias() { | |||||
return std::is_same<Opr, megdnn::ConvBias>::value; | return std::is_same<Opr, megdnn::ConvBias>::value; | ||||
} | } | ||||
//! matmul and batchedMatrixMul may not be usable once shape changed | |||||
//! matmul and batchedMatrixMul | |||||
template <typename Opr> | template <typename Opr> | ||||
constexpr bool algo_usable_on_shape_change() { | |||||
return !(std::is_same<Opr, megdnn::MatrixMul>::value || | |||||
std::is_same<Opr, megdnn::BatchedMatrixMul>::value); | |||||
constexpr bool is_matmul() { | |||||
return std::is_same<Opr, megdnn::MatrixMul>::value || | |||||
std::is_same<Opr, megdnn::BatchedMatrixMul>::value; | |||||
} | } | ||||
template <typename Opr, bool has_prep> | template <typename Opr, bool has_prep> | ||||
@@ -0,0 +1,304 @@ | |||||
/** | |||||
* \file src/opr/test/algo_chooser.cpp | |||||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
* | |||||
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
* | |||||
* Unless required by applicable law or agreed to in writing, | |||||
* software distributed under the License is distributed on an | |||||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
*/ | |||||
#include "megbrain/comp_node_env.h" | |||||
#include "megbrain/opr/blas.h" | |||||
#include "megbrain/opr/dnn/convolution.h" | |||||
#include "megbrain/test/autocheck.h" | |||||
#include "megbrain/test/helper.h" | |||||
#include "megbrain/test/megdnn_helper.h" | |||||
#include "megbrain/serialization/serializer.h" | |||||
#include "megbrain/opr/basic_arith.h" | |||||
#include "megbrain/gopt/inference.h" | |||||
#include "megbrain/opr/tensor_manip.h" | |||||
#include "megdnn/oprs/base.h" | |||||
#include "megdnn/dtype.h" | |||||
#include <cmath> | |||||
#include <random> | |||||
#include <utility> | |||||
using namespace mgb; | |||||
namespace { | |||||
#if MGB_CUDA | |||||
#if MGB_ENABLE_FASTRUN | |||||
template <typename MgbOpr, int arith> | |||||
struct GraphMaker; | |||||
template <typename MgbOpr> | |||||
struct GraphMaker<MgbOpr, 2> { | |||||
SymbolVar operator()(const std::array<cg::SymbolVar, 2>& inputs, | |||||
typename MgbOpr::Param& param, | |||||
typename MgbOpr::ExecutionPolicy& policy) { | |||||
return MgbOpr::make(inputs[0], inputs[1], param, policy); | |||||
} | |||||
}; | |||||
template <> | |||||
struct GraphMaker<opr::ConvolutionBackwardData, 2> { | |||||
SymbolVar operator()( | |||||
const std::array<cg::SymbolVar, 2>& inputs, | |||||
opr::ConvolutionBackwardData::Param& param, | |||||
opr::ConvolutionBackwardData::ExecutionPolicy& policy) { | |||||
return opr::ConvolutionBackwardData::make_deconv(inputs[0], inputs[1], | |||||
param, policy); | |||||
} | |||||
}; | |||||
template <> | |||||
struct GraphMaker<opr::Convolution3DBackwardData, 2> { | |||||
SymbolVar operator()( | |||||
const std::array<cg::SymbolVar, 2>& inputs, | |||||
opr::Convolution3DBackwardData::Param& param, | |||||
opr::Convolution3DBackwardData::ExecutionPolicy& policy) { | |||||
return opr::Convolution3DBackwardData::make_deconv(inputs[0], inputs[1], | |||||
param, policy); | |||||
} | |||||
}; | |||||
template <typename MgbOpr> | |||||
struct GraphMaker<MgbOpr, 3> { | |||||
SymbolVar operator()(const std::array<cg::SymbolVar, 3>& inputs, | |||||
typename MgbOpr::Param& param, | |||||
typename MgbOpr::ExecutionPolicy& policy) { | |||||
return MgbOpr::make(inputs[0], inputs[1], inputs[2], param, policy, {}); | |||||
} | |||||
}; | |||||
template <typename MgbOpr> | |||||
struct GraphMaker<MgbOpr, 4> { | |||||
SymbolVar operator()(const std::array<cg::SymbolVar, 4>& inputs, | |||||
typename MgbOpr::Param& param, | |||||
typename MgbOpr::ExecutionPolicy& policy) { | |||||
return MgbOpr::make(inputs[0], inputs[1], inputs[2], inputs[3], param, | |||||
policy, {}); | |||||
} | |||||
}; | |||||
template <typename MgbOpr> | |||||
struct GraphMaker<MgbOpr, 5> { | |||||
SymbolVar operator()(const std::array<cg::SymbolVar, 5>& inputs, | |||||
typename MgbOpr::Param& param, | |||||
typename MgbOpr::ExecutionPolicy& policy) { | |||||
return MgbOpr::make(inputs[0], inputs[1], inputs[2], inputs[3], | |||||
inputs[4], param, policy, {}); | |||||
} | |||||
}; | |||||
template <typename MgbOpr, int arith, typename dtype = dtype::Float32> | |||||
void test_fastrun_opr(std::array<TensorShape, arith> inps0, | |||||
std::array<TensorShape, arith> inps1, | |||||
size_t expect_nr_cache_set_inp0 = 0, | |||||
size_t expect_nr_cache_set_inp1 = 0, | |||||
typename MgbOpr::Param param = {}) { | |||||
using Policy = opr::Convolution::ExecutionPolicy; | |||||
using S = Policy::Strategy; | |||||
using InputGenerator = std::function<void(HostTensorND & dest)>; | |||||
using ShapeInpArray = std::array<TensorShape, arith>; | |||||
using CacheMem = std::pair<const void*, size_t>; | |||||
auto on_get = [](const std::string&, const void*, size_t, const void*, | |||||
size_t) {}; | |||||
std::vector<std::pair<CacheMem, CacheMem>> cache_set_history; | |||||
auto on_set = [&cache_set_history](const std::string&, const void* key, | |||||
size_t key_size, const void* val, | |||||
size_t val_size) { | |||||
cache_set_history.emplace_back(std::make_pair(key, key_size), | |||||
std::make_pair(val, val_size)); | |||||
}; | |||||
PersistentCacheHook cache_hook{on_get, on_set}; | |||||
CompNode comp_node = CompNode::load("xpu0"); | |||||
GraphMaker<MgbOpr, arith> graph_maker; | |||||
auto run = [¶m, &comp_node, &graph_maker]( | |||||
const std::shared_ptr<cg::ComputingGraph>& graph, | |||||
const ShapeInpArray& shapes) { | |||||
std::array<InputGenerator, arith> inputs_generator; | |||||
std::array<std::shared_ptr<HostTensorND>, arith> inputs; | |||||
for (size_t i = 0; i < arith; ++i) { | |||||
inputs[i] = std::make_shared<HostTensorND>(comp_node, | |||||
dtype()); | |||||
} | |||||
HostTensorGenerator<dtype> gen_host; | |||||
for (size_t i = 0; i < arith; ++i) { | |||||
inputs[i]->resize(shapes[i]); | |||||
*inputs[i] = *gen_host(inputs[i]->shape(), comp_node); | |||||
mgb_assert(inputs[i]->shape().eq_shape(shapes[i])); | |||||
} | |||||
std::array<cg::SymbolVar, arith> sym_in; | |||||
for (size_t i = 0; i < arith; ++i) { | |||||
// to trigger graph trans | |||||
sym_in[i] = opr::Host2DeviceCopy::make(*graph, inputs[i], | |||||
ssprintf("inp%zu", i)); | |||||
} | |||||
Policy policy; | |||||
policy.strategy = S::PROFILE; | |||||
auto out = graph_maker(sym_in, param, policy); | |||||
std::unique_ptr<cg::AsyncExecutable> func = | |||||
graph->compile({{out, {}}}); | |||||
func->execute(); | |||||
}; | |||||
std::shared_ptr<cg::ComputingGraph> fastrun_ignore_batchsize_graph = | |||||
ComputingGraph::make(); | |||||
fastrun_ignore_batchsize_graph->options() | |||||
.fast_run_config.shared_batch_size = 20; | |||||
run(fastrun_ignore_batchsize_graph, inps0); | |||||
size_t nr_set_inp0 = cache_set_history.size(); | |||||
if (expect_nr_cache_set_inp0) { | |||||
ASSERT_EQ(cache_set_history.size(), expect_nr_cache_set_inp0); | |||||
} | |||||
run(fastrun_ignore_batchsize_graph, inps1); | |||||
size_t nr_set_total = expect_nr_cache_set_inp1 + nr_set_inp0; | |||||
ASSERT_EQ(cache_set_history.size(), nr_set_total); | |||||
} | |||||
TEST(TestOprDNN, FastrunIgnoreBatchSizeConvolution) { | |||||
REQUIRE_GPU(1); | |||||
test_fastrun_opr<opr::Convolution, 2>( | |||||
{TensorShape{12, 3, 36, 36}, TensorShape{4, 3, 3, 3}}, | |||||
{TensorShape{1, 3, 36, 36}, TensorShape{4, 3, 3, 3}}); | |||||
test_fastrun_opr<opr::ConvolutionBackwardData, 2>( | |||||
{TensorShape{12, 4, 23, 29}, TensorShape{4, 5, 3, 2}}, | |||||
{TensorShape{2, 4, 23, 29}, TensorShape{4, 5, 3, 2}}); | |||||
test_fastrun_opr<opr::ConvolutionBackwardFilter, 3>( | |||||
{TensorShape{12, 4, 23, 29}, TensorShape{12, 5, 21, 28}, | |||||
TensorShape{5, 4, 3, 2}}, | |||||
{TensorShape{2, 4, 23, 29}, TensorShape{2, 5, 21, 28}, | |||||
TensorShape{5, 4, 3, 2}}); | |||||
} | |||||
TEST(TestOprDNN, FastrunIgnoreBatchSizeConvBias) { | |||||
REQUIRE_GPU(1); | |||||
test_fastrun_opr<opr::ConvBias, 3>( | |||||
{TensorShape{20, 16, 50, 50}, TensorShape{24, 16, 3, 3}, | |||||
TensorShape{1, 24, 1, 1}}, | |||||
{TensorShape{1, 16, 50, 50}, TensorShape{24, 16, 3, 3}, | |||||
TensorShape{1, 24, 1, 1}}); | |||||
} | |||||
TEST(TestOprDNN, FastrunIgnoreBatchSizeConvolution3D) { | |||||
REQUIRE_GPU(1); | |||||
test_fastrun_opr<opr::Convolution3D, 2>( | |||||
{TensorShape{8, 4, 12, 13, 14}, TensorShape{4, 4, 3, 3, 3}}, | |||||
{TensorShape{3, 4, 12, 13, 14}, TensorShape{4, 4, 3, 3, 3}}); | |||||
test_fastrun_opr<opr::Convolution3DBackwardData, 2>( | |||||
{TensorShape{14, 5, 12, 12, 16}, TensorShape{5, 5, 3, 3, 3}}, | |||||
{TensorShape{4, 5, 12, 12, 16}, TensorShape{5, 5, 3, 3, 3}}); | |||||
test_fastrun_opr<opr::Convolution3DBackwardFilter, 3>( | |||||
{TensorShape{64, 16, 18, 18, 18}, TensorShape{64, 16, 18, 18, 18}, | |||||
TensorShape{16, 16, 1, 1, 1}}, | |||||
{TensorShape{4, 16, 18, 18, 18}, TensorShape{4, 16, 18, 18, 18}, | |||||
TensorShape{16, 16, 1, 1, 1}}); | |||||
} | |||||
TEST(TestOprDNN, FastrunIgnoreBatchSizeLocalShare) { | |||||
REQUIRE_GPU(1); | |||||
opr::LocalShare::Param local_share_param; | |||||
local_share_param.mode = opr::LocalShare::Param::Mode::CROSS_CORRELATION; | |||||
local_share_param.pad_h = local_share_param.pad_w = 1; | |||||
local_share_param.stride_h = local_share_param.stride_w = 1; | |||||
local_share_param.spatial_groups_h = local_share_param.spatial_groups_w = 2; | |||||
test_fastrun_opr<opr::LocalShareForward, 2>( | |||||
{TensorShape{32, 2, 23, 23}, TensorShape{2, 2, 2, 2, 2, 7}}, | |||||
{TensorShape{3, 2, 23, 23}, TensorShape{2, 2, 2, 2, 2, 7}}, 0, 0, | |||||
local_share_param); | |||||
test_fastrun_opr<opr::LocalShareBackwardData, 3>( | |||||
{TensorShape{3, 3, 128, 1, 1, 128}, TensorShape{32, 128, 24, 24}, | |||||
TensorShape{32, 128, 24, 24}}, | |||||
{TensorShape{3, 3, 128, 1, 1, 128}, TensorShape{2, 128, 24, 24}, | |||||
TensorShape{2, 128, 24, 24}}); | |||||
test_fastrun_opr<opr::LocalShareBackwardFilter, 3>( | |||||
{TensorShape{12, 3, 36, 36}, TensorShape{12, 4, 35, 35}, | |||||
TensorShape{3, 3, 3, 3, 3, 4}}, | |||||
{TensorShape{4, 3, 36, 36}, TensorShape{4, 4, 35, 35}, | |||||
TensorShape{3, 3, 3, 3, 3, 4}}); | |||||
} | |||||
TEST(TestOprDNN, FastrunIgnoreBatchSizeDeformableConv) { | |||||
REQUIRE_GPU(1); | |||||
test_fastrun_opr<opr::DeformableConvForward, 4>( | |||||
{TensorShape{12, 6, 20, 20}, TensorShape{6, 6, 3, 3}, | |||||
TensorShape{12, 18, 18, 18}, TensorShape{12, 9, 18, 18}}, | |||||
{TensorShape{4, 6, 20, 20}, TensorShape{6, 6, 3, 3}, | |||||
TensorShape{4, 18, 18, 18}, TensorShape{4, 9, 18, 18}}); | |||||
test_fastrun_opr<opr::DeformableConvBackwardData, 5>( | |||||
{TensorShape{12, 6, 20, 20}, TensorShape{6, 6, 3, 3}, | |||||
TensorShape{12, 18, 18, 18}, TensorShape{12, 9, 18, 18}, | |||||
TensorShape{12, 6, 18, 18}}, | |||||
{TensorShape{4, 6, 20, 20}, | |||||
TensorShape{6, 6, 3, 3}, | |||||
TensorShape{4, 18, 18, 18}, | |||||
TensorShape{4, 9, 18, 18}, | |||||
TensorShape{4, 6, 18, 18}}); | |||||
test_fastrun_opr<opr::DeformableConvBackwardFilter, 5>( | |||||
{TensorShape{12, 6, 20, 20}, TensorShape{6, 6, 3, 3}, | |||||
TensorShape{12, 18, 18, 18}, TensorShape{12, 9, 18, 18}, | |||||
TensorShape{12, 6, 18, 18}}, | |||||
{TensorShape{4, 6, 20, 20}, TensorShape{6, 6, 3, 3}, | |||||
TensorShape{4, 18, 18, 18}, TensorShape{4, 9, 18, 18}, | |||||
TensorShape{4, 6, 18, 18}}); | |||||
} | |||||
TEST(TestOprDNN, FastrunIgnoreBatchSizeMatrixMul) { | |||||
REQUIRE_GPU(1); | |||||
//! fastrun_shared_batch_size == 20 | |||||
//! {20(12), 12(1)}, {12(12), 20(1)} -> {20(12), 20(1)} origin | |||||
//! {12(10), 20(1)}, {12(12), 20(1)} -> {20(12), 20(1)} transA | |||||
//! {12(10), 20(1)}, {20(12), 12(1)} -> {20(12), 20(1)} transA, transB | |||||
//! {20(12), 12(1)}, {20(12), 12(1)} -> {20(12), 20(1)} transB | |||||
//! | |||||
//! {20(12), 12(1)}, {12(12), 20(1)} -> {20(12), 20(1)} origin duplicate | |||||
//! {12(4), 20(1)}, {12(12), 20(1)} -> {20(12), 20(1)} transA | |||||
//! {12(4), 20(1)}, {20(12), 12(1)} -> {20(12), 20(1)} transA, transB | |||||
//! {20(12), 12(1)}, {20(12), 12(1)} -> {20(12), 20(1)} transB duplicate | |||||
test_fastrun_opr<opr::MatrixMul, 2>( | |||||
{TensorShape{10, 12}, TensorShape{12, 12}}, | |||||
{TensorShape{4, 12}, TensorShape{12, 12}}, 4, 2); | |||||
} | |||||
TEST(TestOprDNN, FastrunIgnoreBatchSizeBatchedMatrixMul) { | |||||
REQUIRE_GPU(1); | |||||
//! fastrun_shared_batch_size == 20 | |||||
//! {20(48), 6(8), 8(1)}, {20(32), 8(4), 4(1)} -> {20(24), 6(4), 4(1)} origin | |||||
//! {20(48), 8(6), 6(1)}, {20(32), 8(4), 4(1)} -> {20(24), 6(4), 4(1)} transA | |||||
//! {20(48), 8(6), 6(1)}, {20(32), 4(8), 8(1)} -> {20(24), 6(4), 4(1)} transA, transB | |||||
//! {20(48), 6(8), 8(1)}, {20(32), 4(8), 8(1)} -> {20(24), 6(4), 4(1)} transB | |||||
//! | |||||
//! {20(48), 6(8), 8(1)}, {20(32), 8(4), 4(1)} -> {20(24), 6(4), 4(1)} origin duplicate | |||||
//! {20(48), 8(6), 6(1)}, {20(32), 8(4), 4(1)} -> {20(24), 6(4), 4(1)} transA duplicate | |||||
//! {20(48), 8(6), 6(1)}, {20(32), 4(8), 8(1)} -> {20(24), 6(4), 4(1)} transA, transB duplicate | |||||
//! {20(48), 6(8), 8(1)}, {20(32), 4(8), 8(1)} -> {20(24), 6(4), 4(1)} transB duplicate | |||||
test_fastrun_opr<opr::BatchedMatrixMul, 2>( | |||||
{TensorShape{12, 6, 8}, TensorShape{12, 8, 4}}, | |||||
{TensorShape{4, 6, 8}, TensorShape{4, 8, 4}}); | |||||
} | |||||
#endif // MGB_ENABLE_FASTRUN | |||||
#endif // MGB_CUDA | |||||
} // anonymous namespace | |||||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -460,12 +460,13 @@ mgb::make_callback_copy(SymbolVar dev, HostTensorND &host, bool sync) { | |||||
/* ========================== PersistentCacheHook ========================== */ | /* ========================== PersistentCacheHook ========================== */ | ||||
class PersistentCacheHook::HookedImpl final : public PersistentCache { | class PersistentCacheHook::HookedImpl final : public PersistentCache { | ||||
GetHook m_on_get; | |||||
Hook m_on_get, m_on_set; | |||||
public: | public: | ||||
std::shared_ptr<PersistentCache> orig_impl; | std::shared_ptr<PersistentCache> orig_impl; | ||||
HookedImpl(GetHook on_get) : m_on_get{std::move(on_get)} {} | |||||
HookedImpl(Hook on_get, Hook on_set) | |||||
: m_on_get{std::move(on_get)}, m_on_set{std::move(on_set)} {} | |||||
Maybe<Blob> get(const std::string& category, const Blob& key) override { | Maybe<Blob> get(const std::string& category, const Blob& key) override { | ||||
auto ret = orig_impl->get(category, key); | auto ret = orig_impl->get(category, key); | ||||
@@ -476,12 +477,18 @@ public: | |||||
void put(const std::string& category, const Blob& key, | void put(const std::string& category, const Blob& key, | ||||
const Blob& value) override { | const Blob& value) override { | ||||
m_on_set(category, key.ptr, key.size, value.ptr, | |||||
value.size); | |||||
orig_impl->put(category, key, value); | orig_impl->put(category, key, value); | ||||
} | } | ||||
}; | }; | ||||
PersistentCacheHook::PersistentCacheHook(GetHook on_get) | |||||
: m_impl{std::make_shared<HookedImpl>(std::move(on_get))} { | |||||
PersistentCacheHook::Hook PersistentCacheHook::default_set_hook = | |||||
[](const std::string&, const void*, size_t, const void*, size_t) {}; | |||||
PersistentCacheHook::PersistentCacheHook(Hook on_get, Hook on_set) | |||||
: m_impl{std::make_shared<HookedImpl>(std::move(on_get), | |||||
std::move(on_set))} { | |||||
m_impl->orig_impl = PersistentCache::set_impl(m_impl); | m_impl->orig_impl = PersistentCache::set_impl(m_impl); | ||||
} | } | ||||
@@ -512,17 +512,17 @@ bool check_device_type_avaiable(CompNode::DeviceType device_type); | |||||
//! hook persistent cache get calls during the lifetime | //! hook persistent cache get calls during the lifetime | ||||
class PersistentCacheHook { | class PersistentCacheHook { | ||||
class HookedImpl; | |||||
std::shared_ptr<HookedImpl> m_impl; | |||||
public: | public: | ||||
//! if value is not available, \p val and \p val_size would be zero | |||||
using GetHook = thin_function<void(const std::string& category, | |||||
const void* key, size_t key_size, | |||||
const void* val, size_t val_size)>; | |||||
PersistentCacheHook(GetHook on_get); | |||||
using Hook = thin_function<void(const std::string& category, | |||||
const void* key, size_t key_size, | |||||
const void* val, size_t val_size)>; | |||||
PersistentCacheHook(Hook on_get, Hook on_set = default_set_hook); | |||||
~PersistentCacheHook(); | ~PersistentCacheHook(); | ||||
private: | |||||
static Hook default_set_hook; | |||||
class HookedImpl; | |||||
std::shared_ptr<HookedImpl> m_impl; | |||||
}; | }; | ||||
//! skip a testcase if xpu not available | //! skip a testcase if xpu not available | ||||
#define REQUIRE_XPU(n) do { \ | #define REQUIRE_XPU(n) do { \ | ||||