GitOrigin-RevId: 94e333ec80
release-1.5
@@ -91,6 +91,11 @@ class MaxTensorDiff : public OperatorBase {
void check_exec(const TensorLayout& layout1,
const TensorLayout& layout2, size_t workspace_in_bytes);
};
bool check_bias_share_in_channel(const TensorLayout& bias,
const param::ConvBias::Format format);
} // namespace megdnn
#include "megdnn/internal/opr_header_epilogue.h"
@@ -318,36 +318,6 @@ void handle_bias_and_nonlinear(Handle* handle, param::ConvBias args,
megdnn_assert(false);
}
}
bool check_bias_share_in_channel(const TensorLayout& bias,
const param::ConvBias::Format format) {
bool share_in_channel = false;
if (format == param::ConvBias::Format::NCHW ||
format == param::ConvBias::Format::NCHW4_NCHW) {
share_in_channel = (bias.ndim == 4 && bias[0] == 1 && bias[2] == 1 &&
bias[3] == 1);
} else if (format == param::ConvBias::Format::NHWC) {
share_in_channel = (bias.ndim == 4 && bias[0] == 1 && bias[1] == 1 &&
bias[2] == 1);
} else if (format == param::ConvBias::Format::NCHW4 ||
format == param::ConvBias::Format::NCHW8 ||
format == param::ConvBias::Format::NCHW32 ||
format == param::ConvBias::Format::NCHW64 ||
format == param::ConvBias::Format::NCHW4_NCHW32 ||
format == param::ConvBias::Format::NCHW32_NCHW4) {
share_in_channel = (bias.ndim == 5 && bias[0] == 1 && bias[2] == 1 &&
bias[3] == 1);
} else if (format == param::ConvBias::Format::NHWCD4) {
share_in_channel = (bias.ndim == 5 && bias[0] == 1 && bias[1] == 1 &&
bias[3] == 1);
} else {
megdnn_assert(format == param::ConvBias::Format::CHWN4);
share_in_channel = (bias.ndim == 5 && bias[1] == 1 && bias[2] == 1 &&
bias[3] == 1);
}
return share_in_channel;
}
} // namespace megdnn
// vim: syntax=cpp.doxygen
@@ -22,8 +22,6 @@ void handle_bias_and_nonlinear(Handle* handle, param::ConvBias args,
const TensorND* dst_tensor,
const TensorND* bias_tensor);
bool check_bias_share_in_channel(const TensorLayout& bias,
const param::ConvBias::Format format);
} // namespace megdnn
// vim: syntax=cpp.doxygen
@@ -10,6 +10,7 @@
*/
#include "src/common/utils.h"
#include "megdnn/oprs/utils.h"
#include "megdnn/handle.h"
#include <cstdarg>
@@ -344,4 +345,33 @@ size_t& CpuNDRange::operator[](size_t idx) {
return m_dim[idx];
}
bool megdnn::check_bias_share_in_channel(const TensorLayout& bias,
const param::ConvBias::Format format) {
bool share_in_channel = false;
if (format == param::ConvBias::Format::NCHW ||
format == param::ConvBias::Format::NCHW4_NCHW) {
share_in_channel = (bias.ndim == 4 && bias[0] == 1 && bias[2] == 1 &&
bias[3] == 1);
} else if (format == param::ConvBias::Format::NHWC) {
share_in_channel = (bias.ndim == 4 && bias[0] == 1 && bias[1] == 1 &&
bias[2] == 1);
} else if (format == param::ConvBias::Format::NCHW4 ||
format == param::ConvBias::Format::NCHW8 ||
format == param::ConvBias::Format::NCHW32 ||
format == param::ConvBias::Format::NCHW64 ||
format == param::ConvBias::Format::NCHW4_NCHW32 ||
format == param::ConvBias::Format::NCHW32_NCHW4) {
share_in_channel = (bias.ndim == 5 && bias[0] == 1 && bias[2] == 1 &&
bias[3] == 1);
} else if (format == param::ConvBias::Format::NHWCD4) {
share_in_channel = (bias.ndim == 5 && bias[0] == 1 && bias[1] == 1 &&
bias[3] == 1);
} else {
megdnn_assert(format == param::ConvBias::Format::CHWN4);
share_in_channel = (bias.ndim == 5 && bias[1] == 1 && bias[2] == 1 &&
bias[3] == 1);
}
return share_in_channel;
}
// vim: syntax=cpp.doxygen
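The helper moves from src/common/conv_bias.cpp into the common utils so that code outside the conv-bias operator (notably the fastrun layout modifiers below) can reuse it. A minimal usage sketch, assuming megdnn's TensorLayout(TensorShape, DType) constructor; the shape values are illustrative:

```cpp
// A 4-D bias of shape {1, C, 1, 1} in NCHW broadcasts along the batch and
// spatial dimensions, so it counts as "shared in channel".
megdnn::TensorLayout bias{{1, 24, 1, 1}, megdnn::dtype::Float32()};
bool shared = megdnn::check_bias_share_in_channel(
        bias, megdnn::param::ConvBias::Format::NCHW);
// shared == true: ndim == 4 and dims 0, 2 and 3 are all 1.
```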
@@ -158,6 +158,11 @@ R"__usage__(
R"__usage__(
--fast-run-algo-policy <path>
It will read the cache file before profiling, and save new fastrun results into the cache file.
--fast-run-shared-batch-size
Set the batch size used during fastrun. Note that it may not be the same as the actual batch size at runtime.
--binary-equal-between-batch
Each batch of output is guaranteed to be binary equal if each batch of input is binary equal.
Note that if this option is turned on, `--reproducible` will also be turned on.
--reproducible
Enable choosing algos which are reproducible. It is mainly used for cudnn algos.
See https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#reproducibility
@@ -1356,6 +1361,20 @@ Args Args::from_argv(int argc, char **argv) {
ret.fast_run_cache_path = argv[i];
continue;
}
if (!strcmp(argv[i], "--fast-run-shared-batch-size")) {
++i;
mgb_assert(i < argc,
"value not given for --fast-run-shared-batch-size");
int32_t batch_size = std::stoi(argv[i]);
mgb_assert(batch_size >= 0);
graph_opt.fast_run_config.shared_batch_size = batch_size;
continue;
}
if (!strcmp(argv[i], "--binary-equal-between-batch")) {
graph_opt.fast_run_config.binary_equal_between_batch = true;
ret.reproducible = true;
continue;
}
if (!strcmp(argv[i], "--reproducible")) {
ret.reproducible = true;
continue;
@@ -1452,6 +1471,14 @@ Args Args::from_argv(int argc, char **argv) {
return ret;
}
#if MGB_ENABLE_FASTRUN
if (graph_opt.fast_run_config.shared_batch_size) {
mgb_assert(ret.use_fast_run || ret.use_full_run ||
!ret.fast_run_cache_path.empty(),
"--fast-run-shared-batch-size should be used with "
"--fast-run/--full-run/--fast-run-algo-policy");
}
#endif
return ret;
}
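Per the assertion above, the new flag only makes sense together with a fastrun mode. A hypothetical invocation (binary name, model path and cache path are placeholders):

```
load_and_run model.mgb --full-run --fast-run-algo-policy algo.cache \
    --fast-run-shared-batch-size 20 --binary-equal-between-batch
```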
@@ -502,7 +502,28 @@ class ComputingGraph : public std::enable_shared_from_this<ComputingGraph>,
//! contains any user data associated with this graph
UserDataContainer user_data;
}; // Options
//! Control parameters for fast run
struct FastRunConfig {
/*!
* the batch size used by fastrun
*
* A non-zero value means that fastrun uses this batch size
* regardless of the batch size of the model
*
* Zero means that fastrun uses the batch size of the model
*/
uint32_t shared_batch_size = 0;
/*!
* \brief if the content of each input batch is binary equal,
* whether the content of each output batch is guaranteed to be
* equal
*/
bool binary_equal_between_batch = false;
} fast_run_config;
}; // Options
Options& options() {
return m_options;
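A minimal user-side sketch, assuming the graph is built through the usual mgb::ComputingGraph API: profile every operator with a fixed batch of 20 regardless of the model batch size, and require bitwise-equal outputs for bitwise-equal input batches.

```cpp
auto graph = mgb::ComputingGraph::make();
graph->options().fast_run_config.shared_batch_size = 20;
graph->options().fast_run_config.binary_equal_between_batch = true;
```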
@@ -60,21 +60,31 @@ std::string profile_name(Opr* opr) {
template <typename Opr>
std::string format_fixlayouts(
const typename opr::AlgoChooser<Opr>::FixedTensorLayouts& layouts,
size_t arity_in, size_t arity_out) {
size_t arity_in, size_t arity_out,
const std::string& delimiter = " -> ") {
std::string ret;
ret.append(": tensor layouts(");
for (size_t i = 0; i < arity_in; ++i) {
if (i) {
ret.append(", ");
if (arity_in) {
ret.append("(");
for (size_t i = 0; i < arity_in; ++i) {
if (i) {
ret.append(", ");
}
ret.append(layouts[i].to_string() + " ");
}
ret.append(layouts[i].to_string() + " ");
ret.append(")");
}
if (arity_in && arity_out) {
ret.append(delimiter);
}
ret.append(") -> (");
for (size_t i = 0; i < arity_out; ++i) {
if (i) {
ret.append(", ");
if (arity_out) {
ret.append("(");
for (size_t i = 0; i < arity_out; ++i) {
if (i) {
ret.append(", ");
}
ret.append(layouts[i + arity_in].to_string() + " ");
}
ret.append(layouts[i + arity_in].to_string() + " ");
ret.append(")");
}
return ret;
}
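The reworked formatter prints a side only when it has tensors, and emits the delimiter only between two non-empty sides. A standalone sketch of the same logic (illustrative names, not the real AlgoChooser types):

```cpp
#include <array>
#include <cstdio>
#include <string>

std::string format_layouts(const std::array<std::string, 3>& layouts,
                           size_t arity_in, size_t arity_out,
                           const std::string& delimiter = " -> ") {
    std::string ret;
    if (arity_in) {
        ret.append("(");
        for (size_t i = 0; i < arity_in; ++i) {
            if (i) ret.append(", ");
            ret.append(layouts[i]);
        }
        ret.append(")");
    }
    // delimiter only appears between two non-empty sides
    if (arity_in && arity_out) ret.append(delimiter);
    if (arity_out) {
        ret.append("(");
        for (size_t i = 0; i < arity_out; ++i) {
            if (i) ret.append(", ");
            ret.append(layouts[i + arity_in]);
        }
        ret.append(")");
    }
    return ret;
}

int main() {
    // Prints: ({12,3,36,36}, {4,3,3,3}) -> ({12,4,34,34})
    std::printf("%s\n",
                format_layouts({"{12,3,36,36}", "{4,3,3,3}", "{12,4,34,34}"},
                               2, 1)
                        .c_str());
}
```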
@@ -247,7 +257,7 @@ std::vector<megdnn::Algorithm::SearchItem> flatten_search_space(
CircularDepsChecker& checker) {
auto&& search_item = megdnn::Algorithm::SearchItem{
OprTypeFromOprTrait<Opr>::opr_type, helper.param(),
to_layout_array<Opr>(helper.layouts())};
to_layout_array<Opr>(helper.fastrun_layouts())};
checker.put(search_item);
std::vector<megdnn::Algorithm::SearchItem> ret;
for (auto algo_info : helper.get_all_candidates()) {
@@ -255,8 +265,9 @@ std::vector<megdnn::Algorithm::SearchItem> flatten_search_space(
helper.get_algorithm_from_desc(algo_info.desc);
mgb_assert(algo, "Unknown algo description");
std::vector<megdnn::Algorithm::SearchItem>&& sub_items =
algo->get_subopr_list(to_layout_array<Opr>(helper.layouts()),
helper.megdnn_opr());
algo->get_subopr_list(
to_layout_array<Opr>(helper.fastrun_layouts()),
helper.megdnn_opr());
FOREACH_OPR_TYPE_DISPATCH(sub_items, {
auto&& megdnn_opr =
@@ -323,6 +334,166 @@ static Algorithm::Info::Desc deserialize_read_pod(const std::string& data,
namespace mgb {
namespace opr {
template <class Opr>
class LayoutsModifier {
using FixedTensorLayouts = typename AlgoChooser<Opr>::FixedTensorLayouts;
public:
static void on(FixedTensorLayouts&, const typename Opr::Param&, size_t) {}
private:
//! index of the batch dimension in a tensor, e.g. 3 for CHWN4
static size_t index_of_batch(const typename Opr::Param&) { return 0; }
//! indices of the inputs and outputs that contain a batch dimension,
//! e.g. src(0) and dst(2) for conv
static std::vector<size_t> sm_indices_contain_batch;
};
template <class Opr>
std::vector<size_t> LayoutsModifier<Opr>::sm_indices_contain_batch = {};
#define DEFAULT_OPR_WITHOUT_INPUT_BROADCAST(opr, idxs) \
template <> \
class LayoutsModifier<opr> { \
public: \
using FixedTensorLayouts = \
typename AlgoChooser<opr>::FixedTensorLayouts; \
static void on(FixedTensorLayouts& layouts, const opr::Param& param, \
size_t new_batch_size) { \
size_t batch_index = index_of_batch(param); \
for (size_t index : sm_indices_contain_batch) { \
layouts.at(index)[batch_index] = new_batch_size; \
} \
} \
\
private: \
static size_t index_of_batch(const opr::Param&) { return 0; } \
static std::vector<size_t> sm_indices_contain_batch; \
}; \
std::vector<size_t> LayoutsModifier<opr>::sm_indices_contain_batch = idxs;
DEFAULT_OPR_WITHOUT_INPUT_BROADCAST(megdnn::Convolution3DForward,
(std::initializer_list<size_t>{0, 2}))
DEFAULT_OPR_WITHOUT_INPUT_BROADCAST(megdnn::Convolution3DBackwardData,
(std::initializer_list<size_t>{1, 2}))
DEFAULT_OPR_WITHOUT_INPUT_BROADCAST(megdnn::Convolution3DBackwardFilter,
(std::initializer_list<size_t>{0, 1}))
DEFAULT_OPR_WITHOUT_INPUT_BROADCAST(megdnn::BatchedMatrixMul,
(std::initializer_list<size_t>{0, 1, 2}))
#undef DEFAULT_OPR_WITHOUT_INPUT_BROADCAST
#define CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(opr, idxs) \
template <> \
class LayoutsModifier<opr> { \
public: \
using FixedTensorLayouts = \
typename AlgoChooser<opr>::FixedTensorLayouts; \
static void on(FixedTensorLayouts& layouts, const opr::Param& param, \
size_t new_batch_size) { \
size_t batch_index = index_of_batch(param); \
for (size_t index : sm_indices_contain_batch) { \
layouts.at(index)[batch_index] = new_batch_size; \
} \
} \
\
private: \
static size_t index_of_batch(const opr::Param& param) { \
if (param.format == opr::Param::Format::CHWN4) { \
return 3; \
} \
return 0; \
} \
static std::vector<size_t> sm_indices_contain_batch; \
}; \
std::vector<size_t> LayoutsModifier<opr>::sm_indices_contain_batch = idxs;
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(megdnn::ConvolutionForward,
(std::initializer_list<size_t>{0, 2}))
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(megdnn::ConvolutionBackwardData,
(std::initializer_list<size_t>{1, 2}))
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(megdnn::ConvolutionBackwardFilter,
(std::initializer_list<size_t>{0, 1}))
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(megdnn::LocalShareForward,
(std::initializer_list<size_t>{0, 2}))
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(megdnn::LocalShareBackwardData,
(std::initializer_list<size_t>{1, 2}))
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(megdnn::LocalShareBackwardFilter,
(std::initializer_list<size_t>{0, 1}))
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(megdnn::DeformableConvForward,
(std::initializer_list<size_t>{0, 2, 3,
4}))
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(megdnn::DeformableConvBackwardData,
(std::initializer_list<size_t>{0, 2, 3, 4,
5, 6, 7}))
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(megdnn::DeformableConvBackwardFilter,
(std::initializer_list<size_t>{0, 1, 2,
3}))
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(megdnn::BatchConvBiasForward,
(std::initializer_list<size_t>{0, 1, 2, 3,
4}))
#undef CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST
template <>
class LayoutsModifier<megdnn::ConvBiasForward> {
public:
using FixedTensorLayouts =
typename AlgoChooser<megdnn::ConvBiasForward>::FixedTensorLayouts;
static void on(FixedTensorLayouts& layouts,
const megdnn::ConvBiasForward::Param& param,
size_t new_batch_size) {
size_t batch_index = index_of_batch(param);
for (size_t index : sm_indices_contain_batch) {
layouts.at(index)[batch_index] = new_batch_size;
}
for (size_t index : sm_indices_contain_batch_broadcast) {
if (!check_bias_share_in_channel(layouts.at(index), param.format)) {
layouts.at(index)[batch_index] = new_batch_size;
}
}
}
private:
static std::vector<size_t> sm_indices_contain_batch;
static std::vector<size_t> sm_indices_contain_batch_broadcast;
static size_t index_of_batch(const megdnn::ConvBiasForward::Param& param) {
if (param.format == megdnn::ConvBiasForward::Param::Format::CHWN4) {
return 3;
}
return 0;
}
};
std::vector<size_t>
LayoutsModifier<megdnn::ConvBiasForward>::sm_indices_contain_batch = {
0, 3, 4};
std::vector<size_t> LayoutsModifier<
megdnn::ConvBiasForward>::sm_indices_contain_batch_broadcast = {2};
template <>
class LayoutsModifier<megdnn::MatrixMul> {
public:
using FixedTensorLayouts =
typename AlgoChooser<megdnn::MatrixMul>::FixedTensorLayouts;
static void on(FixedTensorLayouts& layouts,
const megdnn::MatrixMul::Param& param,
size_t new_batch_size) {
//! Because we do not know whether the batch size is in the dimension m
//! or the dimension n, we just ignore both m and n here.
// FIXME Find a way to make mgb obtain batch size information from R or
// automatically
layouts.at(2)[0] = new_batch_size;
layouts.at(2)[1] = new_batch_size;
if (param.transposeA) {
layouts.at(0)[1] = new_batch_size;
} else {
layouts.at(0)[0] = new_batch_size;
}
if (param.transposeB) {
layouts.at(1)[0] = new_batch_size;
} else {
layouts.at(1)[1] = new_batch_size;
}
}
};
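Illustration of the modifier's effect (made-up shapes). With fast_run_config.shared_batch_size = 20, the AlgoChooserHelper below applies it twice: once with the shared batch size to build the layouts that are actually profiled, and once with batch 0 to build batch-agnostic cache keys.

```cpp
// ConvolutionForward in NCHW: the batch lives at index 0 of src (tensor 0)
// and dst (tensor 2); the filter (tensor 1) has no batch dimension.
//   src {12, 3, 36, 36} -> fastrun {20, 3, 36, 36}, in-cache {0, 3, 36, 36}
//   flt { 4, 3,  3,  3} -> unchanged
//   dst {12, 4, 34, 34} -> fastrun {20, 4, 34, 34}, in-cache {0, 4, 34, 34}
```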
///////////////////////////// AlgoChooserHelper //////////////////////////
template <typename Opr>
AlgoChooser<Opr>::AlgoChooserHelper::AlgoChooserHelper(
@@ -331,14 +502,25 @@ AlgoChooser<Opr>::AlgoChooserHelper::AlgoChooserHelper(
const CompNode& cn,
const megdnn::param::ExecutionPolicy& execution_policy,
bool allow_weight_preprocess)
: m_layouts{layouts},
: m_fastrun_layouts{layouts},
m_incache_layouts{layouts},
m_dnn_opr{megdnn_opr},
m_param{param_str},
m_base_mgb_opr{mgb_opr},
m_cn{cn},
m_execution_policy{execution_policy},
m_allow_weight_preprocess{allow_weight_preprocess} {
mgb_assert(m_layouts.size() == layouts.size());
auto fastrun_batch_size =
owner_graph()->options().fast_run_config.shared_batch_size;
if (fastrun_batch_size) {
LayoutsModifier<Opr>::on(m_incache_layouts, m_dnn_opr->param(), 0);
LayoutsModifier<Opr>::on(m_fastrun_layouts, m_dnn_opr->param(),
fastrun_batch_size);
}
mgb_assert(m_fastrun_layouts.size() == layouts.size());
static_assert(std::tuple_size<FixedTensorLayouts>::value == 3 ||
std::tuple_size<FixedTensorLayouts>::value == 5 ||
std::tuple_size<FixedTensorLayouts>::value == 8,
@@ -358,13 +540,13 @@ AlgoChooser<Opr>::AlgoChooserHelper::choose_by_heuristic(
policy.algo =
APPLY(m_dnn_opr->get_algorithm_info_heuristic(
args..., workspace_limit, attr.first, attr.second),
m_layouts)
m_fastrun_layouts)
.desc;
Algorithm* algo = m_dnn_opr->get_algorithm_from_desc(policy.algo);
mgb_assert(algo, "Unknown algo description");
std::vector<Algorithm::SearchItem>&& sub_items = algo->get_subopr_list(
to_layout_array<Opr>(m_layouts), m_dnn_opr);
to_layout_array<Opr>(m_fastrun_layouts), m_dnn_opr);
FOREACH_OPR_TYPE_DISPATCH(sub_items, {
auto&& megdnn_opr = intl::create_megdnn_opr<_Opr>(m_cn);
@@ -393,7 +575,7 @@ AlgoChooser<Opr>::AlgoChooserHelper::choose_by_profile(
if (policy.algo.valid()) {
return policy;
}
if (!algo_usable_on_shape_change<Opr>()) {
if (is_matmul<Opr>()) {
mgb_log_warn(
"choose algo by heuristic, which may cause performance "
"regression.");
@@ -442,7 +624,8 @@ AlgoChooser<Opr>::AlgoChooserHelper::get_profile_result_from_cache(
AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str());
typename Opr::Param origin_param = m_dnn_opr->param();
AlgoChooserProfileCache::Key cache_key{m_layouts.data(), m_layouts.size(),
AlgoChooserProfileCache::Key cache_key{m_incache_layouts.data(),
m_incache_layouts.size(),
&origin_param, sizeof(origin_param)};
auto&& rst = cache.get(cache_key);
if (!rst.valid())
@@ -472,21 +655,21 @@ AlgoChooser<Opr>::AlgoChooserHelper::get_profile_result_from_cache(
}
std::string layouts_str =
format_fixlayouts<Opr>(m_layouts, arity_in, arity_out);
format_fixlayouts<Opr>(m_fastrun_layouts, arity_in, arity_out);
if (skip_by_negative) {
mgb_log_error(
"opr: %s, layouts: %s, No usable algo. There are available algos match "
"opr: %s, layouts: %s, No usable algo. There are available "
"algos match " | |||
"positive strategy(%s), but filtered by negative stategy(%s).", | |||
m_base_mgb_opr->dyn_typeinfo()->name,
layouts_str.c_str(),
m_base_mgb_opr->dyn_typeinfo()->name, layouts_str.c_str(),
Algorithm::attribute_str(target_attr.first).c_str(),
Algorithm::attribute_str(target_attr.second).c_str());
} else {
mgb_log_error(
"opr: %s, layouts: %s, No usable algo. algos read from cache could not "
"opr: %s, layouts: %s, No usable algo. algos read from cache "
"could not "
"satisfy positive strategy(%s)",
m_base_mgb_opr->dyn_typeinfo()->name,
layouts_str.c_str(),
m_base_mgb_opr->dyn_typeinfo()->name, layouts_str.c_str(),
Algorithm::attribute_str(target_attr.first).c_str());
}
@@ -508,7 +691,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::construct_execution_policy(
auto target_attr =
extract_algo_attribute(selected_strategy);
std::string layouts_str = format_fixlayouts<Opr>(
m_layouts, arity_in, arity_out);
m_fastrun_layouts, arity_in, arity_out);
std::string msg = ssprintf(
"(opr : %s, layouts %s, with attribute(%s) and "
"without attribute(%s)",
@@ -535,7 +718,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::construct_execution_policy(
policy.algo = APPLY(m_dnn_opr->get_algorithm_info_heuristic(
args..., workspace_limit, attr.first,
attr.second),
m_layouts)
m_fastrun_layouts)
.desc;
mgb_assert(policy.algo.valid(),
"No algo found from heuristic with strategy %u and "
@@ -548,7 +731,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::construct_execution_policy(
Algorithm* algo = m_dnn_opr->get_algorithm_from_desc(policy.algo);
mgb_assert(algo, "Unknown algo description");
std::vector<Algorithm::SearchItem>&& sub_items = algo->get_subopr_list(
to_layout_array<Opr>(m_layouts), m_dnn_opr);
to_layout_array<Opr>(m_fastrun_layouts), m_dnn_opr);
FOREACH_OPR_TYPE_DISPATCH(sub_items, {
auto&& megdnn_opr = intl::create_megdnn_opr<_Opr>(m_cn);
@@ -575,26 +758,32 @@ void AlgoChooser<Opr>::AlgoChooserHelper::construct_execution_policy(
template <typename Opr>
size_t AlgoChooser<Opr>::AlgoChooserHelper::get_workspace_size_bytes(
const ImplExecutionPolicy& policy) const {
const ImplExecutionPolicy& policy,
const FixedTensorLayouts& layouts) const {
MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("get_workspace_size_bytes")))
m_dnn_opr->execution_policy() = policy;
size_t result;
const FixedTensorLayouts* layouts_ptr = &m_fastrun_layouts;
if (layouts.at(0).ndim) {
layouts_ptr = &layouts;
}
if_constexpr<opr_supports_preprocess<Opr>()>(
[&](auto _) {
auto&& opr = _(m_dnn_opr);
auto prep = this->construct_fake_preprocess_filter();
auto prep =
this->construct_fake_preprocess_filter(*layouts_ptr);
PreprocessFilter<Opr>* prep_ptr =
prep.valid() ? &prep.val() : nullptr;
result = std::max(
APPLY(opr->get_preprocess_workspace_in_bytes(args...),
m_layouts),
*layouts_ptr),
APPLY(opr->get_workspace_in_bytes(args..., prep_ptr),
m_layouts));
*layouts_ptr));
},
/* else */
[&](auto _) {
result = APPLY(_(m_dnn_opr)->get_workspace_in_bytes(args...),
m_layouts);
*layouts_ptr);
});
return result;
MIDOUT_E
@@ -605,8 +794,8 @@ std::vector<typename AlgoChooser<Opr>::ImplAlgo>
AlgoChooser<Opr>::AlgoChooserHelper::get_all_candidates() const {
MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("get_all_candidates")))
auto heu = choose_by_heuristic(m_execution_policy.strategy);
auto&& ret =
APPLY(m_dnn_opr->get_all_algorithms_info(args...), m_layouts);
auto&& ret = APPLY(m_dnn_opr->get_all_algorithms_info(args...),
m_fastrun_layouts);
bool found = false;
for (size_t i = 0; i < ret.size(); ++i) {
if (ret[i].desc == heu.algo) {
@@ -637,7 +826,7 @@ AlgoChooser<Opr>::AlgoChooserHelper::profile_single_algo(
TimedProfiler<Opr>::Param::ExecutionPolicyBlob::serialize(policy);
param.workspace = get_workspace_size_bytes(policy);
for (int i = 0; i < arity; ++i) {
auto&& src = m_layouts[i];
auto&& src = m_fastrun_layouts[i];
bool cond_normal = src.format.is_default() &&
(src.dtype.category() == DTypeCategory::FLOAT ||
src.dtype.category() == DTypeCategory::INT ||
@@ -655,9 +844,9 @@ AlgoChooser<Opr>::AlgoChooserHelper::profile_single_algo(
param.dtypes[i] = src.dtype.enumv();
}
param.comp_node_loc = m_cn.locator();
mgb_assert(param.shapes.size() == m_layouts.size());
mgb_assert(param.shapes.size() == m_fastrun_layouts.size());
for (size_t i = 0; i < param.shapes.size(); ++i)
param.shapes[i] = m_layouts[i];
param.shapes[i] = m_fastrun_layouts[i];
param.opr_param = m_dnn_opr->param();
param.allow_weight_preprocess = m_allow_weight_preprocess;
@@ -692,7 +881,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
auto target_attr = extract_algo_attribute(selected_strategy);
std::string layouts_str =
format_fixlayouts<Opr>(m_layouts, arity_in, arity_out);
format_fixlayouts<Opr>(m_fastrun_layouts, arity_in, arity_out);
double cur_timeout = 0;
auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit(
@@ -761,10 +950,10 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
workspace_limit);
mgb_assert(!prof_rst.empty(), "%s", msg.c_str());
FixedTensorLayouts origin_layouts = m_layouts;
FixedTensorLayouts incache_layouts = m_incache_layouts;
typename Opr::Param origin_param = m_dnn_opr->param();
AlgoChooserProfileCache::Key cache_key{origin_layouts.data(),
origin_layouts.size(), &origin_param,
AlgoChooserProfileCache::Key cache_key{incache_layouts.data(),
incache_layouts.size(), &origin_param,
sizeof(origin_param)};
AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str());
@@ -774,15 +963,20 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
template <typename Opr>
Maybe<PreprocessFilter<Opr>>
AlgoChooser<Opr>::AlgoChooserHelper::construct_fake_preprocess_filter() const {
AlgoChooser<Opr>::AlgoChooserHelper::construct_fake_preprocess_filter(
const FixedTensorLayouts& layouts) const {
MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("construct_fake_preprocess_filter")))
Maybe<PreprocessFilter<Opr>> result = None;
const FixedTensorLayouts* layouts_ptr = &m_fastrun_layouts;
if (layouts.at(0).ndim) {
layouts_ptr = &layouts;
}
if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) {
if (!m_allow_weight_preprocess)
return;
auto opr = _(m_dnn_opr);
auto layouts = APPLY(opr->deduce_preprocessed_filter_layout(args...),
m_layouts);
*layouts_ptr);
//! No preprocess layout means no need for weight preprocess
if (layouts.empty()) {
return;
@@ -825,6 +1019,16 @@ AlgoChooser<Opr>::AlgoChooserHelper::extract_algo_attribute(
ret.second |= AlgoAttribute::NAIVE;
}
//! from graph option
if (owner_graph()->options().fast_run_config.shared_batch_size) {
ret.second |= AlgoAttribute::USABLE_DEPEND_ON_SHAPE;
}
if (owner_graph()->options().fast_run_config.binary_equal_between_batch) {
ret.first |= AlgoAttribute::REPRODUCIBLE;
ret.second |= AlgoAttribute::ACCURACY_DEPEND_ON_BATCH;
}
return ret;
}
@@ -854,7 +1058,8 @@ AlgoChooser<Opr>::AlgoChooserHelper::extract_algo_attribute(
template size_t \
AlgoChooser<megdnn::Opr>::AlgoChooserHelper::get_workspace_size_bytes( \
const typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy& \
policy) const; \
policy, \
const FixedTensorLayouts& layouts) const; \
template std::vector<typename AlgoChooser<megdnn::Opr>::ImplAlgo> \
AlgoChooser<megdnn::Opr>::AlgoChooserHelper::get_all_candidates() const; \
template Maybe<AlgoChooserProfileCache::ResultEntry> \
@@ -942,10 +1147,11 @@ size_t AlgoChooser<Opr>::setup_algo(const FixedTensorLayouts& layouts,
if (!policy.algo.valid()) {
policy = get_policy(helper);
}
size_t workspace = helper.get_workspace_size_bytes(policy);
size_t workspace = helper.get_workspace_size_bytes(policy, layouts);
std::string ret;
ret.append(mgb_opr->dyn_typeinfo()->name);
ret.append(": tensor layouts");
ret += format_fixlayouts<Opr>(layouts, arity_in, arity_out);
Algorithm* palgo = megdnn_opr->get_algorithm_from_desc(policy.algo);
mgb_assert(palgo, "Unknown algo description");
@@ -68,7 +68,10 @@ class AlgoChooser {
public:
using FixedTensorLayouts = std::array<TensorLayout, arity>;
class AlgoChooserHelper {
FixedTensorLayouts m_layouts;
//! fastrun layouts
FixedTensorLayouts m_fastrun_layouts;
//! layouts used when getting and setting cache items
FixedTensorLayouts m_incache_layouts;
Opr* m_dnn_opr;
std::string m_param;
const cg::OperatorNodeBase* m_base_mgb_opr;
@@ -89,7 +92,7 @@ public:
const cg::OperatorNodeBase* mgb_opr() const { return m_base_mgb_opr; }
const TensorLayout& inp_layout(size_t idx) const {
return m_layouts[idx];
return m_fastrun_layouts[idx];
}
cg::ComputingGraph* owner_graph() const {
return m_base_mgb_opr->owner_graph();
@@ -109,7 +112,13 @@ public:
return m_dnn_opr->get_algorithm_from_desc(desc);
}
const FixedTensorLayouts& layouts() const { return m_layouts; }
const FixedTensorLayouts& fastrun_layouts() const {
return m_fastrun_layouts;
}
const FixedTensorLayouts& incache_layouts() const {
return m_incache_layouts;
}
//! construct algo chain by heuristic
ImplExecutionPolicy choose_by_heuristic(
@@ -141,7 +150,8 @@ public:
//! get workspace size required for specific execution policy
size_t get_workspace_size_bytes(
const ImplExecutionPolicy& policy) const;
const ImplExecutionPolicy& policy,
const FixedTensorLayouts& layouts = {}) const;
//! get all candidate algos, and the one choose_by_heuristic() is
//! put first
@@ -173,7 +183,8 @@ public:
const ExecutionStrategy& strategy) const;
private:
Maybe<PreprocessFilter<Opr>> construct_fake_preprocess_filter() const;
Maybe<PreprocessFilter<Opr>> construct_fake_preprocess_filter(
const FixedTensorLayouts& layouts = {}) const;
};
template <typename U>
@@ -54,11 +54,11 @@ constexpr bool opr_contain_bias() {
return std::is_same<Opr, megdnn::ConvBias>::value;
}
//! matmul and batchedMatrixMul may not be usable once shape changed
//! matmul and batchedMatrixMul
template <typename Opr>
constexpr bool algo_usable_on_shape_change() {
return !(std::is_same<Opr, megdnn::MatrixMul>::value ||
std::is_same<Opr, megdnn::BatchedMatrixMul>::value);
constexpr bool is_matmul() {
return std::is_same<Opr, megdnn::MatrixMul>::value ||
std::is_same<Opr, megdnn::BatchedMatrixMul>::value;
}
template <typename Opr, bool has_prep>
@@ -0,0 +1,304 @@
/**
* \file src/opr/test/algo_chooser.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
*/
#include "megbrain/comp_node_env.h"
#include "megbrain/opr/blas.h"
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/test/autocheck.h"
#include "megbrain/test/helper.h"
#include "megbrain/test/megdnn_helper.h"
#include "megbrain/serialization/serializer.h"
#include "megbrain/opr/basic_arith.h"
#include "megbrain/gopt/inference.h"
#include "megbrain/opr/tensor_manip.h"
#include "megdnn/oprs/base.h"
#include "megdnn/dtype.h"
#include <cmath>
#include <random>
#include <utility>
using namespace mgb;
namespace {
#if MGB_CUDA
#if MGB_ENABLE_FASTRUN
template <typename MgbOpr, int arith>
struct GraphMaker;
template <typename MgbOpr>
struct GraphMaker<MgbOpr, 2> {
SymbolVar operator()(const std::array<cg::SymbolVar, 2>& inputs,
typename MgbOpr::Param& param,
typename MgbOpr::ExecutionPolicy& policy) {
return MgbOpr::make(inputs[0], inputs[1], param, policy);
}
};
template <>
struct GraphMaker<opr::ConvolutionBackwardData, 2> {
SymbolVar operator()(
const std::array<cg::SymbolVar, 2>& inputs,
opr::ConvolutionBackwardData::Param& param,
opr::ConvolutionBackwardData::ExecutionPolicy& policy) {
return opr::ConvolutionBackwardData::make_deconv(inputs[0], inputs[1],
param, policy);
}
};
template <>
struct GraphMaker<opr::Convolution3DBackwardData, 2> {
SymbolVar operator()(
const std::array<cg::SymbolVar, 2>& inputs,
opr::Convolution3DBackwardData::Param& param,
opr::Convolution3DBackwardData::ExecutionPolicy& policy) {
return opr::Convolution3DBackwardData::make_deconv(inputs[0], inputs[1],
param, policy);
}
};
template <typename MgbOpr>
struct GraphMaker<MgbOpr, 3> {
SymbolVar operator()(const std::array<cg::SymbolVar, 3>& inputs,
typename MgbOpr::Param& param,
typename MgbOpr::ExecutionPolicy& policy) {
return MgbOpr::make(inputs[0], inputs[1], inputs[2], param, policy, {});
}
};
template <typename MgbOpr>
struct GraphMaker<MgbOpr, 4> {
SymbolVar operator()(const std::array<cg::SymbolVar, 4>& inputs,
typename MgbOpr::Param& param,
typename MgbOpr::ExecutionPolicy& policy) {
return MgbOpr::make(inputs[0], inputs[1], inputs[2], inputs[3], param,
policy, {});
}
};
template <typename MgbOpr>
struct GraphMaker<MgbOpr, 5> {
SymbolVar operator()(const std::array<cg::SymbolVar, 5>& inputs,
typename MgbOpr::Param& param,
typename MgbOpr::ExecutionPolicy& policy) {
return MgbOpr::make(inputs[0], inputs[1], inputs[2], inputs[3],
inputs[4], param, policy, {});
}
};
template <typename MgbOpr, int arith, typename dtype = dtype::Float32>
void test_fastrun_opr(std::array<TensorShape, arith> inps0,
std::array<TensorShape, arith> inps1,
size_t expect_nr_cache_set_inp0 = 0,
size_t expect_nr_cache_set_inp1 = 0,
typename MgbOpr::Param param = {}) {
using Policy = opr::Convolution::ExecutionPolicy;
using S = Policy::Strategy;
using InputGenerator = std::function<void(HostTensorND& dest)>;
using ShapeInpArray = std::array<TensorShape, arith>;
using CacheMem = std::pair<const void*, size_t>;
auto on_get = [](const std::string&, const void*, size_t, const void*,
size_t) {};
std::vector<std::pair<CacheMem, CacheMem>> cache_set_history;
auto on_set = [&cache_set_history](const std::string&, const void* key,
size_t key_size, const void* val,
size_t val_size) {
cache_set_history.emplace_back(std::make_pair(key, key_size),
std::make_pair(val, val_size));
};
PersistentCacheHook cache_hook{on_get, on_set};
CompNode comp_node = CompNode::load("xpu0");
GraphMaker<MgbOpr, arith> graph_maker;
auto run = [&param, &comp_node, &graph_maker](
const std::shared_ptr<cg::ComputingGraph>& graph,
const ShapeInpArray& shapes) {
std::array<InputGenerator, arith> inputs_generator;
std::array<std::shared_ptr<HostTensorND>, arith> inputs;
for (size_t i = 0; i < arith; ++i) {
inputs[i] = std::make_shared<HostTensorND>(comp_node,
dtype());
}
HostTensorGenerator<dtype> gen_host;
for (size_t i = 0; i < arith; ++i) {
inputs[i]->resize(shapes[i]);
*inputs[i] = *gen_host(inputs[i]->shape(), comp_node);
mgb_assert(inputs[i]->shape().eq_shape(shapes[i]));
}
std::array<cg::SymbolVar, arith> sym_in;
for (size_t i = 0; i < arith; ++i) {
// to trigger graph trans
sym_in[i] = opr::Host2DeviceCopy::make(*graph, inputs[i],
ssprintf("inp%zu", i));
}
Policy policy;
policy.strategy = S::PROFILE;
auto out = graph_maker(sym_in, param, policy);
std::unique_ptr<cg::AsyncExecutable> func =
graph->compile({{out, {}}});
func->execute();
};
std::shared_ptr<cg::ComputingGraph> fastrun_ignore_batchsize_graph =
ComputingGraph::make();
fastrun_ignore_batchsize_graph->options()
.fast_run_config.shared_batch_size = 20;
run(fastrun_ignore_batchsize_graph, inps0);
size_t nr_set_inp0 = cache_set_history.size();
if (expect_nr_cache_set_inp0) {
ASSERT_EQ(cache_set_history.size(), expect_nr_cache_set_inp0);
}
run(fastrun_ignore_batchsize_graph, inps1);
size_t nr_set_total = expect_nr_cache_set_inp1 + nr_set_inp0;
ASSERT_EQ(cache_set_history.size(), nr_set_total);
}
TEST(TestOprDNN, FastrunIgnoreBatchSizeConvolution) {
REQUIRE_GPU(1);
test_fastrun_opr<opr::Convolution, 2>(
{TensorShape{12, 3, 36, 36}, TensorShape{4, 3, 3, 3}},
{TensorShape{1, 3, 36, 36}, TensorShape{4, 3, 3, 3}});
test_fastrun_opr<opr::ConvolutionBackwardData, 2>(
{TensorShape{12, 4, 23, 29}, TensorShape{4, 5, 3, 2}},
{TensorShape{2, 4, 23, 29}, TensorShape{4, 5, 3, 2}});
test_fastrun_opr<opr::ConvolutionBackwardFilter, 3>(
{TensorShape{12, 4, 23, 29}, TensorShape{12, 5, 21, 28},
TensorShape{5, 4, 3, 2}},
{TensorShape{2, 4, 23, 29}, TensorShape{2, 5, 21, 28},
TensorShape{5, 4, 3, 2}});
}
TEST(TestOprDNN, FastrunIgnoreBatchSizeConvBias) {
REQUIRE_GPU(1);
test_fastrun_opr<opr::ConvBias, 3>(
{TensorShape{20, 16, 50, 50}, TensorShape{24, 16, 3, 3},
TensorShape{1, 24, 1, 1}},
{TensorShape{1, 16, 50, 50}, TensorShape{24, 16, 3, 3},
TensorShape{1, 24, 1, 1}});
}
TEST(TestOprDNN, FastrunIgnoreBatchSizeConvolution3D) {
REQUIRE_GPU(1);
test_fastrun_opr<opr::Convolution3D, 2>(
{TensorShape{8, 4, 12, 13, 14}, TensorShape{4, 4, 3, 3, 3}},
{TensorShape{3, 4, 12, 13, 14}, TensorShape{4, 4, 3, 3, 3}});
test_fastrun_opr<opr::Convolution3DBackwardData, 2>(
{TensorShape{14, 5, 12, 12, 16}, TensorShape{5, 5, 3, 3, 3}},
{TensorShape{4, 5, 12, 12, 16}, TensorShape{5, 5, 3, 3, 3}});
test_fastrun_opr<opr::Convolution3DBackwardFilter, 3>(
{TensorShape{64, 16, 18, 18, 18}, TensorShape{64, 16, 18, 18, 18},
TensorShape{16, 16, 1, 1, 1}},
{TensorShape{4, 16, 18, 18, 18}, TensorShape{4, 16, 18, 18, 18},
TensorShape{16, 16, 1, 1, 1}});
}
TEST(TestOprDNN, FastrunIgnoreBatchSizeLocalShare) {
REQUIRE_GPU(1);
opr::LocalShare::Param local_share_param;
local_share_param.mode = opr::LocalShare::Param::Mode::CROSS_CORRELATION;
local_share_param.pad_h = local_share_param.pad_w = 1;
local_share_param.stride_h = local_share_param.stride_w = 1;
local_share_param.spatial_groups_h = local_share_param.spatial_groups_w = 2;
test_fastrun_opr<opr::LocalShareForward, 2>(
{TensorShape{32, 2, 23, 23}, TensorShape{2, 2, 2, 2, 2, 7}},
{TensorShape{3, 2, 23, 23}, TensorShape{2, 2, 2, 2, 2, 7}}, 0, 0,
local_share_param);
test_fastrun_opr<opr::LocalShareBackwardData, 3>(
{TensorShape{3, 3, 128, 1, 1, 128}, TensorShape{32, 128, 24, 24},
TensorShape{32, 128, 24, 24}},
{TensorShape{3, 3, 128, 1, 1, 128}, TensorShape{2, 128, 24, 24},
TensorShape{2, 128, 24, 24}});
test_fastrun_opr<opr::LocalShareBackwardFilter, 3>(
{TensorShape{12, 3, 36, 36}, TensorShape{12, 4, 35, 35},
TensorShape{3, 3, 3, 3, 3, 4}},
{TensorShape{4, 3, 36, 36}, TensorShape{4, 4, 35, 35},
TensorShape{3, 3, 3, 3, 3, 4}});
}
TEST(TestOprDNN, FastrunIgnoreBatchSizeDeformableConv) {
REQUIRE_GPU(1);
test_fastrun_opr<opr::DeformableConvForward, 4>(
{TensorShape{12, 6, 20, 20}, TensorShape{6, 6, 3, 3},
TensorShape{12, 18, 18, 18}, TensorShape{12, 9, 18, 18}},
{TensorShape{4, 6, 20, 20}, TensorShape{6, 6, 3, 3},
TensorShape{4, 18, 18, 18}, TensorShape{4, 9, 18, 18}});
test_fastrun_opr<opr::DeformableConvBackwardData, 5>(
{TensorShape{12, 6, 20, 20}, TensorShape{6, 6, 3, 3},
TensorShape{12, 18, 18, 18}, TensorShape{12, 9, 18, 18},
TensorShape{12, 6, 18, 18}},
{TensorShape{4, 6, 20, 20},
TensorShape{6, 6, 3, 3},
TensorShape{4, 18, 18, 18},
TensorShape{4, 9, 18, 18},
TensorShape{4, 6, 18, 18}});
test_fastrun_opr<opr::DeformableConvBackwardFilter, 5>(
{TensorShape{12, 6, 20, 20}, TensorShape{6, 6, 3, 3},
TensorShape{12, 18, 18, 18}, TensorShape{12, 9, 18, 18},
TensorShape{12, 6, 18, 18}},
{TensorShape{4, 6, 20, 20}, TensorShape{6, 6, 3, 3},
TensorShape{4, 18, 18, 18}, TensorShape{4, 9, 18, 18},
TensorShape{4, 6, 18, 18}});
}
TEST(TestOprDNN, FastrunIgnoreBatchSizeMatrixMul) {
REQUIRE_GPU(1);
//! fastrun_shared_batch_size == 20
//! {20(12), 12(1)}, {12(12), 20(1)} -> {20(12), 20(1)} origin
//! {12(10), 20(1)}, {12(12), 20(1)} -> {20(12), 20(1)} transA
//! {12(10), 20(1)}, {20(12), 12(1)} -> {20(12), 20(1)} transA, transB
//! {20(12), 12(1)}, {20(12), 12(1)} -> {20(12), 20(1)} transB
//!
//! {20(12), 12(1)}, {12(12), 20(1)} -> {20(12), 20(1)} origin duplicate
//! {12(4), 20(1)}, {12(12), 20(1)} -> {20(12), 20(1)} transA
//! {12(4), 20(1)}, {20(12), 12(1)} -> {20(12), 20(1)} transA, transB
//! {20(12), 12(1)}, {20(12), 12(1)} -> {20(12), 20(1)} transB duplicate
test_fastrun_opr<opr::MatrixMul, 2>(
{TensorShape{10, 12}, TensorShape{12, 12}},
{TensorShape{4, 12}, TensorShape{12, 12}}, 4, 2);
}
TEST(TestOprDNN, FastrunIgnoreBatchSizeBatchedMatrixMul) {
REQUIRE_GPU(1);
//! fastrun_shared_batch_size == 20
//! {20(48), 6(8), 8(1)}, {20(32), 8(4), 4(1)} -> {20(24), 6(4), 4(1)} origin
//! {20(48), 8(6), 6(1)}, {20(32), 8(4), 4(1)} -> {20(24), 6(4), 4(1)} transA
//! {20(48), 8(6), 6(1)}, {20(32), 4(8), 8(1)} -> {20(24), 6(4), 4(1)} transA, transB
//! {20(48), 6(8), 8(1)}, {20(32), 4(8), 8(1)} -> {20(24), 6(4), 4(1)} transB
//!
//! {20(48), 6(8), 8(1)}, {20(32), 8(4), 4(1)} -> {20(24), 6(4), 4(1)} origin duplicate
//! {20(48), 8(6), 6(1)}, {20(32), 8(4), 4(1)} -> {20(24), 6(4), 4(1)} transA duplicate
//! {20(48), 8(6), 6(1)}, {20(32), 4(8), 8(1)} -> {20(24), 6(4), 4(1)} transA, transB duplicate
//! {20(48), 6(8), 8(1)}, {20(32), 4(8), 8(1)} -> {20(24), 6(4), 4(1)} transB duplicate
test_fastrun_opr<opr::BatchedMatrixMul, 2>(
{TensorShape{12, 6, 8}, TensorShape{12, 8, 4}},
{TensorShape{4, 6, 8}, TensorShape{4, 8, 4}});
}
#endif // MGB_ENABLE_FASTRUN
#endif // MGB_CUDA
} // anonymous namespace
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
@@ -460,12 +460,13 @@ mgb::make_callback_copy(SymbolVar dev, HostTensorND &host, bool sync) {
/* ========================== PersistentCacheHook ========================== */
class PersistentCacheHook::HookedImpl final : public PersistentCache {
GetHook m_on_get;
Hook m_on_get, m_on_set;
public:
std::shared_ptr<PersistentCache> orig_impl;
HookedImpl(GetHook on_get) : m_on_get{std::move(on_get)} {}
HookedImpl(Hook on_get, Hook on_set)
: m_on_get{std::move(on_get)}, m_on_set{std::move(on_set)} {}
Maybe<Blob> get(const std::string& category, const Blob& key) override {
auto ret = orig_impl->get(category, key);
@@ -476,12 +477,18 @@ public:
void put(const std::string& category, const Blob& key,
const Blob& value) override {
m_on_set(category, key.ptr, key.size, value.ptr,
value.size);
orig_impl->put(category, key, value);
}
};
PersistentCacheHook::PersistentCacheHook(GetHook on_get)
: m_impl{std::make_shared<HookedImpl>(std::move(on_get))} {
PersistentCacheHook::Hook PersistentCacheHook::default_set_hook =
[](const std::string&, const void*, size_t, const void*, size_t) {};
PersistentCacheHook::PersistentCacheHook(Hook on_get, Hook on_set)
: m_impl{std::make_shared<HookedImpl>(std::move(on_get),
std::move(on_set))} {
m_impl->orig_impl = PersistentCache::set_impl(m_impl);
}
@@ -512,17 +512,17 @@ bool check_device_type_avaiable(CompNode::DeviceType device_type);
//! hook persistent cache get calls during the lifetime
class PersistentCacheHook {
class HookedImpl;
std::shared_ptr<HookedImpl> m_impl;
public:
//! if value is not available, \p val and \p val_size would be zero
using GetHook = thin_function<void(const std::string& category,
const void* key, size_t key_size,
const void* val, size_t val_size)>;
PersistentCacheHook(GetHook on_get);
using Hook = thin_function<void(const std::string& category,
const void* key, size_t key_size,
const void* val, size_t val_size)>;
PersistentCacheHook(Hook on_get, Hook on_set = default_set_hook);
~PersistentCacheHook();
private:
static Hook default_set_hook;
class HookedImpl;
std::shared_ptr<HookedImpl> m_impl;
};
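A sketch of how a test can use the extended hook to observe cache writes (this mirrors the pattern in the new algo_chooser test above; the surrounding graph setup is omitted):

```cpp
// Count PersistentCache::put calls while a graph compiled with
// Strategy::PROFILE executes; reads are ignored here.
size_t nr_set = 0;
auto on_get = [](const std::string&, const void*, size_t, const void*,
                 size_t) {};
auto on_set = [&nr_set](const std::string&, const void*, size_t, const void*,
                        size_t) { ++nr_set; };
PersistentCacheHook cache_hook{on_get, on_set};
// ... build, compile and run the graph ...
// nr_set now equals the number of cache writes triggered by profiling.
```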
//! skip a testcase if xpu not available
#define REQUIRE_XPU(n) do { \ | |||