GitOrigin-RevId: ca860f487e
release-0.6
@@ -234,10 +234,10 @@ public: | |||
const TensorLayout& dst) = 0; | |||
protected: | |||
CanonizedFilterMeta check_exec(const TensorLayout& src, | |||
const TensorLayout& filter, | |||
const TensorLayout& dst, | |||
size_t workspace_in_bytes); | |||
CanonizedFilterMeta check_exec( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& dst, size_t workspace_in_bytes, | |||
const PreprocessedFilter* preprocessed_filter); | |||
}; | |||
using Convolution = ConvolutionForward; | |||
@@ -408,12 +408,11 @@ public: | |||
static WinogradParam parse_winograd_name(const std::string& algo_name); | |||
protected: | |||
CanonizedFilterMeta check_exec(const TensorLayout& src, | |||
const TensorLayout& filter, | |||
const TensorLayout& bias, | |||
const TensorLayout& z, | |||
const TensorLayout& dst, | |||
size_t workspace_in_bytes); | |||
CanonizedFilterMeta check_exec( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& bias, const TensorLayout& z, | |||
const TensorLayout& dst, size_t workspace_in_bytes, | |||
const PreprocessedFilter* preprocessed_filter); | |||
}; | |||
using ConvBias = ConvBiasForward; | |||
@@ -32,7 +32,8 @@ void ConvBiasForward::deduce_layout(const TensorLayout& src, | |||
ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& bias, const TensorLayout& z, | |||
const TensorLayout& dst, size_t workspace_in_bytes) { | |||
const TensorLayout& dst, size_t workspace_in_bytes, | |||
const PreprocessedFilter* preprocessed_filter) { | |||
if ((param().format == param::ConvBias::Format::NCHW_WINOGRAD || | |||
param().format == param::ConvBias::Format::NCHW88_WINOGRAD || | |||
param().format == param::ConvBias::Format::NCHW44_WINOGRAD) && | |||
@@ -82,9 +83,11 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec( | |||
auto ret = check_layout_fwd(src, filter, dst); | |||
megdnn_assert_contiguous(bias); | |||
auto required_workspace_in_bytes = | |||
get_workspace_in_bytes(src, filter, bias, z, dst, nullptr); | |||
megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); | |||
auto required_workspace_in_bytes = get_workspace_in_bytes( | |||
src, filter, bias, z, dst, preprocessed_filter); | |||
    megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes,
                  "workspace size is %zu, but %zu is required",
                  workspace_in_bytes, required_workspace_in_bytes);
if (bias.ndim != 0) { | |||
//! bias.layout == dst.layout failed, no assert information | |||
auto check_eq = [](const TensorLayout& bias, const TensorLayout& dst) { | |||
@@ -1028,10 +1028,11 @@ void ConvolutionForward::deduce_layout(const TensorLayout& src, | |||
ConvolutionForward::CanonizedFilterMeta ConvolutionForward::check_exec( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& dst, size_t workspace_in_bytes) { | |||
const TensorLayout& dst, size_t workspace_in_bytes, | |||
const PreprocessedFilter* preprocessed_filter) { | |||
auto ret = check_layout_fwd(src, filter, dst); | |||
auto required_workspace_in_bytes = | |||
get_workspace_in_bytes(src, filter, dst, nullptr); | |||
get_workspace_in_bytes(src, filter, dst, preprocessed_filter); | |||
megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); | |||
return ret; | |||
} | |||
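Threading `preprocessed_filter` into `check_exec` means the workspace assertion now measures the same requirement the caller computed: `get_workspace_in_bytes` may legitimately return a smaller size when a preprocessed filter is supplied. A minimal caller-side sketch, assuming the public megdnn operator API (the `WorkspaceWrapper` helper is borrowed from the tests; all names are illustrative):

```cpp
// Size the workspace with the same preprocessed_filter that is later passed
// to exec(), so the assert inside check_exec() compares like with like.
size_t ws_size = opr->get_workspace_in_bytes(src.layout, filter.layout,
                                             dst.layout, &preprocessed_filter);
WorkspaceWrapper ws(opr->handle(), ws_size);
opr->exec(src, filter, dst, &preprocessed_filter, ws.workspace());
```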
@@ -25,10 +25,10 @@ namespace cuda { | |||
void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
_megdnn_tensor_in bias, _megdnn_tensor_in z, | |||
_megdnn_tensor_out dst, | |||
const PreprocessedFilter*, | |||
const PreprocessedFilter* preprocessed_filter, | |||
_megdnn_workspace workspace) { | |||
check_exec(src.layout, filter.layout, bias.layout, z.layout, dst.layout, | |||
workspace.size); | |||
workspace.size, preprocessed_filter); | |||
AlgoBase::ExecArgs args(this, src, filter, bias, z, dst, workspace); | |||
auto algo = get_algorithm(this, src.layout, filter.layout, bias.layout, | |||
z.layout, dst.layout); | |||
@@ -52,13 +52,10 @@ public: | |||
const TensorLayout&, const TensorLayout&) override { | |||
return {}; | |||
} | |||
void exec_preprocess(const TensorLayout& , | |||
_megdnn_tensor_in , | |||
const TensorLayout& , | |||
const TensorLayout& , | |||
const TensorLayout& , | |||
PreprocessedFilter* , | |||
_megdnn_workspace ) override { | |||
void exec_preprocess(const TensorLayout&, _megdnn_tensor_in, | |||
const TensorLayout&, const TensorLayout&, | |||
const TensorLayout&, PreprocessedFilter*, | |||
_megdnn_workspace) override { | |||
megdnn_throw("cuda conv_bias exec_preprocess has not implemeted yet"); | |||
} | |||
@@ -119,17 +119,22 @@ SmallVector<ConvBiasImpl::AlgoBase*> ConvBiasImpl::algo_pack() { | |||
bool ConvBiasImpl::is_naive_algo(ConvBiasImpl::Algorithm* algo) { | |||
return algo == nullptr || strcmp(algo->name(), "DEFAULT") == 0; | |||
} | |||
#define NCB_ALGO_FUNC(name, algo, param) \ | |||
static_cast<AlgoBase*>(algo)->name(this, param) | |||
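`NCB_ALGO_FUNC` merely downcasts the generic `Algorithm*` to the fallback `AlgoBase*` and forwards the call with `this`; for example (illustrative expansion, not part of the patch):

```cpp
// NCB_ALGO_FUNC(get_workspace, algo, fparam) expands to:
static_cast<AlgoBase*>(algo)->get_workspace(this, fparam);
```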
void ConvBiasImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
_megdnn_tensor_in bias, _megdnn_tensor_in z, | |||
_megdnn_tensor_out dst, | |||
const PreprocessedFilter* preprocessed_filter, | |||
_megdnn_workspace workspace) { | |||
check_exec(src.layout, filter.layout, bias.layout, z.layout, dst.layout, | |||
workspace.size); | |||
auto fparam = make_ncb_kern_param(src, filter, bias, dst, workspace); | |||
workspace.size, preprocessed_filter); | |||
auto fparam = make_ncb_kern_param(src, filter, bias, dst, workspace, | |||
preprocessed_filter); | |||
ConvBiasImpl::Algorithm* algo = get_algorithm(fparam, workspace.size); | |||
if (!is_naive_algo(algo) && | |||
ncb_algo_get_workspace(algo, fparam) <= workspace.size) { | |||
NCB_ALGO_FUNC(get_workspace, algo, fparam) <= workspace.size) { | |||
exec_with_ncb_kern(fparam, algo); | |||
} else { | |||
naive::ConvBiasForwardImpl::exec(src, filter, bias, z, dst, | |||
@@ -137,18 +142,71 @@ void ConvBiasImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
} | |||
} | |||
void ConvBiasImpl::exec_preprocess(const TensorLayout& src_layout, | |||
_megdnn_tensor_in filter, | |||
const TensorLayout& bias_layout, | |||
const TensorLayout& z_layout, | |||
const TensorLayout& dst_layout, | |||
PreprocessedFilter* preprocessed_filter, | |||
_megdnn_workspace workspace) { | |||
    //! exec_preprocess currently only supports preprocessing the weights
    //! before exec; src/dst/bias/z are ignored here, so they are simply set
    //! to nullptr
TensorND src{nullptr, src_layout}, dst{nullptr, dst_layout}, | |||
bias{nullptr, bias_layout}; | |||
auto fparam = make_ncb_kern_param(src, filter, bias, dst, workspace, | |||
preprocessed_filter); | |||
    ConvBiasImpl::Algorithm* algo = get_algorithm(fparam, workspace.size);
if (!is_naive_algo(algo) && NCB_ALGO_FUNC(get_preprocess_workspace, algo, | |||
fparam) <= workspace.size) { | |||
exec_preprocess_with_ncb_kern(fparam, algo); | |||
} else { | |||
naive::ConvBiasForwardImpl::exec_preprocess( | |||
src_layout, filter, bias_layout, z_layout, dst_layout, | |||
preprocessed_filter, workspace); | |||
} | |||
} | |||
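Taken together, the intended weight-preprocess flow is: deduce the preprocessed layouts, allocate tensors for them, run `exec_preprocess` once, then pass the resulting `PreprocessedFilter` to every subsequent `exec`. A condensed sketch under the APIs added in this patch (tensor allocation is elided; `pf_tensors`, `src`, `bias`, `z`, `dst` are layouts and `filter`, `src_t`, etc. are tensors, all illustrative):

```cpp
// One-time preprocessing, then repeated execution with the packed filter.
auto layouts = opr->deduce_preprocessed_filter_layout(src, filter.layout,
                                                      bias, z, dst);
// ... allocate one TensorND per returned layout into pf_tensors ...
ConvBiasForward::PreprocessedFilter pf{algo, pf_tensors};
WorkspaceWrapper pre_ws(opr->handle(),
                        opr->get_preprocess_workspace_in_bytes(
                                src, filter.layout, bias, z, dst));
opr->exec_preprocess(src, filter, bias, z, dst, &pf, pre_ws.workspace());
// later, any number of times:
opr->exec(src_t, filter, bias_t, z_t, dst_t, &pf, exec_ws.workspace());
```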
size_t ConvBiasImpl::get_workspace_in_bytes( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& bias, const TensorLayout& z, | |||
const TensorLayout& dst, | |||
const PreprocessedFilter* preprocessed_filter) { | |||
auto fparam = make_ncb_kern_size_param(src, filter, bias, dst); | |||
auto fparam = make_ncb_kern_size_param(src, filter, bias, dst, | |||
preprocessed_filter); | |||
ConvBiasImpl::Algorithm* algo = get_algorithm(fparam); | |||
if (is_naive_algo(algo)) { | |||
return naive::ConvBiasForwardImpl::get_workspace_in_bytes( | |||
src, filter, bias, z, dst, preprocessed_filter); | |||
} else { | |||
return ncb_algo_get_workspace(algo, fparam); | |||
return NCB_ALGO_FUNC(get_workspace, algo, fparam); | |||
} | |||
} | |||
size_t ConvBiasImpl::get_preprocess_workspace_in_bytes( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& bias, const TensorLayout& z, | |||
const TensorLayout& dst) { | |||
auto fparam = make_ncb_kern_size_param(src, filter, bias, dst, nullptr); | |||
Algorithm* algo = get_algorithm(fparam); | |||
if (is_naive_algo(algo)) { | |||
return naive::ConvBiasForwardImpl::get_preprocess_workspace_in_bytes( | |||
src, filter, bias, z, dst); | |||
} else { | |||
return NCB_ALGO_FUNC(get_preprocess_workspace, algo, fparam); | |||
} | |||
} | |||
SmallVector<TensorLayout> ConvBiasImpl::deduce_preprocessed_filter_layout( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& bias, const TensorLayout& z, | |||
const TensorLayout& dst) { | |||
auto fparam = make_ncb_kern_size_param(src, filter, bias, dst, nullptr); | |||
Algorithm* algo = get_algorithm(fparam); | |||
if (is_naive_algo(algo)) { | |||
return naive::ConvBiasForwardImpl::deduce_preprocessed_filter_layout( | |||
src, filter, bias, z, dst); | |||
} else { | |||
return NCB_ALGO_FUNC(deduce_preprocessed_filter_layout, algo, fparam); | |||
} | |||
} | |||
@@ -156,7 +214,7 @@ std::vector<ConvBiasImpl::Algorithm*> ConvBiasImpl::get_all_algorithms( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& bias, const TensorLayout& z, | |||
const TensorLayout& dst) { | |||
auto fparam = make_ncb_kern_size_param(src, filter, bias, dst); | |||
auto fparam = make_ncb_kern_size_param(src, filter, bias, dst, nullptr); | |||
auto ret = get_all_algorithms_with_ncb(fparam); | |||
if (ret.empty()) { | |||
return naive::ConvBiasForwardImpl::get_all_algorithms(src, filter, bias, | |||
@@ -170,7 +228,7 @@ ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm_heuristic( | |||
const TensorLayout& bias, const TensorLayout& z, | |||
const TensorLayout& dst, size_t workspace_limit_in_bytes, | |||
bool reproducible) { | |||
auto fparam = make_ncb_kern_size_param(src, filter, bias, dst); | |||
auto fparam = make_ncb_kern_size_param(src, filter, bias, dst, nullptr); | |||
auto result = get_algorithm_heuristic_with_ncb( | |||
fparam, workspace_limit_in_bytes, reproducible); | |||
if (result == nullptr) { | |||
@@ -181,9 +239,25 @@ ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm_heuristic( | |||
return result; | |||
} | |||
ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm_heuristic_with_ncb( | |||
const NCBKernSizeParam& param, size_t workspace_limit_in_bytes, | |||
bool reproducible) { | |||
for (auto i : get_all_algorithms_with_ncb(param)) { | |||
size_t need_workspace = NCB_ALGO_FUNC(get_workspace, i, param); | |||
if (static_cast<AlgoBase*>(i)->usable_reproducible( | |||
this, param, AlgoSelectionStrategy::HEURISTIC, | |||
reproducible) && | |||
need_workspace <= workspace_limit_in_bytes) { | |||
return i; | |||
} | |||
} | |||
return nullptr; | |||
} | |||
ConvBiasImpl::NCBKernSizeParam ConvBiasImpl::make_ncb_kern_size_param( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& bias, const TensorLayout& dst) { | |||
const TensorLayout& bias, const TensorLayout& dst, | |||
const PreprocessedFilter* preprocessed_filter) { | |||
auto safe_u32 = [](size_t v) -> uint32_t { | |||
megdnn_assert(v <= std::numeric_limits<uint32_t>::max(), | |||
"value too large: %zu", v); | |||
@@ -258,7 +332,9 @@ ConvBiasImpl::NCBKernSizeParam ConvBiasImpl::make_ncb_kern_size_param( | |||
{src.stride[0], src.stride[1], src.stride[2], src.stride[3]}, | |||
{dst.stride[0], dst.stride[1], dst.stride[2], dst.stride[3]}, | |||
param().compute_mode, | |||
nr_threads}, | |||
nr_threads, | |||
reinterpret_cast<const ConvolutionForward::PreprocessedFilter*>( | |||
preprocessed_filter)}, | |||
param().output_block_size, | |||
format, | |||
bias.dtype, | |||
@@ -269,10 +345,12 @@ ConvBiasImpl::NCBKernSizeParam ConvBiasImpl::make_ncb_kern_size_param( | |||
ConvBiasImpl::NCBKernParam ConvBiasImpl::make_ncb_kern_param( | |||
_megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_in bias, | |||
_megdnn_tensor_out dst, _megdnn_workspace workspace) { | |||
_megdnn_tensor_out dst, _megdnn_workspace workspace, | |||
const PreprocessedFilter* preprocessed_filter) { | |||
NCBKernParam ret; | |||
static_cast<NCBKernSizeParam&>(ret) = make_ncb_kern_size_param( | |||
src.layout, filter.layout, bias.layout, dst.layout); | |||
static_cast<NCBKernSizeParam&>(ret) = | |||
make_ncb_kern_size_param(src.layout, filter.layout, bias.layout, | |||
dst.layout, preprocessed_filter); | |||
ret.src_ptr = src.raw_ptr; | |||
ret.filter_ptr = filter.raw_ptr; | |||
ret.bias_ptr = bias.raw_ptr; | |||
@@ -284,7 +362,7 @@ ConvBiasImpl::NCBKernParam ConvBiasImpl::make_ncb_kern_param( | |||
void ConvBiasImpl::exec_with_ncb_kern(const NCBKernParam& param, | |||
ConvBiasImpl::Algorithm* algo) { | |||
auto ncb_kerns = ncb_algo_dispatch_kerns(algo, param); | |||
auto ncb_kerns = NCB_ALGO_FUNC(dispatch_kerns, algo, param); | |||
for (auto&& kernel : ncb_kerns) { | |||
auto run = [kernel, param](size_t index, size_t thread_id) { | |||
CpuNDRange ndrange_id(kernel.global_size, index); | |||
@@ -295,21 +373,17 @@ void ConvBiasImpl::exec_with_ncb_kern(const NCBKernParam& param, | |||
} | |||
} | |||
ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm_heuristic_with_ncb( | |||
const NCBKernSizeParam& param, size_t workspace_limit_in_bytes, | |||
bool reproducible) { | |||
return ncb_algo_get_algorithm_heuristic(param, workspace_limit_in_bytes, | |||
reproducible); | |||
} | |||
size_t ConvBiasImpl::ncb_algo_get_workspace(Algorithm* algo, | |||
const NCBKernSizeParam& param) { | |||
return static_cast<AlgoBase*>(algo)->get_workspace(this, param); | |||
} | |||
SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::ncb_algo_dispatch_kerns( | |||
Algorithm* algo, const NCBKernSizeParam& param) { | |||
return static_cast<AlgoBase*>(algo)->dispatch_kerns(this, param); | |||
void ConvBiasImpl::exec_preprocess_with_ncb_kern( | |||
const NCBKernParam& param, ConvBiasImpl::Algorithm* algo) { | |||
auto ncb_kerns = NCB_ALGO_FUNC(dispatch_preprocess_kerns, algo, param); | |||
for (auto&& kernel : ncb_kerns) { | |||
auto run = [kernel, param](size_t index, size_t thread_id) { | |||
CpuNDRange ndrange_id(kernel.global_size, index); | |||
kernel.kern(param, {thread_id, ndrange_id}); | |||
}; | |||
static_cast<naive::HandleImpl*>(handle())->dispatch_kern( | |||
run, kernel.global_size.total_size()); | |||
} | |||
} | |||
std::vector<ConvBiasImpl::Algorithm*> ConvBiasImpl::get_all_algorithms_with_ncb( | |||
@@ -332,20 +406,6 @@ std::vector<ConvBiasImpl::Algorithm*> ConvBiasImpl::get_all_algorithms_with_ncb( | |||
return algos; | |||
} | |||
ConvBiasImpl::Algorithm* ConvBiasImpl::ncb_algo_get_algorithm_heuristic( | |||
const NCBKernSizeParam& param, size_t workspace_limit_in_bytes, | |||
bool reproducible) { | |||
for (auto i : get_all_algorithms_with_ncb(param)) { | |||
if (static_cast<AlgoBase*>(i)->usable_reproducible( | |||
this, param, AlgoSelectionStrategy::HEURISTIC, | |||
reproducible) && | |||
ncb_algo_get_workspace(i, param) <= workspace_limit_in_bytes) { | |||
return i; | |||
} | |||
} | |||
return nullptr; | |||
} | |||
ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm( | |||
const NCBKernSizeParam& param, size_t workspace_size) { | |||
if (auto set = execution_policy().algorithm) { | |||
@@ -51,6 +51,25 @@ public: | |||
_megdnn_tensor_out dst, const PreprocessedFilter*, | |||
_megdnn_workspace workspace) override; | |||
void exec_preprocess(const TensorLayout& src_layout, | |||
_megdnn_tensor_in filter, | |||
const TensorLayout& bias_layout, | |||
const TensorLayout& z_layout, | |||
const TensorLayout& dst_layout, | |||
PreprocessedFilter* preprocessed_filter, | |||
_megdnn_workspace workspace) override; | |||
SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& bias, const TensorLayout& z, | |||
const TensorLayout& dst) override; | |||
size_t get_preprocess_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& filter, | |||
const TensorLayout& bias, | |||
const TensorLayout& z, | |||
const TensorLayout& dst) override; | |||
//! implemented by get_workspace_with_ncb() | |||
size_t get_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& filter, | |||
@@ -198,6 +217,23 @@ public: | |||
virtual SmallVector<NCBKern> dispatch_kerns( | |||
ConvBiasImpl* opr, const NCBKernSizeParam& param) const = 0; | |||
virtual SmallVector<NCBKern> dispatch_preprocess_kerns( | |||
ConvBiasImpl*, const NCBKernSizeParam&) const { | |||
return {}; | |||
}; | |||
    //! get the layouts of the weight-preprocess outputs
virtual SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | |||
ConvBiasImpl*, const NCBKernSizeParam&) const { | |||
return {}; | |||
}; | |||
    //! get the workspace needed for weight preprocessing
virtual size_t get_preprocess_workspace(ConvBiasImpl*, | |||
const NCBKernSizeParam&) const { | |||
return 0_z; | |||
}; | |||
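These three hooks (together with `dispatch_preprocess_kerns` above) default to no-ops, so algorithms without weight preprocessing are unaffected. A schematic opt-in, assuming the `ConvBiasImpl::AlgoBase` interface declared in this patch (the class name, layout helper, and kernel body are placeholders, not real megdnn code):

```cpp
class AlgoPackedExample final : public ConvBiasImpl::AlgoBase {
    // ... the usual AlgoBase members (name, usable, get_workspace, ...) ...
    SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
            ConvBiasImpl*, const NCBKernSizeParam& param) const override {
        // one layout per preprocessed tensor, e.g. a repacked weight buffer
        return {make_packed_filter_layout(param)};  // placeholder helper
    }
    size_t get_preprocess_workspace(ConvBiasImpl*,
                                    const NCBKernSizeParam&) const override {
        return 0;  // this sketch packs directly into the preprocessed tensors
    }
    SmallVector<NCBKern> dispatch_preprocess_kerns(
            ConvBiasImpl*, const NCBKernSizeParam&) const override {
        auto kern = [](const NCBKernParam& param, const NCBKernIndex&) {
            // read param.filter_ptr and write the packed result into the
            // tensors carried by param.preprocessed_filter
        };
        return {{kern, {1}}};  // schematic global size
    }
};
```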
    //! Temporarily used to identify whether the matmul algorithm is
    //! preferred.
virtual bool is_preferred(ConvBiasImpl*, | |||
@@ -219,40 +255,19 @@ public: | |||
virtual SmallVector<AlgoBase*> algo_pack(); | |||
protected: | |||
//! default impl calls ncb_algo_dispatch_kern() | |||
virtual void exec_with_ncb_kern(const NCBKernParam& param, | |||
ConvBiasImpl::Algorithm* algo); | |||
//! default impl calls ncb_algo_get_all_algorithms() | |||
virtual void exec_preprocess_with_ncb_kern(const NCBKernParam& param, | |||
Algorithm* algo); | |||
virtual std::vector<Algorithm*> get_all_algorithms_with_ncb( | |||
const NCBKernSizeParam& param); | |||
//! default impl calls ncb_algo_get_algorithm_heuristic() | |||
virtual Algorithm* get_algorithm_heuristic_with_ncb( | |||
const NCBKernSizeParam& param, size_t workspace_limit_in_bytes, | |||
bool reproducible = false); | |||
/** | |||
* \brief get kernel pointer for non-contiguous batch kernel or | |||
* simply conv bias kernel. | |||
* | |||
* whether the kernel processing batch 1-group is decided by the | |||
* algo. | |||
*/ | |||
virtual SmallVector<NCBKern> ncb_algo_dispatch_kerns( | |||
Algorithm* algo, const NCBKernSizeParam& param); | |||
virtual size_t ncb_algo_get_workspace(Algorithm* algo, | |||
const NCBKernSizeParam& param); | |||
/*! | |||
* the default impl iterates over all ncb_algo_get_all_algorithms() | |||
* and return the first one whose workspace does not exceed the limit. | |||
*/ | |||
virtual Algorithm* ncb_algo_get_algorithm_heuristic( | |||
const NCBKernSizeParam& param, size_t workspace_limit_in_bytes, | |||
bool reproducible = false); | |||
const char* get_algorithm_set_name() const override; | |||
private: | |||
@@ -276,16 +291,16 @@ private: | |||
const NCBKernSizeParam& param, | |||
size_t workspace_size = std::numeric_limits<size_t>::max()); | |||
NCBKernSizeParam make_ncb_kern_size_param(const TensorLayout& src, | |||
const TensorLayout& filter, | |||
const TensorLayout& bias, | |||
const TensorLayout& dst); | |||
NCBKernParam make_ncb_kern_param(_megdnn_tensor_in src, | |||
_megdnn_tensor_in filter, | |||
_megdnn_tensor_in bias, | |||
_megdnn_tensor_out dst, | |||
_megdnn_workspace workspace); | |||
NCBKernSizeParam make_ncb_kern_size_param( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& bias, const TensorLayout& dst, | |||
const PreprocessedFilter* preprocessed_filter); | |||
NCBKernParam make_ncb_kern_param( | |||
_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
_megdnn_tensor_in bias, _megdnn_tensor_out dst, | |||
_megdnn_workspace workspace, | |||
const PreprocessedFilter* preprocessed_filter); | |||
}; | |||
} // namespace fallback | |||
@@ -376,7 +376,67 @@ size_t ConvolutionImpl::AlgoDefault::get_workspace( | |||
return get_bundle(param).total_size_in_bytes(); | |||
} | |||
//! Return the implment kernel | |||
size_t ConvolutionImpl::AlgoDefault::get_preprocess_workspace( | |||
ConvolutionImpl*, const NCBKernSizeParam& param) const { | |||
::ConvBiasImpl::NCBKernSizeParam conv_bias_param = | |||
init_convbias_opr_and_param(m_conv_bias_opr, param); | |||
m_conv_bias_opr->execution_policy() = {m_algorithm}; | |||
return m_algorithm->get_preprocess_workspace(m_conv_bias_opr, | |||
conv_bias_param); | |||
} | |||
SmallVector<TensorLayout> | |||
ConvolutionImpl::AlgoDefault::deduce_preprocessed_filter_layout( | |||
ConvolutionImpl*, const NCBKernSizeParam& param) const { | |||
::ConvBiasImpl::NCBKernSizeParam conv_bias_param = | |||
init_convbias_opr_and_param(m_conv_bias_opr, param); | |||
m_conv_bias_opr->execution_policy() = {m_algorithm}; | |||
return m_algorithm->deduce_preprocessed_filter_layout(m_conv_bias_opr, | |||
conv_bias_param); | |||
} | |||
//! Return the implement preprocess kernel | |||
SmallVector<ConvolutionImpl::NCBKern> | |||
ConvolutionImpl::AlgoDefault::get_preprocess_kimpl( | |||
::ConvBiasImpl* conv_bias_opr, ConvBiasImpl::AlgoBase* algo, | |||
const NCBKernSizeParam& param) { | |||
MIDOUT_BEGIN(megdnn_fallback_conv, midout_iv("get_preprocess_kimpl"_hash)) { | |||
// construct the conv_bias kern param | |||
::ConvBiasImpl::NCBKernParam conv_bias_param; | |||
::ConvBiasImpl::NCBKernSizeParam conv_bias_size_param = | |||
init_convbias_opr_and_param(conv_bias_opr, param); | |||
static_cast<::ConvBiasImpl::NCBKernSizeParam&>(conv_bias_param) = | |||
conv_bias_size_param; | |||
auto conv_bias_preprocess_kerns = | |||
algo->dispatch_preprocess_kerns(conv_bias_opr, conv_bias_param); | |||
SmallVector<ConvolutionImpl::NCBKern> convolution_preprocess_kerns; | |||
//! Set the conv_bias param using convolution param | |||
auto set_copy_param_filter_workspace_ptr = | |||
[](const NCBKernParam& conv_param, | |||
::ConvBiasImpl::NCBKernParam& copied_param) { | |||
copied_param.filter_ptr = conv_param.filter_ptr; | |||
copied_param.workspace_ptr = conv_param.workspace_ptr; | |||
copied_param.workspace_size = conv_param.workspace_size; | |||
}; | |||
for (size_t i = 0; i < conv_bias_preprocess_kerns.size(); i++) { | |||
auto kernel = conv_bias_preprocess_kerns[i]; | |||
            //! If the kernel is batch parallel
auto run = [=](const NCBKernParam& p, | |||
const NCBKernIndex& ncb_index) { | |||
auto copy_param = conv_bias_param; | |||
set_copy_param_filter_workspace_ptr(p, copy_param); | |||
kernel.kern(copy_param, | |||
{ncb_index.thread_id, ncb_index.ndrange_id}); | |||
}; | |||
convolution_preprocess_kerns.push_back({run, kernel.global_size}); | |||
} | |||
return convolution_preprocess_kerns; | |||
} | |||
MIDOUT_END(); | |||
} | |||
//! Return the implement kernel | |||
SmallVector<ConvolutionImpl::NCBKern> ConvolutionImpl::AlgoDefault::get_kimpl( | |||
::ConvBiasImpl* conv_bias_opr, ConvBiasImpl::AlgoBase* algo, | |||
const NCBKernSizeParam& param) { | |||
@@ -392,7 +452,7 @@ SmallVector<ConvolutionImpl::NCBKern> ConvolutionImpl::AlgoDefault::get_kimpl( | |||
SmallVector<ConvolutionImpl::NCBKern> convolution_kerns; | |||
//! Set the conv_bias param using convolution param | |||
auto set_copy_param_run_time_address = | |||
auto set_copy_param_compute_address = | |||
[](const NCBKernParam& conv_param, | |||
::ConvBiasImpl::NCBKernParam& copied_param) { | |||
copied_param.src_ptr = conv_param.src_ptr; | |||
@@ -407,7 +467,7 @@ SmallVector<ConvolutionImpl::NCBKern> ConvolutionImpl::AlgoDefault::get_kimpl( | |||
auto run = [=](const NCBKernParam& p, | |||
const NCBKernIndex& ncb_index) { | |||
auto copy_param = conv_bias_param; | |||
set_copy_param_run_time_address(p, copy_param); | |||
set_copy_param_compute_address(p, copy_param); | |||
kernel.kern(copy_param, | |||
{ncb_index.thread_id, ncb_index.ndrange_id}); | |||
}; | |||
@@ -110,6 +110,9 @@ class ConvolutionImpl::AlgoDefault final : public AlgoBase { | |||
static SmallVector<NCBKern> get_kimpl(ConvBiasImpl* conv_bias_opr, | |||
ConvBiasImpl::AlgoBase* algo, | |||
const NCBKernSizeParam& param); | |||
static SmallVector<NCBKern> get_preprocess_kimpl( | |||
ConvBiasImpl* conv_bias_opr, ConvBiasImpl::AlgoBase* algo, | |||
const NCBKernSizeParam& param); | |||
public: | |||
AlgoDefault(fallback::ConvBiasImpl* conv_bias_opr, ConvBiasImpl::AlgoBase*); | |||
@@ -121,6 +124,17 @@ public: | |||
size_t get_workspace(ConvolutionImpl* opr, | |||
const NCBKernSizeParam& param) const override; | |||
size_t get_preprocess_workspace(ConvolutionImpl*, | |||
const NCBKernSizeParam&) const override; | |||
SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | |||
ConvolutionImpl*, const NCBKernSizeParam&) const override; | |||
SmallVector<NCBKern> dispatch_preprocess_kern( | |||
ConvolutionImpl*, const NCBKernSizeParam& param) const override { | |||
return get_preprocess_kimpl(m_conv_bias_opr, m_algorithm, param); | |||
} | |||
SmallVector<NCBKern> dispatch_kern( | |||
ConvolutionImpl* /*opr*/, | |||
const NCBKernSizeParam& param) const override { | |||
@@ -80,14 +80,19 @@ SmallVector<ConvolutionImpl::AlgoBase*> ConvolutionImpl::algo_pack() { | |||
bool ConvolutionImpl::is_naive_algo(ConvolutionImpl::Algorithm* algo) { | |||
return algo == nullptr || strcmp(algo->name(), "DEFAULT") == 0; | |||
} | |||
#define NCB_ALGO_FUNC(name, algo, param) \ | |||
    static_cast<AlgoBase*>(algo)->name(this, param)
void ConvolutionImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
_megdnn_tensor_out dst, | |||
const PreprocessedFilter* preprocessed_filter, | |||
_megdnn_workspace workspace) { | |||
auto fparam = make_ncb_kern_param(src, filter, dst, workspace); | |||
auto fparam = make_ncb_kern_param(src, filter, dst, preprocessed_filter, | |||
workspace); | |||
ConvolutionImpl::Algorithm* algo = get_algorithm(fparam, workspace.size); | |||
if (!is_naive_algo(algo) && | |||
ncb_algo_get_workspace(algo, fparam) <= workspace.size) { | |||
NCB_ALGO_FUNC(get_workspace, algo, fparam) <= workspace.size) { | |||
exec_with_ncb_kern(fparam, algo); | |||
} else { | |||
naive::ConvolutionForwardImpl::exec(src, filter, dst, | |||
@@ -95,24 +100,73 @@ void ConvolutionImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
} | |||
} | |||
void ConvolutionImpl::exec_preprocess(const TensorLayout& src_layout, | |||
_megdnn_tensor_in filter, | |||
const TensorLayout& dst_layout, | |||
PreprocessedFilter* preprocessed_filter, | |||
_megdnn_workspace workspace) { | |||
    //! exec_preprocess currently only supports preprocessing the weights
    //! before exec; src/dst are ignored here, so they are simply set to
    //! nullptr
TensorND src{nullptr, src_layout}, dst{nullptr, dst_layout}; | |||
auto fparam = make_ncb_kern_param(src, filter, dst, preprocessed_filter, | |||
workspace); | |||
ConvolutionImpl::Algorithm* algo = get_algorithm(fparam, workspace.size); | |||
if (!is_naive_algo(algo) && NCB_ALGO_FUNC(get_preprocess_workspace, algo, | |||
fparam) <= workspace.size) { | |||
exec_preprocess_with_ncb_kern(fparam, algo); | |||
} else { | |||
naive::ConvolutionForwardImpl::exec_preprocess( | |||
src_layout, filter, dst_layout, preprocessed_filter, workspace); | |||
} | |||
} | |||
size_t ConvolutionImpl::get_workspace_in_bytes( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& dst, | |||
const PreprocessedFilter* preprocessed_filter) { | |||
auto fparam = make_ncb_kern_size_param(src, filter, dst); | |||
auto fparam = | |||
make_ncb_kern_size_param(src, filter, dst, preprocessed_filter); | |||
Algorithm* algo = get_algorithm(fparam); | |||
if (is_naive_algo(algo)) { | |||
return naive::ConvolutionForwardImpl::get_workspace_in_bytes( | |||
src, filter, dst, preprocessed_filter); | |||
} else { | |||
return ncb_algo_get_workspace(algo, fparam); | |||
return static_cast<AlgoBase*>(algo)->get_workspace(this, fparam); | |||
} | |||
} | |||
size_t ConvolutionImpl::get_preprocess_workspace_in_bytes( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& dst) { | |||
auto fparam = make_ncb_kern_size_param(src, filter, dst, nullptr); | |||
Algorithm* algo = get_algorithm(fparam); | |||
if (is_naive_algo(algo)) { | |||
return naive::ConvolutionForwardImpl::get_preprocess_workspace_in_bytes( | |||
src, filter, dst); | |||
} else { | |||
return static_cast<AlgoBase*>(algo)->get_preprocess_workspace(this, | |||
fparam); | |||
} | |||
} | |||
SmallVector<TensorLayout> ConvolutionImpl::deduce_preprocessed_filter_layout( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
        const TensorLayout& dst) {
auto fparam = make_ncb_kern_size_param(src, filter, dst, nullptr); | |||
Algorithm* algo = get_algorithm(fparam); | |||
if (is_naive_algo(algo)) { | |||
return naive::ConvolutionForwardImpl::deduce_preprocessed_filter_layout( | |||
src, filter, dst); | |||
} else { | |||
return static_cast<AlgoBase*>(algo)->deduce_preprocessed_filter_layout( | |||
this, fparam); | |||
} | |||
} | |||
std::vector<ConvolutionImpl::Algorithm*> ConvolutionImpl::get_all_algorithms( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& dst) { | |||
auto fparam = make_ncb_kern_size_param(src, filter, dst); | |||
auto fparam = make_ncb_kern_size_param(src, filter, dst, nullptr); | |||
auto ret = get_all_algorithms_with_ncb(fparam); | |||
if (ret.empty()) { | |||
return naive::ConvolutionForwardImpl::get_all_algorithms(src, filter, | |||
@@ -125,7 +179,7 @@ ConvolutionImpl::Algorithm* ConvolutionImpl::get_algorithm_heuristic( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& dst, size_t workspace_limit_in_bytes, | |||
bool reproducible) { | |||
auto fparam = make_ncb_kern_size_param(src, filter, dst); | |||
auto fparam = make_ncb_kern_size_param(src, filter, dst, nullptr); | |||
auto result = get_algorithm_heuristic_with_ncb( | |||
fparam, workspace_limit_in_bytes, reproducible); | |||
if (result == nullptr) { | |||
@@ -137,7 +191,8 @@ ConvolutionImpl::Algorithm* ConvolutionImpl::get_algorithm_heuristic( | |||
ConvolutionImpl::NCBKernSizeParam ConvolutionImpl::make_ncb_kern_size_param( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& dst) { | |||
const TensorLayout& dst, | |||
const PreprocessedFilter* preprocessed_filter) { | |||
auto safe_u32 = [](size_t v) -> uint32_t { | |||
megdnn_assert(v <= std::numeric_limits<uint32_t>::max(), | |||
"value too large: %zu", v); | |||
@@ -175,15 +230,17 @@ ConvolutionImpl::NCBKernSizeParam ConvolutionImpl::make_ncb_kern_size_param( | |||
{src.stride[0], src.stride[1], src.stride[2], src.stride[3]}, | |||
{dst.stride[0], dst.stride[1], dst.stride[2], dst.stride[3]}, | |||
param().compute_mode, | |||
nr_threads}; | |||
nr_threads, | |||
preprocessed_filter}; | |||
} | |||
ConvolutionImpl::NCBKernParam ConvolutionImpl::make_ncb_kern_param( | |||
_megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_out dst, | |||
const PreprocessedFilter* preprocessed_filter, | |||
_megdnn_workspace workspace) { | |||
NCBKernParam ret; | |||
static_cast<NCBKernSizeParam&>(ret) = | |||
make_ncb_kern_size_param(src.layout, filter.layout, dst.layout); | |||
static_cast<NCBKernSizeParam&>(ret) = make_ncb_kern_size_param( | |||
src.layout, filter.layout, dst.layout, preprocessed_filter); | |||
ret.src_ptr = src.raw_ptr; | |||
ret.filter_ptr = filter.raw_ptr; | |||
ret.dst_ptr = dst.raw_ptr; | |||
@@ -192,9 +249,30 @@ ConvolutionImpl::NCBKernParam ConvolutionImpl::make_ncb_kern_param( | |||
return ret; | |||
} | |||
void ConvolutionImpl::exec_preprocess_with_ncb_kern(const NCBKernParam& param, | |||
Algorithm* algo) { | |||
auto kerns = | |||
static_cast<AlgoBase*>(algo)->dispatch_preprocess_kern(this, param); | |||
auto fallback_handle = handle(); | |||
for (auto kernel : kerns) { | |||
megdnn_assert( | |||
param.filter_meta.format == Param::Format::NCHW || | |||
param.filter_meta.format == Param::Format::NHWC || | |||
param.filter_meta.format == Param::Format::NCHW88 || | |||
param.filter_meta.format == Param::Format::NCHW44, | |||
"invalid conv format"); | |||
auto run = [param, kernel](size_t index, size_t thread_id) { | |||
CpuNDRange ndrange_id(kernel.global_size, index); | |||
kernel.kern(param, {thread_id, ndrange_id}); | |||
}; | |||
static_cast<naive::HandleImpl*>(fallback_handle) | |||
->dispatch_kern(run, kernel.global_size.total_size()); | |||
} | |||
} | |||
void ConvolutionImpl::exec_with_ncb_kern(const NCBKernParam& param, | |||
Algorithm* algo) { | |||
auto kerns = ncb_algo_dispatch_kern(algo, param); | |||
auto kerns = static_cast<AlgoBase*>(algo)->dispatch_kern(this, param); | |||
auto fallback_handle = handle(); | |||
for (auto kernel : kerns) { | |||
megdnn_assert(param.filter_meta.format == Param::Format::NCHW || | |||
@@ -215,10 +293,13 @@ ConvolutionImpl::Algorithm* ConvolutionImpl::get_algorithm_heuristic_with_ncb( | |||
const NCBKernSizeParam& param, size_t workspace_limit_in_bytes, | |||
bool reproducible) { | |||
for (auto i : get_all_algorithms_with_ncb(param)) { | |||
if (static_cast<AlgoBase*>(i)->usable_reproducible( | |||
this, param, AlgoSelectionStrategy::HEURISTIC, | |||
reproducible) && | |||
ncb_algo_get_workspace(i, param) <= workspace_limit_in_bytes) { | |||
size_t need_workspace = | |||
static_cast<AlgoBase*>(i)->get_workspace(this, param); | |||
bool usable_reproducible = | |||
static_cast<AlgoBase*>(i)->usable_reproducible( | |||
this, param, AlgoSelectionStrategy::HEURISTIC, | |||
reproducible); | |||
if (usable_reproducible && need_workspace <= workspace_limit_in_bytes) { | |||
return i; | |||
} | |||
} | |||
@@ -39,12 +39,26 @@ public: | |||
_megdnn_tensor_out dst, const PreprocessedFilter*, | |||
_megdnn_workspace workspace) override; | |||
void exec_preprocess(const TensorLayout& src_layout, | |||
_megdnn_tensor_in filter, | |||
const TensorLayout& dst_layout, | |||
PreprocessedFilter* preprocessed_filter, | |||
_megdnn_workspace workspace) override; | |||
//! implemented by get_workspace_with_ncb() | |||
size_t get_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& filter, | |||
const TensorLayout& dst, | |||
const PreprocessedFilter*) override; | |||
SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& dst) override; | |||
size_t get_preprocess_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& filter, | |||
const TensorLayout& dst) override; | |||
//! implemented by get_all_algorithms_with_ncb() | |||
std::vector<Algorithm*> get_all_algorithms( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
@@ -70,6 +84,8 @@ public: | |||
ptrdiff_t inp_s[4], out_s[4]; | |||
Param::ComputeMode compute_mode; | |||
size_t nr_threads; | |||
//! weight_preprocess info | |||
const PreprocessedFilter* preprocessed_filter; | |||
}; | |||
//! memory param for kernels with non-contiguous batch | |||
@@ -169,6 +185,23 @@ public: | |||
virtual SmallVector<NCBKern> dispatch_kern( | |||
ConvolutionImpl* opr, const NCBKernSizeParam& param) const = 0; | |||
virtual SmallVector<NCBKern> dispatch_preprocess_kern( | |||
ConvolutionImpl*, const NCBKernSizeParam&) const { | |||
return {}; | |||
}; | |||
    //! get the layouts of the weight-preprocess outputs
virtual SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | |||
ConvolutionImpl*, const NCBKernSizeParam&) const { | |||
return {}; | |||
}; | |||
    //! get the workspace needed for weight preprocessing
virtual size_t get_preprocess_workspace(ConvolutionImpl*, | |||
const NCBKernSizeParam&) const { | |||
return 0_z; | |||
}; | |||
    //! Temporarily used to identify whether the matmul algorithm is
    //! preferred.
virtual bool is_preferred(ConvolutionImpl*, | |||
@@ -192,6 +225,9 @@ public: | |||
protected: | |||
virtual void exec_with_ncb_kern(const NCBKernParam& param, Algorithm* algo); | |||
virtual void exec_preprocess_with_ncb_kern(const NCBKernParam& param, | |||
Algorithm* algo); | |||
virtual std::vector<Algorithm*> get_all_algorithms_with_ncb( | |||
const NCBKernSizeParam& param); | |||
@@ -199,21 +235,6 @@ protected: | |||
const NCBKernSizeParam& param, size_t workspace_limit_in_bytes, | |||
bool reproducible = false); | |||
//! get kernel pointer | |||
virtual SmallVector<NCBKern> ncb_algo_dispatch_kern( | |||
Algorithm* algo, const NCBKernSizeParam& param) { | |||
return static_cast<AlgoBase*>(algo)->dispatch_kern(this, param); | |||
} | |||
//! get algo workspace | |||
virtual size_t ncb_algo_get_workspace(Algorithm* algo, | |||
const NCBKernSizeParam& param) { | |||
return static_cast<AlgoBase*>(algo)->get_workspace(this, param); | |||
} | |||
/*! | |||
* the default impl iterates over all ncb_1g_get_all_algorithms() | |||
* and return the first one whose workspace does not exceed the limit. | |||
*/ | |||
const char* get_algorithm_set_name() const override; | |||
class AlgoFallback; | |||
@@ -231,14 +252,16 @@ private: | |||
const NCBKernSizeParam& param, | |||
size_t workspace_size = std::numeric_limits<size_t>::max()); | |||
NCBKernSizeParam make_ncb_kern_size_param(const TensorLayout& src, | |||
const TensorLayout& filter, | |||
const TensorLayout& dst); | |||
NCBKernParam make_ncb_kern_param(_megdnn_tensor_in src, | |||
_megdnn_tensor_in filter, | |||
_megdnn_tensor_out dst, | |||
_megdnn_workspace workspace); | |||
NCBKernSizeParam make_ncb_kern_size_param( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& dst, | |||
const PreprocessedFilter* preprocessed_filter); | |||
NCBKernParam make_ncb_kern_param( | |||
_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
_megdnn_tensor_out dst, | |||
const PreprocessedFilter* preprocessed_filter, | |||
_megdnn_workspace workspace); | |||
}; | |||
class ConvolutionBackwardDataImpl : public naive::ConvolutionBackwardDataImpl { | |||
@@ -80,14 +80,15 @@ size_t ConvBiasForwardImpl::get_workspace_in_bytes(const TensorLayout& src, | |||
void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
_megdnn_tensor_in bias, _megdnn_tensor_in z, | |||
_megdnn_tensor_out dst, | |||
const PreprocessedFilter*, | |||
const PreprocessedFilter* preprocessed_filter, | |||
_megdnn_workspace workspace) { | |||
MIDOUT_BEGIN(megdnn_naive_conv_bias_fwd) { | |||
dt_byte *workspace_ptr = workspace.raw_ptr; | |||
// ============================w * f + b================================ | |||
auto filter_meta = check_exec(src.layout, filter.layout, bias.layout, | |||
z.layout, dst.layout, workspace.size); | |||
auto filter_meta = | |||
check_exec(src.layout, filter.layout, bias.layout, z.layout, | |||
dst.layout, workspace.size, preprocessed_filter); | |||
auto sfb = dst; | |||
if (bias.layout.dtype.enumv() != dst.layout.dtype.enumv()) { | |||
// intermediate result | |||
@@ -61,9 +61,7 @@ public: | |||
void exec_preprocess(const TensorLayout&, _megdnn_tensor_in, | |||
const TensorLayout&, const TensorLayout&, | |||
const TensorLayout&, PreprocessedFilter*, | |||
_megdnn_workspace) override{ | |||
megdnn_throw("conv_bias exec_preprocess is not impl yet"); | |||
} | |||
_megdnn_workspace) override {} | |||
const char* get_algorithm_set_name() const override; | |||
}; | |||
@@ -28,11 +28,11 @@ using namespace naive; | |||
void ConvolutionForwardImpl::exec(_megdnn_tensor_in src, | |||
_megdnn_tensor_in filter, | |||
_megdnn_tensor_out dst, | |||
const PreprocessedFilter*, | |||
const PreprocessedFilter* preprocessed_filter, | |||
_megdnn_workspace workspace) { | |||
MIDOUT_BEGIN(megdnn_naive_conv_fwd) { | |||
auto filter_meta = check_exec(src.layout, filter.layout, dst.layout, | |||
workspace.size); | |||
workspace.size, preprocessed_filter); | |||
using ComputeMode = Param::ComputeMode; | |||
#define DISPATCH_CMODE(in_dt, out_dt, in_ct, out_ct, comp_ct, cmode) \ | |||
do { \ | |||
@@ -44,9 +44,7 @@ class ConvolutionForwardImpl: public ConvolutionForward { | |||
void exec_preprocess(const TensorLayout&, _megdnn_tensor_in, | |||
const TensorLayout&, PreprocessedFilter*, | |||
_megdnn_workspace) override { | |||
megdnn_throw("convolution exec_preprocess in not impl yet"); | |||
} | |||
_megdnn_workspace) override {} | |||
SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | |||
const TensorLayout& , const TensorLayout& , | |||
@@ -18,6 +18,9 @@ | |||
#include "test/common/workspace_wrapper.h" | |||
#include <algorithm> | |||
#include <memory> | |||
namespace megdnn { | |||
namespace test { | |||
@@ -33,6 +36,9 @@ template <typename Opr> | |||
struct OprProxy : public OprProxyDefaultImpl<Opr> {}; | |||
template <typename Opr> | |||
struct OprWeightPreprocessProxy : public OprProxyDefaultImpl<Opr> {}; | |||
template <typename Opr> | |||
struct OprProxyVectorToSingle {}; | |||
template <> | |||
@@ -139,6 +145,28 @@ struct OprProxyProfilingBase | |||
typename Opr::Algorithm* target_algo = nullptr; | |||
OprProxyProfilingBase(bool profile = false) { m_profiling = profile; } | |||
    //! used to allocate tensors for weight preprocessing
static std::shared_ptr<TensorNDArray> alloc_tensors( | |||
Handle* handle, const TensorLayoutArray& layouts) { | |||
auto deleter = [handle](TensorNDArray* ptr) { | |||
for (auto&& i : *ptr) { | |||
auto pdata = static_cast<dt_byte*>(i.raw_ptr) + | |||
i.layout.span().low_byte; | |||
megdnn_free(handle, pdata); | |||
} | |||
delete ptr; | |||
}; | |||
std::shared_ptr<TensorNDArray> ret{new TensorNDArray, deleter}; | |||
for (size_t i = 0; i < layouts.size(); ++i) { | |||
auto span = layouts[i].span(); | |||
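            //! shift the base pointer by -span.low_byte so that raw_ptr
            //! refers to logical element 0 even when the layout's span
            //! starts below the allocation (e.g. negative strides); the
            //! deleter above adds the offset back before freeing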
ret->emplace_back(static_cast<dt_byte*>( | |||
megdnn_malloc(handle, span.dist_byte())) - | |||
span.low_byte, | |||
layouts[i]); | |||
} | |||
return ret; | |||
} | |||
}; | |||
template <class Opr> | |||
@@ -207,7 +235,6 @@ DEF_PROF3(LocalShareBackwardData); | |||
DEF_PROF3(LocalShareBackwardFilter); | |||
#undef DEF_PROF3 | |||
//! TODO: it should adapt weight preprocess later | |||
template <> | |||
struct OprProxy<ConvolutionForward> | |||
: public OprProxyProfilingTernary<ConvolutionForward> { | |||
@@ -263,6 +290,100 @@ struct OprProxy<ConvolutionForward> | |||
} | |||
}; | |||
template <> | |||
struct OprWeightPreprocessProxy<ConvolutionForward> | |||
: public OprProxyProfilingTernary<ConvolutionForward> { | |||
using OprProxyProfilingTernary<ConvolutionForward>::OprProxyProfilingTernary; | |||
void exec(ConvolutionForward* opr, const TensorNDArray& tensors) { | |||
megdnn_assert(tensors.size() == 3); | |||
if (!Base::W.valid()) { | |||
Base::W = WorkspaceWrapper(opr->handle(), 0); | |||
} | |||
if (Base::m_profiling && !Base::target_algo) { | |||
size_t min_time = std::numeric_limits<size_t>::max(); | |||
for (auto algo : | |||
opr->get_all_algorithms(tensors[0].layout, tensors[1].layout, | |||
tensors[2].layout)) { | |||
opr->execution_policy().algorithm = algo; | |||
                auto preprocess_tensors = weight_preprocess(opr, tensors, algo);
megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||
ConvolutionForward::PreprocessedFilter preprocessed_filter{ | |||
algo, *preprocess_tensors}; | |||
auto workspace_size = opr->get_workspace_in_bytes( | |||
tensors[0].layout, tensors[1].layout, tensors[2].layout, | |||
&preprocessed_filter); | |||
Base::W.update(workspace_size); | |||
for (size_t times = 0; times < Base::warmup_times; ++times) | |||
opr->exec(tensors[0], tensors[1], tensors[2], | |||
&preprocessed_filter, Base::W.workspace()); | |||
megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||
Timer timer; | |||
timer.start(); | |||
for (size_t times = 0; times < Base::exec_times; ++times) { | |||
opr->exec(tensors[0], tensors[1], tensors[2], | |||
&preprocessed_filter, Base::W.workspace()); | |||
} | |||
megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||
timer.stop(); | |||
printf("%.3fms %s\n", timer.get_time_in_us() / 1e3, | |||
algo->name()); | |||
if (min_time > timer.get_time_in_us()) { | |||
min_time = timer.get_time_in_us(); | |||
Base::target_algo = algo; | |||
} | |||
} | |||
opr->execution_policy().algorithm = Base::target_algo; | |||
auto preprocess_tensors = | |||
                    weight_preprocess(opr, tensors, Base::target_algo);
megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||
ConvolutionForward::PreprocessedFilter preprocessed_filter{ | |||
Base::target_algo, *preprocess_tensors}; | |||
auto workspace_size = opr->get_workspace_in_bytes( | |||
tensors[0].layout, tensors[1].layout, tensors[2].layout, | |||
&preprocessed_filter); | |||
Base::W.update(workspace_size); | |||
} | |||
auto preprocess_tensors = | |||
                weight_preprocess(opr, tensors, Base::target_algo);
megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||
ConvolutionForward::PreprocessedFilter preprocessed_filter{ | |||
Base::target_algo, *preprocess_tensors}; | |||
if (!Base::target_algo) { | |||
auto workspace_size = opr->get_workspace_in_bytes( | |||
tensors[0].layout, tensors[1].layout, tensors[2].layout, | |||
&preprocessed_filter); | |||
Base::W.update(workspace_size); | |||
} | |||
opr->exec(tensors[0], tensors[1], tensors[2], &preprocessed_filter, | |||
Base::W.workspace()); | |||
} | |||
//! handle weight preprocess | |||
    std::shared_ptr<TensorNDArray> weight_preprocess(
            ConvolutionForward* opr, const TensorNDArray& tensors,
            ConvolutionForward::Algorithm* algo) {
        auto weight_preprocess_layouts = opr->deduce_preprocessed_filter_layout(
                tensors[0].layout, tensors[1].layout, tensors[2].layout);
        auto preprocessed_filter_tensors_ptr =
                alloc_tensors(opr->handle(), weight_preprocess_layouts);
ConvolutionForward::PreprocessedFilter preprocessed_filter{ | |||
algo, *preprocessed_filter_tensors_ptr}; | |||
size_t preprocess_workspace_size = | |||
opr->get_preprocess_workspace_in_bytes(tensors[0].layout, | |||
tensors[1].layout, | |||
tensors[2].layout); | |||
WorkspaceWrapper preprocess_workspace(opr->handle(), | |||
preprocess_workspace_size); | |||
opr->exec_preprocess(tensors[0].layout, tensors[1], tensors[2].layout, | |||
&preprocessed_filter, | |||
preprocess_workspace.workspace()); | |||
return preprocessed_filter_tensors_ptr; | |||
} | |||
}; | |||
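A test selects this proxy explicitly; hypothetical usage, assuming the test `Checker` takes the proxy type as its second template parameter (shapes and `param` are placeholders):

```cpp
Checker<ConvolutionForward, OprWeightPreprocessProxy<ConvolutionForward>>
        checker(handle);
checker.set_param(param).execs({{2, 3, 16, 16}, {8, 3, 3, 3}, {}});
```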
template <class Opr> | |||
struct OprProxyProfiling5 : public OprProxyProfilingBase<Opr, 5> { | |||
@@ -329,11 +450,9 @@ struct OprProxyProfiling5 : public OprProxyProfilingBase<Opr, 5> { | |||
DEF_PROF5(DeformableConvForward); | |||
DEF_PROF5(DeformableConvBackwardFilter); | |||
//DEF_PROF5(ConvBiasForward); | |||
DEF_PROF5(BatchConvBiasForward); | |||
#undef DEF_PROF5 | |||
//! TODO: it should adapt weight preprocess later | |||
template <> | |||
struct OprProxy<ConvBiasForward> : public OprProxyProfiling5<ConvBiasForward> { | |||
using OprProxyProfiling5<ConvBiasForward>::OprProxyProfiling5; | |||
@@ -390,6 +509,106 @@ struct OprProxy<ConvBiasForward> : public OprProxyProfiling5<ConvBiasForward> { | |||
} | |||
}; | |||
template <> | |||
struct OprWeightPreprocessProxy<ConvBiasForward> | |||
: public OprProxyProfiling5<ConvBiasForward> { | |||
using OprProxyProfiling5<ConvBiasForward>::OprProxyProfiling5; | |||
void exec(ConvBiasForward* opr, const TensorNDArray& tensors) { | |||
megdnn_assert(tensors.size() == 5); | |||
if (!Base::W.valid()) { | |||
Base::W = WorkspaceWrapper(opr->handle(), 0); | |||
} | |||
if (Base::m_profiling && !Base::target_algo) { | |||
size_t min_time = std::numeric_limits<size_t>::max(); | |||
for (auto algo : | |||
opr->get_all_algorithms(tensors[0].layout, tensors[1].layout, | |||
tensors[2].layout, tensors[3].layout, | |||
tensors[4].layout)) { | |||
opr->execution_policy().algorithm = algo; | |||
                auto preprocess_tensors = weight_preprocess(opr, tensors, algo);
megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||
ConvBiasForward::PreprocessedFilter preprocessed_filter{ | |||
algo, *preprocess_tensors}; | |||
auto workspace_size = opr->get_workspace_in_bytes( | |||
tensors[0].layout, tensors[1].layout, tensors[2].layout, | |||
tensors[3].layout, tensors[4].layout, | |||
&preprocessed_filter); | |||
Base::W.update(workspace_size); | |||
for (size_t times = 0; times < Base::warmup_times; ++times) | |||
opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], | |||
tensors[4], &preprocessed_filter, | |||
Base::W.workspace()); | |||
megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||
Timer timer; | |||
timer.start(); | |||
for (size_t times = 0; times < Base::exec_times; ++times) { | |||
opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], | |||
tensors[4], &preprocessed_filter, | |||
Base::W.workspace()); | |||
} | |||
megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||
timer.stop(); | |||
printf("%.3fms %s\n", timer.get_time_in_us() / 1e3, | |||
algo->name()); | |||
if (min_time > timer.get_time_in_us()) { | |||
min_time = timer.get_time_in_us(); | |||
Base::target_algo = algo; | |||
} | |||
} | |||
opr->execution_policy().algorithm = Base::target_algo; | |||
auto preprocess_tensors = | |||
                    weight_preprocess(opr, tensors, Base::target_algo);
megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||
ConvBiasForward::PreprocessedFilter preprocessed_filter{ | |||
Base::target_algo, *preprocess_tensors}; | |||
auto workspace_size = opr->get_workspace_in_bytes( | |||
tensors[0].layout, tensors[1].layout, tensors[2].layout, | |||
tensors[3].layout, tensors[4].layout, &preprocessed_filter); | |||
Base::W.update(workspace_size); | |||
} | |||
auto preprocess_tensors = | |||
                weight_preprocess(opr, tensors, Base::target_algo);
megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||
ConvBiasForward::PreprocessedFilter preprocessed_filter{ | |||
Base::target_algo, *preprocess_tensors}; | |||
if (!Base::target_algo) { | |||
auto workspace_size = opr->get_workspace_in_bytes( | |||
tensors[0].layout, tensors[1].layout, tensors[2].layout, | |||
tensors[3].layout, tensors[4].layout, &preprocessed_filter); | |||
Base::W.update(workspace_size); | |||
} | |||
opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], tensors[4], | |||
&preprocessed_filter, Base::W.workspace()); | |||
} | |||
//! handle weight preprocess | |||
    std::shared_ptr<TensorNDArray> weight_preprocess(
            ConvBiasForward* opr, const TensorNDArray& tensors,
            ConvBiasForward::Algorithm* algo) {
        auto weight_preprocess_layouts = opr->deduce_preprocessed_filter_layout(
                tensors[0].layout, tensors[1].layout, tensors[2].layout,
                tensors[3].layout, tensors[4].layout);
        auto preprocessed_filter_tensors_ptr =
                alloc_tensors(opr->handle(), weight_preprocess_layouts);
ConvBiasForward::PreprocessedFilter preprocessed_filter{ | |||
algo, *preprocessed_filter_tensors_ptr}; | |||
size_t preprocess_workspace_size = | |||
opr->get_preprocess_workspace_in_bytes( | |||
tensors[0].layout, tensors[1].layout, tensors[2].layout, | |||
tensors[3].layout, tensors[4].layout); | |||
WorkspaceWrapper preprocess_workspace(opr->handle(), | |||
preprocess_workspace_size); | |||
opr->exec_preprocess(tensors[0].layout, tensors[1], tensors[2].layout, | |||
tensors[3].layout, tensors[4].layout, | |||
&preprocessed_filter, | |||
preprocess_workspace.workspace()); | |||
return preprocessed_filter_tensors_ptr; | |||
} | |||
}; | |||
template <class Opr> | |||
struct OprProxyProfiling8 : public OprProxyProfilingBase<Opr, 8> { | |||
using Base = OprProxyProfilingBase<Opr, 8>; | |||