GitOrigin-RevId: 84791aacf9
tags/v0.5.0
@@ -210,21 +210,25 @@ public: | |||
_megdnn_tensor_out dst, | |||
const PreprocessedFilter* preprocessed_filter, | |||
_megdnn_workspace workspace) = 0; | |||
virtual void exec_preprocess(const TensorLayout& src_layout, | |||
_megdnn_tensor_in filter, | |||
const TensorLayout& dst_layout, | |||
PreprocessedFilter* preprocessed_filter, | |||
_megdnn_workspace workspace) = 0; | |||
void deduce_dtype(DType src, DType filter, DType& dst); | |||
void deduce_layout(const TensorLayout& src, const TensorLayout& filter, | |||
TensorLayout& dst); | |||
virtual size_t get_workspace_in_bytes( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& dst, | |||
PreprocessedFilter* preprocessed_filter) = 0; | |||
const PreprocessedFilter* preprocessed_filter) = 0; | |||
virtual SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& dst) = 0; | |||
virtual size_t get_preprocess_workspace_in_bytes( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& dst) = 0; | |||
@@ -337,7 +341,7 @@ public: | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& bias, const TensorLayout& z, | |||
const TensorLayout& dst, | |||
PreprocessedFilter* preprocessed_filter) = 0; | |||
const PreprocessedFilter* preprocessed_filter) = 0; | |||
virtual size_t get_preprocess_workspace_in_bytes( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& bias, const TensorLayout& z, | |||
@@ -76,7 +76,7 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec( | |||
auto ret = check_layout_fwd(src, filter, dst); | |||
megdnn_assert_contiguous(bias); | |||
auto required_workspace_in_bytes = | |||
get_workspace_in_bytes(src, filter, bias, z, dst); | |||
get_workspace_in_bytes(src, filter, bias, z, dst, nullptr); | |||
megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); | |||
if (bias.ndim != 0) { | |||
//! bias.layout == dst.layout failed, no assert information | |||
@@ -981,7 +981,8 @@ ConvolutionForward::CanonizedFilterMeta ConvolutionForward::check_exec( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& dst, size_t workspace_in_bytes) { | |||
auto ret = check_layout_fwd(src, filter, dst); | |||
auto required_workspace_in_bytes = get_workspace_in_bytes(src, filter, dst); | |||
auto required_workspace_in_bytes = | |||
get_workspace_in_bytes(src, filter, dst, nullptr); | |||
megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); | |||
return ret; | |||
} | |||
@@ -112,7 +112,7 @@ void ConvBiasForwardImpl::AlgoBFloat16::exec(const ExecArgs& args) const { | |||
convbias_opr->param().compute_mode = Param::ComputeMode::DEFAULT; | |||
convbias_opr->execution_policy() = {m_impl}; | |||
convbias_opr->exec(fsrc_tensor, ffilter_tensor, fbias_tensor, fz_tensor, | |||
fdst_tensor, cvter.workspace()); | |||
fdst_tensor, nullptr, cvter.workspace()); | |||
} | |||
{ cvter.comp_to_dst_type(fdst_tensor, *args.dst_tensor); } | |||
} | |||
@@ -25,6 +25,7 @@ namespace cuda { | |||
void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
_megdnn_tensor_in bias, _megdnn_tensor_in z, | |||
_megdnn_tensor_out dst, | |||
const PreprocessedFilter*, | |||
_megdnn_workspace workspace) { | |||
check_exec(src.layout, filter.layout, bias.layout, z.layout, dst.layout, | |||
workspace.size); | |||
@@ -208,7 +209,8 @@ size_t ConvBiasForwardImpl::get_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& filter, | |||
const TensorLayout& bias, | |||
const TensorLayout& z, | |||
const TensorLayout& dst) { | |||
const TensorLayout& dst, | |||
const PreprocessedFilter*) { | |||
AlgoBase::SizeArgs args{this, src, filter, bias, z, dst}; | |||
return get_algorithm(this, src, filter, bias, z, dst) | |||
->get_workspace_in_bytes(args); | |||
@@ -20,7 +20,9 @@ public: | |||
using ConvBiasForward::ConvBiasForward; | |||
void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
_megdnn_tensor_in bias, _megdnn_tensor_in z, | |||
_megdnn_tensor_out dst, _megdnn_workspace workspace) override; | |||
_megdnn_tensor_out dst, | |||
const PreprocessedFilter* preprocessed_filter, | |||
_megdnn_workspace workspace) override; | |||
std::vector<Algorithm*> get_all_algorithms( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& bias, const TensorLayout& z, | |||
@@ -34,7 +36,30 @@ public: | |||
bool reproducible) override; | |||
size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&, | |||
const TensorLayout&, const TensorLayout&, | |||
const TensorLayout&) override; | |||
const TensorLayout&, | |||
const PreprocessedFilter*) override; | |||
size_t get_preprocess_workspace_in_bytes(const TensorLayout&, | |||
const TensorLayout&, | |||
const TensorLayout&, | |||
const TensorLayout&, | |||
const TensorLayout&) override { | |||
return 0; | |||
}; | |||
SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | |||
const TensorLayout&, const TensorLayout&, const TensorLayout&, | |||
const TensorLayout&, const TensorLayout&) override { | |||
return {}; | |||
} | |||
void exec_preprocess(const TensorLayout& , | |||
_megdnn_tensor_in , | |||
const TensorLayout& , | |||
const TensorLayout& , | |||
const TensorLayout& , | |||
PreprocessedFilter* , | |||
_megdnn_workspace ) override { | |||
        megdnn_throw("cuda conv_bias exec_preprocess has not been implemented yet"); | |||
} | |||
const char* get_algorithm_set_name() const override; | |||
@@ -73,22 +73,32 @@ ConvolutionForwardImpl::get_all_algorithms(const TensorLayout& src, | |||
size_t ConvolutionForwardImpl::get_workspace_in_bytes( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& dst) { | |||
const TensorLayout& dst, | |||
const PreprocessedFilter* preprocessed_filter) { | |||
auto extra_data = conv_bias_extra_data(dst); | |||
return static_cast<ConvBiasForwardImpl*>(extra_data.convbias_opr.get()) | |||
->get_workspace_in_bytes(src, filter, extra_data.bias_layout, | |||
extra_data.z_layout, dst); | |||
->get_workspace_in_bytes( | |||
src, filter, extra_data.bias_layout, extra_data.z_layout, | |||
dst, | |||
reinterpret_cast<const ConvolutionBase< | |||
param::ConvBias>::PreprocessedFilter*>( | |||
preprocessed_filter)); | |||
} | |||
void ConvolutionForwardImpl::exec(_megdnn_tensor_in src, | |||
_megdnn_tensor_in filter, | |||
_megdnn_tensor_out dst, | |||
const PreprocessedFilter* preprocessed_filter, | |||
_megdnn_workspace workspace) { | |||
auto extra_data = conv_bias_extra_data(dst.layout); | |||
TensorND bias(nullptr, extra_data.bias_layout); | |||
TensorND z(nullptr, extra_data.z_layout); | |||
return static_cast<ConvBiasForwardImpl*>(extra_data.convbias_opr.get()) | |||
->exec(src, filter, bias, z, dst, workspace); | |||
->exec(src, filter, bias, z, dst, | |||
reinterpret_cast<const ConvolutionBase< | |||
param::ConvBias>::PreprocessedFilter*>( | |||
preprocessed_filter), | |||
workspace); | |||
} | |||
const char* ConvolutionForwardImpl::get_algorithm_set_name() const { | |||
@@ -11,6 +11,7 @@ | |||
#pragma once | |||
#include "megdnn/oprs/nn.h" | |||
#include "src/common/utils.h" | |||
namespace megdnn { | |||
namespace cuda { | |||
@@ -18,10 +19,11 @@ namespace cuda { | |||
class ConvolutionForwardImpl: public ConvolutionForward { | |||
public: | |||
using ConvolutionForward::ConvolutionForward; | |||
void exec(_megdnn_tensor_in src, | |||
_megdnn_tensor_in filter, | |||
_megdnn_tensor_out dst, | |||
_megdnn_workspace workspace) override; | |||
void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
_megdnn_tensor_out dst, | |||
const PreprocessedFilter* preprocessed_filter, | |||
_megdnn_workspace workspace) override; | |||
std::vector<Algorithm *> get_all_algorithms(const TensorLayout &src, | |||
const TensorLayout &filter, | |||
const TensorLayout &dst) override; | |||
@@ -30,11 +32,28 @@ class ConvolutionForwardImpl: public ConvolutionForward { | |||
const TensorLayout& dst, | |||
size_t workspace_limit_in_bytes, | |||
bool reproducible) override; | |||
size_t get_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& filter, | |||
const TensorLayout& dst) override; | |||
size_t get_workspace_in_bytes( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& dst, | |||
const PreprocessedFilter* preprocessed_filter) override; | |||
const char* get_algorithm_set_name() const override; | |||
SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | |||
const TensorLayout&, const TensorLayout&, | |||
const TensorLayout&) override { | |||
return {}; | |||
} | |||
size_t get_preprocess_workspace_in_bytes( | |||
const TensorLayout& , const TensorLayout& , | |||
const TensorLayout& ) override{ | |||
return 0; | |||
} | |||
void exec_preprocess(const TensorLayout&, _megdnn_tensor_in, | |||
const TensorLayout&, PreprocessedFilter*, | |||
_megdnn_workspace) override { | |||
        megdnn_throw("cuda exec_preprocess has not been implemented yet"); | |||
} | |||
protected: | |||
struct ConvBiasExtraData{ | |||
std::unique_ptr<ConvBiasForward> convbias_opr; | |||
@@ -27,7 +27,7 @@ void MaskConvForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
_megdnn_workspace workspace) { | |||
megdnn_assert(dst.layout.dtype.enumv() == DTypeTrait<dtype::Float32>::enumv, | |||
"Mask conv only support Float32 dtype."); | |||
m_conv_opr->exec(src, filter, dst, workspace); | |||
m_conv_opr->exec(src, filter, dst, nullptr, workspace); | |||
auto stream = cuda_stream(handle()); | |||
#define cb(DType) \ | |||
if (mask.layout.dtype == DType()) { \ | |||
@@ -30,7 +30,7 @@ public: | |||
const TensorLayout& dst) override { | |||
MEGDNN_MARK_USED_VAR(mask); | |||
m_conv_opr->param() = param(); | |||
return m_conv_opr->get_workspace_in_bytes(src, filter, dst); | |||
return m_conv_opr->get_workspace_in_bytes(src, filter, dst, nullptr); | |||
} | |||
private: | |||
@@ -95,7 +95,9 @@ bool ConvBiasImpl::is_naive_algo(ConvBiasImpl::Algorithm* algo) { | |||
} | |||
void ConvBiasImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
_megdnn_tensor_in bias, _megdnn_tensor_in z, | |||
_megdnn_tensor_out dst, _megdnn_workspace workspace) { | |||
_megdnn_tensor_out dst, | |||
const PreprocessedFilter* preprocessed_filter, | |||
_megdnn_workspace workspace) { | |||
check_exec(src.layout, filter.layout, bias.layout, z.layout, dst.layout, | |||
workspace.size); | |||
auto fparam = make_ncb_kern_param(src, filter, bias, dst, workspace); | |||
@@ -104,20 +106,21 @@ void ConvBiasImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
ncb_algo_get_workspace(algo, fparam) <= workspace.size) { | |||
exec_with_ncb_kern(fparam, algo); | |||
} else { | |||
naive::ConvBiasForwardImpl::exec(src, filter, bias, z, dst, workspace); | |||
naive::ConvBiasForwardImpl::exec(src, filter, bias, z, dst, | |||
preprocessed_filter, workspace); | |||
} | |||
} | |||
size_t ConvBiasImpl::get_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& filter, | |||
const TensorLayout& bias, | |||
const TensorLayout& z, | |||
const TensorLayout& dst) { | |||
size_t ConvBiasImpl::get_workspace_in_bytes( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& bias, const TensorLayout& z, | |||
const TensorLayout& dst, | |||
const PreprocessedFilter* preprocessed_filter) { | |||
auto fparam = make_ncb_kern_size_param(src, filter, bias, dst); | |||
ConvBiasImpl::Algorithm* algo = get_algorithm(fparam); | |||
if (is_naive_algo(algo)) { | |||
return naive::ConvBiasForwardImpl::get_workspace_in_bytes(src, filter, | |||
bias, z, dst); | |||
return naive::ConvBiasForwardImpl::get_workspace_in_bytes( | |||
src, filter, bias, z, dst, preprocessed_filter); | |||
} else { | |||
return ncb_algo_get_workspace(algo, fparam); | |||
} | |||
@@ -41,14 +41,16 @@ public: | |||
//! implemented by exec_with_ncb_kern() | |||
void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
_megdnn_tensor_in bias, _megdnn_tensor_in z, | |||
_megdnn_tensor_out dst, _megdnn_workspace workspace) override; | |||
_megdnn_tensor_out dst, const PreprocessedFilter*, | |||
_megdnn_workspace workspace) override; | |||
//! implemented by get_workspace_with_ncb() | |||
size_t get_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& filter, | |||
const TensorLayout& bias, | |||
const TensorLayout& z, | |||
const TensorLayout& dst) override; | |||
const TensorLayout& dst, | |||
const PreprocessedFilter*) override; | |||
//! implemented by get_all_algorithms_with_ncb() | |||
std::vector<Algorithm*> get_all_algorithms( | |||
@@ -82,6 +82,7 @@ bool ConvolutionImpl::is_naive_algo(ConvolutionImpl::Algorithm* algo) { | |||
} | |||
void ConvolutionImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
_megdnn_tensor_out dst, | |||
const PreprocessedFilter* preprocessed_filter, | |||
_megdnn_workspace workspace) { | |||
auto fparam = make_ncb_kern_param(src, filter, dst, workspace); | |||
ConvolutionImpl::Algorithm* algo = get_algorithm(fparam, workspace.size); | |||
@@ -89,18 +90,20 @@ void ConvolutionImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
ncb_algo_get_workspace(algo, fparam) <= workspace.size) { | |||
exec_with_ncb_kern(fparam, algo); | |||
} else { | |||
naive::ConvolutionForwardImpl::exec(src, filter, dst, workspace); | |||
naive::ConvolutionForwardImpl::exec(src, filter, dst, | |||
preprocessed_filter, workspace); | |||
} | |||
} | |||
size_t ConvolutionImpl::get_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& filter, | |||
const TensorLayout& dst) { | |||
size_t ConvolutionImpl::get_workspace_in_bytes( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& dst, | |||
const PreprocessedFilter* preprocessed_filter) { | |||
auto fparam = make_ncb_kern_size_param(src, filter, dst); | |||
Algorithm* algo = get_algorithm(fparam); | |||
if (is_naive_algo(algo)) { | |||
return naive::ConvolutionForwardImpl::get_workspace_in_bytes( | |||
src, filter, dst); | |||
src, filter, dst, preprocessed_filter); | |||
} else { | |||
return ncb_algo_get_workspace(algo, fparam); | |||
} | |||
@@ -36,12 +36,14 @@ public: | |||
//! implemented by exec_with_ncb_kern() | |||
void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
_megdnn_tensor_out dst, _megdnn_workspace workspace) override; | |||
_megdnn_tensor_out dst, const PreprocessedFilter*, | |||
_megdnn_workspace workspace) override; | |||
//! implemented by get_workspace_with_ncb() | |||
size_t get_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& filter, | |||
const TensorLayout& dst) override; | |||
const TensorLayout& dst, | |||
const PreprocessedFilter*) override; | |||
//! implemented by get_all_algorithms_with_ncb() | |||
std::vector<Algorithm*> get_all_algorithms( | |||
@@ -54,7 +54,8 @@ size_t ConvBiasForwardImpl::get_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& flt, | |||
const TensorLayout& bias, | |||
const TensorLayout& z, | |||
const TensorLayout& dst) { | |||
const TensorLayout& dst, | |||
const PreprocessedFilter*) { | |||
size_t float_workspace_size = 0; | |||
if (z.ndim > 0 && z.dtype.category() != DTypeCategory::FLOAT) { | |||
@@ -79,6 +80,7 @@ size_t ConvBiasForwardImpl::get_workspace_in_bytes(const TensorLayout& src, | |||
void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
_megdnn_tensor_in bias, _megdnn_tensor_in z, | |||
_megdnn_tensor_out dst, | |||
const PreprocessedFilter*, | |||
_megdnn_workspace workspace) { | |||
MIDOUT_BEGIN(megdnn_naive_conv_bias_fwd) { | |||
dt_byte *workspace_ptr = workspace.raw_ptr; | |||
@@ -22,7 +22,9 @@ public: | |||
using ConvBiasForward::ConvBiasForward; | |||
void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
_megdnn_tensor_in bias, _megdnn_tensor_in z, | |||
_megdnn_tensor_out dst, _megdnn_workspace workspace) override; | |||
_megdnn_tensor_out dst, | |||
const PreprocessedFilter* preprocessed_filter, | |||
_megdnn_workspace workspace) override; | |||
std::vector<Algorithm*> get_all_algorithms( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
@@ -37,11 +39,32 @@ public: | |||
size_t workspace_limit_in_bytes, | |||
bool reproducible) override; | |||
size_t get_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& filter, | |||
const TensorLayout& bias, | |||
const TensorLayout& z, | |||
const TensorLayout& dst) override; | |||
size_t get_workspace_in_bytes( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& bias, const TensorLayout& z, | |||
const TensorLayout& dst, | |||
const PreprocessedFilter* preprocessed_filter) override; | |||
size_t get_preprocess_workspace_in_bytes(const TensorLayout&, | |||
const TensorLayout&, | |||
const TensorLayout&, | |||
const TensorLayout&, | |||
const TensorLayout&) override { | |||
return 0; | |||
} | |||
SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | |||
const TensorLayout&, const TensorLayout&, const TensorLayout&, | |||
const TensorLayout&, const TensorLayout&) override { | |||
return {}; | |||
} | |||
void exec_preprocess(const TensorLayout&, _megdnn_tensor_in, | |||
const TensorLayout&, const TensorLayout&, | |||
const TensorLayout&, PreprocessedFilter*, | |||
_megdnn_workspace) override{ | |||
        megdnn_throw("conv_bias exec_preprocess is not implemented yet"); | |||
} | |||
const char* get_algorithm_set_name() const override; | |||
}; | |||
@@ -26,15 +26,14 @@ using namespace megdnn; | |||
using namespace naive; | |||
void ConvolutionForwardImpl::exec(_megdnn_tensor_in src, | |||
_megdnn_tensor_in filter, | |||
_megdnn_tensor_out dst, | |||
_megdnn_workspace workspace) | |||
{ | |||
_megdnn_tensor_in filter, | |||
_megdnn_tensor_out dst, | |||
const PreprocessedFilter*, | |||
_megdnn_workspace workspace) { | |||
MIDOUT_BEGIN(megdnn_naive_conv_fwd) { | |||
auto filter_meta = check_exec( | |||
src.layout, filter.layout, dst.layout, workspace.size); | |||
using ComputeMode = Param::ComputeMode; | |||
auto filter_meta = check_exec(src.layout, filter.layout, dst.layout, | |||
workspace.size); | |||
using ComputeMode = Param::ComputeMode; | |||
#define DISPATCH_CMODE(in_dt, out_dt, in_ct, out_ct, comp_ct, cmode) \ | |||
do { \ | |||
using namespace dtype; \ | |||
@@ -52,24 +51,28 @@ void ConvolutionForwardImpl::exec(_megdnn_tensor_in src, | |||
#define cb(dt) \ | |||
DISPATCH(dt, dt, DTypeTrait<dt>::ctype, DTypeTrait<dt>::ctype, \ | |||
DTypeTrait<dt>::ctype) | |||
MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb); | |||
MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb); | |||
#undef cb | |||
DISPATCH(Int8, Int16, dt_int8, dt_int16, dt_int16); | |||
DISPATCH(Int8, Int32, dt_int8, dt_int32, dt_int32); | |||
DISPATCH(QuantizedS8, QuantizedS32, dt_int8, dt_int32, dt_int32); | |||
MEGDNN_INC_FLOAT16(DISPATCH_CMODE(Float16, Float16, dt_float16, dt_float16, | |||
dt_float32, ComputeMode::FLOAT32)); | |||
MEGDNN_INC_FLOAT16(DISPATCH_CMODE(BFloat16, BFloat16, dt_bfloat16, | |||
dt_bfloat16, dt_float32, | |||
ComputeMode::FLOAT32)); | |||
DISPATCH(Quantized8Asymm, QuantizedS32, dt_quint8, dt_qint32, dt_qint32); | |||
DISPATCH(QuantizedS8, QuantizedS8, dt_int8, dt_int8, dt_int32); | |||
DISPATCH(Int8, Int16, dt_int8, dt_int16, dt_int16); | |||
DISPATCH(Int8, Int32, dt_int8, dt_int32, dt_int32); | |||
DISPATCH(QuantizedS8, QuantizedS32, dt_int8, dt_int32, dt_int32); | |||
MEGDNN_INC_FLOAT16(DISPATCH_CMODE(Float16, Float16, dt_float16, | |||
dt_float16, dt_float32, | |||
ComputeMode::FLOAT32)); | |||
MEGDNN_INC_FLOAT16(DISPATCH_CMODE(BFloat16, BFloat16, dt_bfloat16, | |||
dt_bfloat16, dt_float32, | |||
ComputeMode::FLOAT32)); | |||
DISPATCH(Quantized8Asymm, QuantizedS32, dt_quint8, dt_qint32, | |||
dt_qint32); | |||
DISPATCH(QuantizedS8, QuantizedS8, dt_int8, dt_int8, dt_int32); | |||
#undef DISPATCH | |||
megdnn_throw(ssprintf("unsupported Conv(%s, %s) -> %s with cmode = %d", | |||
src.layout.dtype.name(), filter.layout.dtype.name(), | |||
dst.layout.dtype.name(), | |||
static_cast<int>(param().compute_mode))); | |||
} MIDOUT_END(); | |||
megdnn_throw(ssprintf("unsupported Conv(%s, %s) -> %s with cmode = %d", | |||
src.layout.dtype.name(), | |||
filter.layout.dtype.name(), | |||
dst.layout.dtype.name(), | |||
static_cast<int>(param().compute_mode))); | |||
} | |||
MIDOUT_END(); | |||
} | |||
size_t ConvolutionBackwardDataImpl::get_workspace_in_bytes(const TensorLayout& filter, | |||
@@ -10,6 +10,7 @@ | |||
*/ | |||
#pragma once | |||
#include "megdnn/oprs.h" | |||
#include "src/common/utils.h" | |||
namespace megdnn { | |||
namespace naive { | |||
@@ -17,10 +18,10 @@ namespace naive { | |||
class ConvolutionForwardImpl: public ConvolutionForward { | |||
public: | |||
using ConvolutionForward::ConvolutionForward; | |||
void exec(_megdnn_tensor_in src, | |||
_megdnn_tensor_in filter, | |||
_megdnn_tensor_out dst, | |||
_megdnn_workspace workspace) override; | |||
void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
_megdnn_tensor_out dst, | |||
const PreprocessedFilter* preprocessed_filter, | |||
_megdnn_workspace workspace) override; | |||
std::vector<Algorithm *> get_all_algorithms(const TensorLayout &src, | |||
const TensorLayout &filter, | |||
const TensorLayout &dst) override; | |||
@@ -30,10 +31,29 @@ class ConvolutionForwardImpl: public ConvolutionForward { | |||
size_t workspace_limit_in_bytes, | |||
bool reproducible) override; | |||
size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&, | |||
const TensorLayout&) override { | |||
const TensorLayout&, | |||
const PreprocessedFilter*) override { | |||
return 0; | |||
} | |||
size_t get_preprocess_workspace_in_bytes(const TensorLayout&, | |||
const TensorLayout&, | |||
const TensorLayout&) override { | |||
return 0; | |||
} | |||
void exec_preprocess(const TensorLayout&, _megdnn_tensor_in, | |||
const TensorLayout&, PreprocessedFilter*, | |||
_megdnn_workspace) override { | |||
        megdnn_throw("convolution exec_preprocess is not implemented yet"); | |||
} | |||
SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | |||
const TensorLayout& , const TensorLayout& , | |||
const TensorLayout& )override{ | |||
return {}; | |||
} | |||
const char* get_algorithm_set_name() const override; | |||
}; | |||
@@ -97,7 +97,7 @@ void ConvPoolingForwardImpl::exec(const _megdnn_in TensorND src, | |||
TensorND conv_dst((float*)(workspace.raw_ptr), conv_dst_layout); | |||
//convFwd->check_layout(src.layout, filter.layout, workspace.layout, empty_wsp.layout); | |||
check_layout(src.layout, filter.layout, bias.layout, dst.layout, workspace.size); | |||
convFwd->exec(src, filter, conv_dst, empty_wsp); | |||
convFwd->exec(src, filter, conv_dst, nullptr, empty_wsp); | |||
// calculate bias | |||
int conv_dst_batch = conv_dst.layout.shape[0]; | |||
@@ -80,7 +80,7 @@ void MaskConvForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
_megdnn_workspace workspace) { | |||
MEGDNN_MARK_USED_VAR(mask); | |||
m_conv_opr->param() = this->param(); | |||
m_conv_opr->exec(src, filter, dst, workspace); | |||
m_conv_opr->exec(src, filter, dst, nullptr, workspace); | |||
#define cb(DType) \ | |||
if (mask.layout.dtype == DType()) { \ | |||
using ctype = typename DTypeTrait<DType>::ctype; \ | |||
@@ -99,7 +99,7 @@ size_t MaskConvForwardImpl::get_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& dst) { | |||
MEGDNN_MARK_USED_VAR(mask); | |||
m_conv_opr->param() = this->param(); | |||
return m_conv_opr->get_workspace_in_bytes(src, filter, dst); | |||
return m_conv_opr->get_workspace_in_bytes(src, filter, dst, nullptr); | |||
} | |||
void MaskPropagateImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, | |||
@@ -103,7 +103,7 @@ void SeparableConvForwardImpl::exec(_megdnn_tensor_in src, | |||
ConvolutionForwardImpl* convOptr = new ConvolutionForwardImpl(this->handle()); | |||
Workspace empty_wsp; | |||
convOptr->exec(src, filter2d, dst, empty_wsp); | |||
convOptr->exec(src, filter2d, dst, nullptr, empty_wsp); | |||
delete(convOptr); | |||
free(filter2d_buf); | |||
@@ -664,7 +664,7 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD) { | |||
conv_bias_opr->get_workspace_in_bytes( | |||
tensors[0].layout, filter_transform_layout, | |||
tensors[2].layout, tensors[3].layout, | |||
tensors[4].layout); | |||
tensors[4].layout, nullptr); | |||
WorkspaceBundle wb(nullptr, {filter_transform_layout.span().dist_byte(), | |||
conv_bias_workspace_in_bytes, | |||
@@ -676,7 +676,8 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD) { | |||
winograd_preprocess_opr->exec(tensors[1], filter_transform_tensor, | |||
wb.get_workspace(2)); | |||
conv_bias_opr->exec(tensors[0], filter_transform_tensor, tensors[2], | |||
tensors[3], tensors[4], wb.get_workspace(1)); | |||
tensors[3], tensors[4], nullptr, | |||
wb.get_workspace(1)); | |||
free(wb.ptr()); | |||
}; | |||
@@ -1008,7 +1008,7 @@ void winograd_algo_extra_impl(const TensorNDArray& tensors, uint32_t m, | |||
conv_bias_opr->param().output_block_size = m; | |||
size_t conv_bias_workspace_in_bytes = conv_bias_opr->get_workspace_in_bytes( | |||
tensors[0].layout, filter_transform_layout, tensors[2].layout, | |||
tensors[3].layout, tensors[4].layout); | |||
tensors[3].layout, tensors[4].layout, nullptr); | |||
WorkspaceBundle wb(nullptr, {filter_transform_layout.span().dist_byte(), | |||
conv_bias_workspace_in_bytes, | |||
@@ -1020,7 +1020,7 @@ void winograd_algo_extra_impl(const TensorNDArray& tensors, uint32_t m, | |||
winograd_preprocess_opr->exec(tensors[1], filter_transform_tensor, | |||
wb.get_workspace(2)); | |||
conv_bias_opr->exec(tensors[0], filter_transform_tensor, tensors[2], | |||
tensors[3], tensors[4], wb.get_workspace(1)); | |||
tensors[3], tensors[4], nullptr, wb.get_workspace(1)); | |||
free(wb.ptr()); | |||
}; | |||
@@ -200,15 +200,70 @@ struct OprProxyProfilingTernary : public OprProxyProfilingBase<Opr, 3> { | |||
using OprProxyProfilingTernary<c>::OprProxyProfilingTernary; \ | |||
} | |||
DEF_PROF3(ConvolutionForward); | |||
DEF_PROF3(ConvolutionBackwardData); | |||
DEF_PROF3(ConvolutionBackwardFilter); | |||
DEF_PROF3(LocalShareForward); | |||
DEF_PROF3(LocalShareBackwardData); | |||
DEF_PROF3(LocalShareBackwardFilter); | |||
#undef DEF_PROF3 | |||
//! TODO: it should adapt weight preprocess later | |||
template <> | |||
struct OprProxy<ConvolutionForward> | |||
: public OprProxyProfilingTernary<ConvolutionForward> { | |||
using OprProxyProfilingTernary<ConvolutionForward>::OprProxyProfilingTernary; | |||
void exec(ConvolutionForward* opr, const TensorNDArray& tensors) { | |||
megdnn_assert(tensors.size() == 3); | |||
if (!Base::W.valid()) { | |||
Base::W = WorkspaceWrapper(opr->handle(), 0); | |||
} | |||
if (Base::m_profiling && !Base::target_algo) { | |||
size_t min_time = std::numeric_limits<size_t>::max(); | |||
for (auto algo : | |||
opr->get_all_algorithms(tensors[0].layout, tensors[1].layout, | |||
tensors[2].layout)) { | |||
opr->execution_policy().algorithm = algo; | |||
auto workspace_size = opr->get_workspace_in_bytes( | |||
tensors[0].layout, tensors[1].layout, tensors[2].layout, | |||
nullptr); | |||
Base::W.update(workspace_size); | |||
for (size_t times = 0; times < Base::warmup_times; ++times) | |||
opr->exec(tensors[0], tensors[1], tensors[2], nullptr, | |||
Base::W.workspace()); | |||
megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||
Timer timer; | |||
timer.start(); | |||
for (size_t times = 0; times < Base::exec_times; ++times) { | |||
opr->exec(tensors[0], tensors[1], tensors[2], nullptr, | |||
Base::W.workspace()); | |||
} | |||
megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||
timer.stop(); | |||
printf("%.3fms %s\n", timer.get_time_in_us() / 1e3, | |||
algo->name()); | |||
if (min_time > timer.get_time_in_us()) { | |||
min_time = timer.get_time_in_us(); | |||
Base::target_algo = algo; | |||
} | |||
} | |||
opr->execution_policy().algorithm = Base::target_algo; | |||
auto workspace_size = opr->get_workspace_in_bytes( | |||
tensors[0].layout, tensors[1].layout, tensors[2].layout, nullptr); | |||
Base::W.update(workspace_size); | |||
} | |||
if (!Base::target_algo) { | |||
auto workspace_size = opr->get_workspace_in_bytes( | |||
tensors[0].layout, tensors[1].layout, tensors[2].layout, | |||
nullptr); | |||
Base::W.update(workspace_size); | |||
} | |||
opr->exec(tensors[0], tensors[1], tensors[2], nullptr, | |||
Base::W.workspace()); | |||
} | |||
}; | |||
template <class Opr> | |||
struct OprProxyProfiling5 : public OprProxyProfilingBase<Opr, 5> { | |||
using Base = OprProxyProfilingBase<Opr, 5>; | |||
@@ -274,10 +329,67 @@ struct OprProxyProfiling5 : public OprProxyProfilingBase<Opr, 5> { | |||
DEF_PROF5(DeformableConvForward); | |||
DEF_PROF5(DeformableConvBackwardFilter); | |||
DEF_PROF5(ConvBiasForward); | |||
//DEF_PROF5(ConvBiasForward); | |||
DEF_PROF5(BatchConvBiasForward); | |||
#undef DEF_PROF5 | |||
//! TODO: it should adapt weight preprocess later | |||
template <> | |||
struct OprProxy<ConvBiasForward> : public OprProxyProfiling5<ConvBiasForward> { | |||
using OprProxyProfiling5<ConvBiasForward>::OprProxyProfiling5; | |||
void exec(ConvBiasForward* opr, const TensorNDArray& tensors) { | |||
megdnn_assert(tensors.size() == 5); | |||
if (!Base::W.valid()) { | |||
Base::W = WorkspaceWrapper(opr->handle(), 0); | |||
} | |||
if (Base::m_profiling && !Base::target_algo) { | |||
size_t min_time = std::numeric_limits<size_t>::max(); | |||
for (auto algo : | |||
opr->get_all_algorithms(tensors[0].layout, tensors[1].layout, | |||
tensors[2].layout, tensors[3].layout, | |||
tensors[4].layout)) { | |||
opr->execution_policy().algorithm = algo; | |||
auto workspace_size = opr->get_workspace_in_bytes( | |||
tensors[0].layout, tensors[1].layout, tensors[2].layout, | |||
tensors[3].layout, tensors[4].layout, nullptr); | |||
Base::W.update(workspace_size); | |||
for (size_t times = 0; times < Base::warmup_times; ++times) | |||
opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], | |||
tensors[4], nullptr, Base::W.workspace()); | |||
megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||
Timer timer; | |||
timer.start(); | |||
for (size_t times = 0; times < Base::exec_times; ++times) { | |||
opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], | |||
tensors[4], nullptr, Base::W.workspace()); | |||
} | |||
megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||
timer.stop(); | |||
printf("%.3fms %s\n", timer.get_time_in_us() / 1e3, | |||
algo->name()); | |||
if (min_time > timer.get_time_in_us()) { | |||
min_time = timer.get_time_in_us(); | |||
Base::target_algo = algo; | |||
} | |||
} | |||
opr->execution_policy().algorithm = Base::target_algo; | |||
auto workspace_size = opr->get_workspace_in_bytes( | |||
tensors[0].layout, tensors[1].layout, tensors[2].layout, | |||
tensors[3].layout, tensors[4].layout, nullptr); | |||
Base::W.update(workspace_size); | |||
} | |||
if (!Base::target_algo) { | |||
auto workspace_size = opr->get_workspace_in_bytes( | |||
tensors[0].layout, tensors[1].layout, tensors[2].layout, | |||
tensors[3].layout, tensors[4].layout, nullptr); | |||
Base::W.update(workspace_size); | |||
} | |||
opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], tensors[4], | |||
nullptr, Base::W.workspace()); | |||
} | |||
}; | |||
template <class Opr> | |||
struct OprProxyProfiling8 : public OprProxyProfilingBase<Opr, 8> { | |||
using Base = OprProxyProfilingBase<Opr, 8>; | |||
@@ -75,8 +75,8 @@ TEST_F(CPU, MASK_PROPAGATE) { | |||
auto dst = TensorND{dst_ptr, dst_layout}; | |||
WorkspaceWrapper workspace{ | |||
handle(), opr->get_workspace_in_bytes(src.layout, filter.layout, | |||
dst.layout)}; | |||
opr->exec(src, filter, dst, workspace.workspace()); | |||
dst.layout, nullptr)}; | |||
opr->exec(src, filter, dst, nullptr, workspace.workspace()); | |||
for (size_t i = 0; i < dst.layout.total_nr_elems(); ++i) { | |||
mask_dst.ptr<int>()[i] = dst_ptr[i] > 0; | |||
} | |||
@@ -176,6 +176,46 @@ public: | |||
} | |||
} | |||
//! special for weight preprocess | |||
void exec_convolution(ConvolutionForward* opr0, ConvolutionForward* opr1) { | |||
opr0->param().pad_h = pad_h; | |||
opr0->param().pad_w = pad_w; | |||
opr1->param() = opr0->param(); | |||
opr1->param().sparse = param::Convolution::Sparse::GROUP; | |||
TensorND a0, b0, c0, a1, b1, c1; | |||
std::tie(a0, b0, c0) = shuffle(std::make_tuple( | |||
src0->tensornd(), flt0->tensornd(), dst0->tensornd())); | |||
std::tie(a1, b1, c1) = shuffle(std::make_tuple( | |||
src1->tensornd(), flt1->tensornd(), dst1->tensornd())); | |||
WorkspaceWrapper wk( | |||
handle, | |||
std::max(opr0->get_workspace_in_bytes(a0.layout, b0.layout, | |||
c0.layout, nullptr), | |||
opr1->get_workspace_in_bytes(a1.layout, b1.layout, | |||
c1.layout, nullptr))); | |||
cudaProfilerStart(); | |||
cudaEventRecord(cuda_ev[0], cuda_stream); | |||
opr0->exec(a0, b0, c0, nullptr, wk.workspace()); | |||
cudaEventRecord(cuda_ev[1], cuda_stream); | |||
opr1->exec(a1, b1, c1, nullptr, wk.workspace()); | |||
cudaEventRecord(cuda_ev[2], cuda_stream); | |||
cudaProfilerStop(); | |||
if (getenv("MEGDNN_CHANWISE_CONV_VERBOSE") || | |||
getenv("MEGDNN_CHANWISE_CONV_FULLBENCH")) { | |||
cudaStreamSynchronize(cuda_stream); | |||
float t0 = -1, t1 = -1; | |||
cudaEventElapsedTime(&t0, cuda_ev[0], cuda_ev[1]); | |||
cudaEventElapsedTime(&t1, cuda_ev[1], cuda_ev[2]); | |||
printf("%s;%s;%s: cudnn/megdnn: %.3fms/%.3fms=%.3f\n", | |||
lsrc.TensorShape::to_string().c_str(), | |||
lflt1.TensorShape::to_string().c_str(), | |||
ldst.TensorShape::to_string().c_str(), | |||
t0, t1, t0 / t1); | |||
} | |||
} | |||
void cmp_dst() { | |||
Tensor<> dst0_cpu(handle_cpu, ldst), dst1_cpu(handle_cpu, ldst); | |||
megdnn_memcpy_D2H(handle, | |||
@@ -399,7 +439,7 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_BENCH_CHECK) { | |||
benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW); | |||
benv.fill_src(); | |||
benv.fill_flt(); | |||
benv.exec(conv0.get(), conv1.get()); | |||
benv.exec_convolution(conv0.get(), conv1.get()); | |||
benv.cmp_dst(); | |||
}; | |||
@@ -30,10 +30,10 @@ TEST(DISPATCHER, NULL_DISPATCHER) | |||
auto layout = TensorLayout({1, 1, 1, 1}, dtype::Float32()); | |||
TensorND src(nullptr, layout), filter(nullptr, layout), dst(nullptr, layout); | |||
auto wsize = opr->get_workspace_in_bytes(layout, layout, layout); | |||
auto wsize = opr->get_workspace_in_bytes(layout, layout, layout, nullptr); | |||
Workspace workspace(nullptr, wsize); | |||
opr->exec(src, filter, dst, workspace); | |||
opr->exec(src, filter, dst, nullptr, workspace); | |||
} | |||
#endif | |||
@@ -217,11 +217,11 @@ TEST_F(NAIVE, CONV_BIAS_QUANTIZED8x8x32_NCHW32) { | |||
size_t ws_size = conv_opr->get_workspace_in_bytes( | |||
src_layout_4, filter_layout_4, bias_layout_4, z_layout_4, | |||
dst_layout_4); | |||
dst_layout_4, nullptr); | |||
WorkspaceWrapper ws{handle(), ws_size}; | |||
conv_opr->exec(src_ts_4.tensornd(), filter_ts_4.tensornd(), | |||
bias_ts_4.tensornd(), z_ts_4.tensornd(), dst_ts_4.tensornd(), | |||
ws.workspace()); | |||
nullptr, ws.workspace()); | |||
TensorLayout src_layout_32{{N, IC / 32, IH, IW, 32}, | |||
dtype::QuantizedS8(0.1f)}; | |||
@@ -209,7 +209,8 @@ TEST_F(NAIVE, CONVOLUTION_WITH_NCHW4) { | |||
} | |||
auto workspace_size = conv->get_workspace_in_bytes( | |||
tensors[0].layout, tensors[1].layout, tensors[2].layout); | |||
tensors[0].layout, tensors[1].layout, tensors[2].layout, | |||
nullptr); | |||
dt_byte* workspace_ptr = static_cast<dt_byte*>(malloc(workspace_size)); | |||
Workspace workspace{workspace_ptr, workspace_size}; | |||
@@ -217,7 +218,7 @@ TEST_F(NAIVE, CONVOLUTION_WITH_NCHW4) { | |||
relayout->exec(nchw4_tensors[0], nchw_tensors[0]); | |||
relayout->exec(nchw4_tensors[1], nchw_tensors[1]); | |||
conv->exec(nchw_tensors[0], nchw_tensors[1], nchw_tensors[2], | |||
conv->exec(nchw_tensors[0], nchw_tensors[1], nchw_tensors[2], nullptr, | |||
workspace); | |||
relayout->exec(nchw_tensors[2], nchw4_tensors[2]); | |||
@@ -1334,8 +1334,8 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_WEIGHT_PREPROCESS) { | |||
size_t conv_bias_workspace_in_bytes = | |||
conv_bias_opr->get_workspace_in_bytes( | |||
tensors[0].layout, filter_transform_layout, | |||
tensors[2].layout, tensors[3].layout, | |||
tensors[4].layout); | |||
tensors[2].layout, tensors[3].layout, tensors[4].layout, | |||
nullptr); | |||
WorkspaceBundle wb(nullptr, {filter_transform_layout.span().dist_byte(), | |||
conv_bias_workspace_in_bytes, | |||
@@ -1347,7 +1347,8 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_WEIGHT_PREPROCESS) { | |||
winograd_preprocess_opr->exec(tensors[1], filter_transform_tensor, | |||
wb.get_workspace(2)); | |||
conv_bias_opr->exec(tensors[0], filter_transform_tensor, tensors[2], | |||
tensors[3], tensors[4], wb.get_workspace(1)); | |||
tensors[3], tensors[4], nullptr, | |||
wb.get_workspace(1)); | |||
free(wb.ptr()); | |||
}; | |||