@@ -561,7 +561,8 @@ void ConvolutionBase<Parameter>::check_or_deduce_dtype_fwd( | |||
src.enumv() == DTypeEnum::QuantizedS8 || | |||
src.enumv() == DTypeEnum::Quantized8Asymm || | |||
src.enumv() == DTypeEnum::QuantizedS4 || | |||
src.enumv() == DTypeEnum::Quantized4Asymm) { | |||
src.enumv() == DTypeEnum::Quantized4Asymm || | |||
src.enumv() == DTypeEnum::QuantizedS1) { | |||
supported_dst_dtype.push_back(dtype::QuantizedS32(mul_scale(src, filter))); | |||
bool cond_dst = dst.valid() && (dst.enumv() == src.enumv() || | |||
((dst.enumv() == DTypeEnum::QuantizedS4 || | |||
@@ -25,7 +25,7 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() { | |||
non_cudnn_algos.push_back(&matmul); | |||
non_cudnn_algos.push_back(&matmul8x8x32); | |||
non_cudnn_algos.push_back(&batched_matmul); | |||
non_cudnn_algos.push_back(&int1_simple); | |||
fill_cudnn_algos(); | |||
for (auto&& algo : cudnn_conv_bias_activations) { | |||
all_algos.push_back(&algo); | |||
@@ -45,6 +45,7 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() { | |||
conv_algos.push_back(&matmul8x8x32); | |||
conv_algos.push_back(&batched_matmul); | |||
conv_algos.push_back(&group); | |||
conv_algos.push_back(&int1_simple); | |||
for (auto&& algo : conv_algos) { | |||
all_algos.push_back(algo); | |||
@@ -87,6 +87,7 @@ public: | |||
CUDA_FALLBACK_NCHW_INT4, | |||
CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32, | |||
CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16, | |||
CUDA_SIMPLE_INT1, | |||
}; | |||
using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>; | |||
@@ -1089,6 +1090,24 @@ private: | |||
WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const; | |||
}; | |||
//! Reference conv_bias algorithm for 1-bit quantized (QuantizedS1) inputs:
//! converts src/filter (and the qint32 bias/z/dst) to float32 and delegates
//! the computation to a regular float conv_bias sub-operator.
class ConvBiasForwardImpl::AlgoSimpleInt1 final : public AlgoBase {
public:
    //! Available only when both src and filter are QuantizedS1 and a
    //! float32 sub-algorithm can be found.
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    //! Describes the delegated float32 conv_bias so the algorithm search
    //! can profile it as a sub-operator.
    std::vector<SearchItem> get_subopr_list(
            const TensorLayoutArray& layouts, const OperatorBase* opr) const override;
    const char* name() const override { return "CONVBIAS_SIMPLE_INT1"; }
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_SIMPLE_INT1)
private:
    //! Bundle layout: one conversion buffer per converted tensor
    //! (src, filter, bias, z, dst) followed by the sub-operator workspace.
    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
};
class ConvBiasForwardImpl::AlgoPack : NonCopyableObj { | |||
private: | |||
AlgoBase::Mapper m_all_algos_map; | |||
@@ -1132,6 +1151,7 @@ public: | |||
std::vector<AlgoFloat16NCHWHMMAImplicitBatchedGemm> f16_implicit_bmm; | |||
AlgoGroupConvGeneral group; | |||
AlgoBFloat16 bfloat16; | |||
AlgoSimpleInt1 int1_simple; | |||
AlgoBase* cudnn_conv_bias_act_from_enum(cudnnConvolutionFwdAlgo_t algo); | |||
@@ -30,6 +30,8 @@ bool ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::is_available( | |||
return false; | |||
} | |||
} | |||
if (args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS1) | |||
return false; | |||
if ((args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS4 || | |||
args.src_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) && | |||
args.filter_layout->dtype.enumv() == DTypeEnum::QuantizedS4) | |||
@@ -134,6 +134,9 @@ void ConvBiasDesc::set_conv( | |||
namespace conv_bias { | |||
bool is_cudnn_supported(const BiasForwardSizeArgs& args) { | |||
if (args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS1) | |||
return false; | |||
if ((args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS4 || | |||
args.src_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) && | |||
args.filter_layout->dtype.enumv() == DTypeEnum::QuantizedS4) | |||
@@ -221,6 +221,11 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic( | |||
return &sm_algo_pack.fallback_nchw_qs8; | |||
} | |||
if (sm_algo_pack.int1_simple.is_available_attribute( | |||
args, positive_attr, negative_attr, workspace_limit_in_bytes)) { | |||
return &sm_algo_pack.int1_simple; | |||
} | |||
if (args.src_layout->dtype.enumv() != DTypeTrait<dtype::BFloat16>::enumv) { | |||
return megdnn::get_algo_match_attribute<ConvBiasForwardImpl>( | |||
sm_algo_pack.non_cudnn_algos, args, workspace_limit_in_bytes, | |||
@@ -72,6 +72,7 @@ public: | |||
class AlgoInt4Int4NHWCIMMAImplicitGemm; | |||
class AlgoUInt4Int4NHWCIMMAImplicitGemm; | |||
class AlgoBFloat16; | |||
class AlgoSimpleInt1; | |||
// The following algorithms are suitable for channel wise convolution | |||
class AlgoFloat32NCHWFMAImplicitBatchedGemm; | |||
class AlgoFloat16NCHWHMMAImplicitBatchedGemm; | |||
@@ -0,0 +1,145 @@ | |||
/** | |||
* \file dnn/src/cuda/conv_bias/simple_int1.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. | |||
*/ | |||
#include "src/common/algo_base.h" | |||
#include "src/cuda/conv_bias/algo.h" | |||
#include "src/cuda/handle.h" | |||
#include "src/cuda/utils.cuh" | |||
#include "src/cuda/utils.h" | |||
using namespace megdnn; | |||
using namespace cuda; | |||
using namespace conv_bias; | |||
namespace { | |||
std::pair<TensorLayoutArray, ConvBiasForwardImpl::Param> sub_opr_config(
        const TensorLayoutArray& layouts, const ConvBiasForwardImpl* opr) {
    // Build the float32 configuration for the delegated conv_bias: copy the
    // five layouts (src, filter, bias, z, dst) and promote quantized dtypes.
    // All five entries are indexed below, so require at least 5 layouts —
    // the previous `>= 3` assert permitted an out-of-bounds access on
    // ret.first[3] / ret.first[4].
    megdnn_assert(layouts.size() >= 5);
    std::pair<TensorLayoutArray, ConvBiasForwardImpl::Param> ret;
    ret.first = layouts;
    // Promote 1-bit src/filter and 32-bit bias/z/dst to Float32 so an
    // ordinary float conv_bias sub-operator can process them.
    auto change_dtype = [](TensorLayout& layout) {
        if (layout.dtype.enumv() == DTypeEnum::QuantizedS1 ||
            layout.dtype.enumv() == DTypeEnum::QuantizedS32) {
            layout.dtype = dtype::Float32();
        }
    };
    for (size_t i = 0; i < 5; ++i) {
        change_dtype(ret.first[i]);
    }
    // Reuse the caller's param, but force DEFAULT compute mode for the
    // float32 sub-operator.
    ret.second = opr->param();
    ret.second.compute_mode = ConvBiasForwardImpl::Param::ComputeMode::DEFAULT;
    return ret;
}
std::pair<TensorLayoutArray, std::unique_ptr<ConvBiasForward>> prepare_sub_opr(
        const ConvBiasForwardImpl::AlgoBase::SizeArgs& args) {
    // Collect the five layouts in conv_bias order and derive the float32
    // layouts/param for the delegated operator.
    auto&& config = sub_opr_config(
            {*args.src_layout, *args.filter_layout, *args.bias_layout, *args.z_layout,
             *args.dst_layout},
            args.opr);
    // Create the sub-operator on the same handle and install the derived param.
    auto sub_opr = args.handle->create_operator<ConvBias>();
    sub_opr->param() = config.second;
    return {config.first, std::move(sub_opr)};
}
} // namespace | |||
std::vector<Algorithm::SearchItem> ConvBiasForwardImpl::AlgoSimpleInt1::get_subopr_list(
        const TensorLayoutArray& layouts, const OperatorBase* opr) const {
    // Describe the float32 conv_bias this algorithm delegates to so the
    // search machinery can profile it as a sub-operator.
    const auto* conv_bias_opr = static_cast<const ConvBiasForwardImpl*>(opr);
    auto&& config = sub_opr_config(layouts, conv_bias_opr);
    std::string serialized_param;
    Algorithm::serialize_write_pod(config.second, serialized_param);
    return {{Algorithm::OprType::CONVBIAS_FORWARD, serialized_param, config.first}};
}
bool ConvBiasForwardImpl::AlgoSimpleInt1::is_available(const SizeArgs& args) const { | |||
if (args.src_layout->dtype.valid() && args.filter_layout->dtype.valid() && | |||
args.bias_layout->dtype.valid() && args.z_layout->dtype.valid() && | |||
args.dst_layout->dtype.valid()) { | |||
auto config = prepare_sub_opr(args); | |||
return args.src_layout->dtype.enumv() == args.filter_layout->dtype.enumv() && | |||
args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS1 && | |||
get_algorithm( | |||
static_cast<ConvBiasForwardImpl*>(config.second.get()), | |||
config.first[0], config.first[1], config.first[2], | |||
config.first[3], config.first[4]); | |||
} else { | |||
return false; | |||
} | |||
} | |||
WorkspaceBundle ConvBiasForwardImpl::AlgoSimpleInt1::get_workspace_bundle(
        void* ptr, const SizeArgs& args) const {
    auto config = prepare_sub_opr(args);
    SmallVector<size_t> sizes;
    // One conversion buffer per tensor whose dtype differs from the
    // sub-operator's float32 layout.
    // NOTE(review): exec() reads fixed bundle indices 0..5, i.e. it assumes
    // all five tensors get a buffer here. That holds when src/filter are
    // QuantizedS1 and bias/z/dst are QuantizedS32 (all remapped to Float32
    // by sub_opr_config) — confirm if other dtype combinations are ever
    // accepted by is_available().
    auto get_workspace = [&sizes](const TensorLayout& src, const TensorLayout& dst) {
        if (src.dtype != dst.dtype) {
            sizes.push_back(dst.span().dist_byte());
        }
    };
    get_workspace(*args.src_layout, config.first[0]);
    get_workspace(*args.filter_layout, config.first[1]);
    get_workspace(*args.bias_layout, config.first[2]);
    get_workspace(*args.z_layout, config.first[3]);
    get_workspace(*args.dst_layout, config.first[4]);
    // Last entry: workspace needed by the float32 sub-operator itself.
    sizes.push_back(config.second->get_workspace_in_bytes(
            config.first[0], config.first[1], config.first[2], config.first[3],
            config.first[4], nullptr));
    return {ptr, std::move(sizes)};
}
size_t ConvBiasForwardImpl::AlgoSimpleInt1::get_workspace_in_bytes(
        const SizeArgs& args) const {
    // Total of the conversion buffers plus the sub-operator workspace.
    auto bundle = get_workspace_bundle(nullptr, args);
    return bundle.total_size_in_bytes();
}
void ConvBiasForwardImpl::AlgoSimpleInt1::exec(const ExecArgs& args) const {
    // Float32 views of the five tensors; the converters below redirect each
    // one into a workspace buffer and fill it with converted data.
    TensorND fsrc_tensor = *args.src_tensor;
    TensorND ffilter_tensor = *args.filter_tensor;
    TensorND fbias_tensor = *args.bias_tensor;
    TensorND fz_tensor = *args.z_tensor;
    TensorND fdst_tensor = *args.dst_tensor;
    auto config = prepare_sub_opr(args);
    // Bundle layout (see get_workspace_bundle): [0]=src, [1]=filter,
    // [2]=bias, [3]=z, [4]=dst conversion buffers, [5]=sub-opr workspace.
    auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args);
    // Convert the 1-bit src/filter to float32 (consumes bundle entries 0-1).
    CompTypeCvter<dtype::QuantizedS1, dtype::Float32> cvter(args.handle, &bundle);
    {
        cvter.src_to_comp_type(*args.src_tensor, fsrc_tensor)
                .src_to_comp_type(*args.filter_tensor, ffilter_tensor);
    }
    // Re-bundle entries 2-5 so a second converter can handle the qint32
    // bias/z/dst tensors starting at its own index 0.
    WorkspaceBundle dst_bundle = {
            bundle.get(2),
            {bundle.get_size(2), bundle.get_size(3), bundle.get_size(4),
             bundle.get_size(5)}};
    CompTypeCvter<dtype::QuantizedS32, dtype::Float32> dst_cvter(
            args.handle, &dst_bundle);
    {
        dst_cvter.src_to_comp_type(*args.bias_tensor, fbias_tensor)
                .src_to_comp_type(*args.z_tensor, fz_tensor)
                .src_to_comp_type(*args.dst_tensor, fdst_tensor);
    }
    // Run the float32 sub-operator; the remaining workspace (after the
    // conversion buffers were claimed) serves as its scratch space.
    config.second->exec(
            fsrc_tensor, ffilter_tensor, fbias_tensor, fz_tensor, fdst_tensor, nullptr,
            dst_cvter.workspace());
    // Convert the float32 result back into the caller's qint32 dst tensor.
    { dst_cvter.comp_to_dst_type(fdst_tensor, *args.dst_tensor); }
}
// vim: syntax=cpp.doxygen |
@@ -44,6 +44,10 @@ std::pair<TensorLayoutArray, ConvBiasForward::Param> sub_opr_config( | |||
src.dtype.param<dtype::Quantized4Asymm>().scale * | |||
filter.dtype.param<dtype::Quantized4Asymm>().scale); | |||
} else if (src.dtype.enumv() == DTypeEnum::QuantizedS1) { | |||
bias_type = dtype::QuantizedS32( | |||
src.dtype.param<dtype::QuantizedS1>().scale * | |||
filter.dtype.param<dtype::QuantizedS1>().scale); | |||
} else { | |||
megdnn_assert(src.dtype.category() == DTypeCategory::FLOAT); | |||
bias_type = src.dtype; | |||
@@ -278,6 +278,9 @@ void ConvBiasForwardImpl::exec( | |||
DISPATCH_RAW( | |||
Quantized4Asymm, QuantizedS4, QuantizedS32, QuantizedS32, DEFAULT, | |||
(convolution::forward_bias<dt_quint4, dt_qint4, dt_qint32, dt_qint32>)) | |||
DISPATCH_RAW( | |||
QuantizedS1, QuantizedS1, QuantizedS32, QuantizedS32, FLOAT32, | |||
(convolution::forward_bias<dt_qint1, dt_qint1, dt_qint32, dt_qint32>)) | |||
#if !MEGDNN_DISABLE_FLOAT16 | |||
DISPATCH(Float16, Float16) | |||
DISPATCH_RAW( | |||
@@ -84,6 +84,15 @@ inline void StrategyFwd::on( | |||
d += cast(s) * cast(f); | |||
} | |||
// Naive-forward accumulation for 1-bit quantized src/filter into a 32-bit
// quantized accumulator: widen each 1-bit operand to qint32 before the
// multiply so the product is computed at full accumulator width. The DType
// parameters are unused by this specialization.
template <>
inline void StrategyFwd::on(
        dt_qint1& s, dt_qint1& f, dt_qint32& d, DType, DType, DType) {
    // Widen: dt_qint1 exposes its value via as_int8(); lift to int32.
    auto cast = [](const dt_qint1& val) {
        return dt_qint32(static_cast<int32_t>(val.as_int8()));
    };
    d += cast(s) * cast(f);
}
struct StrategyBwdData { | |||
template <typename st, typename ft, typename dt> | |||
static void on(st& s, ft& f, dt& d, DType, DType, DType) { | |||
@@ -133,6 +133,32 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_BF16) { | |||
} | |||
} | |||
// Exercises the CONVBIAS_SIMPLE_INT1 fallback: QuantizedS1 src/filter with
// QuantizedS32 bias/z/dst in NCHW, computed through the float32 MATMUL
// sub-operator (FLOAT32 compute mode).
TEST_F(CUDA, CONV_BIAS_FORWARD_QS1) {
    require_compute_capability(6, 1);
    // NOTE(review): range {1, 1} makes every generated element equal to 1,
    // so only constant input data is covered — consider {0, 1} to exercise
    // genuinely 1-bit-varied data. Confirm whether this was intentional.
    UniformIntRNG int_rng{1, 1};
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvBiasForward>(
            ExecutionPolicyAlgoName{"CONVBIAS_SIMPLE_INT1", {{"MATMUL", {}}}}));
    ConvBias::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
    {
        // N=20, C=2, 224x224 input; 20 output channels, 3x3 kernel.
        auto src_shape = TensorShape{20, 2, 224, 224};
        auto filter_shape = TensorShape{20, 2, 3, 3};
        // Tensor slots: 0=src, 1=filter, 2=bias, 3=z, 4=dst; bias/z/dst are
        // left empty so the checker deduces their layouts.
        checker.set_dtype(0, dtype::QuantizedS1(1.0f))
                .set_dtype(1, dtype::QuantizedS1(1.0f))
                .set_dtype(2, dtype::QuantizedS32(1.0f))
                .set_dtype(3, dtype::QuantizedS32(1.0f))
                .set_dtype(4, dtype::QuantizedS32(1.0f))
                .set_rng(0, &int_rng)
                .set_rng(1, &int_rng)
                .set_param(param)
                .execs({src_shape, filter_shape, {}, {}, {}});
    }
}
TEST_F(CUDA, CONV_BIAS_FORWARD_QS8) { | |||
require_compute_capability(6, 1); | |||