You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

opr_impl.h 21 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533
  1. /**
  2. * \file dnn/src/fallback/convolution/opr_impl.h
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #pragma once
  12. #include <memory>
  13. #include <unordered_map>
  14. #include "megdnn/oprs/base.h"
  15. #include "src/common/utils.h"
  16. #include "src/common/algo_base.h"
  17. #include "src/fallback/handle.h"
  18. #include "src/naive/convolution/opr_impl.h"
  19. namespace megdnn {
/**
 * \brief Convolution algo category
 *
 * Classifies convolution algorithms so that a preferred category order can
 * be suggested for a given problem (see suggest_algo_category_order()).
 */
enum class AlgoCategory : int32_t {
    DIRECT = 0,    //!< direct convolution
    IM2COL = 1,    //!< im2col + matmul based
    WINOGRAD = 2,  //!< Winograd transform based
    NAIVE = 3,     //!< reference/naive implementation
};
//! packed type descriptor of a conv algorithm: the data types it handles
//! and the category it belongs to; used as the key for select_algo_type()
struct ConvAlgoTypePack {
    detail::AlgoDataType data_type : 32;
    AlgoCategory algo_category : 32;
};
  33. namespace fallback {
/*!
 * \brief fallback convolution forward impl
 *
 * Note: this operator class serves for multiple purposes:
 *
 * 1. canonizing conv reprs into NCBKernParam and NCBKernSizeParam, and
 *    subclasses should impl by overriding *_ncb methods
 * 2. providing a default impl for group conv by calling ncb_1g* methods
 * 3. providing a conv impl faster than naive under some cases
 * 4. providing a default impl for choosing heuristic algorithm, by using the
 *    first algo that fits the workspace limit
 */
class ConvolutionImpl : public naive::ConvolutionForwardImpl {
public:
    using naive::ConvolutionForwardImpl::ConvolutionForwardImpl;
    using AlgoSelectionStrategy = detail::AlgoSelectionStrategy;
    using AlgoDataType = detail::AlgoDataType;

    //! implemented by exec_with_ncb_kern()
    void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
              _megdnn_tensor_out dst, const PreprocessedFilter*,
              _megdnn_workspace workspace) override;

    //! implemented by exec_preprocess_with_ncb_kern(); fills
    //! \p preprocessed_filter with the algo's transformed weights
    void exec_preprocess(const TensorLayout& src_layout,
                         _megdnn_tensor_in filter,
                         const TensorLayout& dst_layout,
                         PreprocessedFilter* preprocessed_filter,
                         _megdnn_workspace workspace) override;

    //! implemented by get_workspace_with_ncb()
    size_t get_workspace_in_bytes(const TensorLayout& src,
                                  const TensorLayout& filter,
                                  const TensorLayout& dst,
                                  const PreprocessedFilter*) override;

    //! layouts of the weight-preprocess output tensors (empty when the
    //! selected algo does not preprocess weights)
    SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
            const TensorLayout& src, const TensorLayout& filter,
            const TensorLayout& dst) override;

    //! workspace needed while running the weight-preprocess kernels
    size_t get_preprocess_workspace_in_bytes(const TensorLayout& src,
                                             const TensorLayout& filter,
                                             const TensorLayout& dst) override;

    //! implemented by get_all_algorithms_with_ncb()
    std::vector<Algorithm*> get_all_algorithms(
            const TensorLayout& src, const TensorLayout& filter,
            const TensorLayout& dst) override;
    std::vector<Algorithm*> get_all_algorithms_safe(
            const TensorLayout& src, const TensorLayout& filter,
            const TensorLayout& dst) override;

    //! implemented by get_algorithm_heuristic_with_ncb()
    Algorithm* get_algorithm_heuristic(
            const TensorLayout& src, const TensorLayout& filter,
            const TensorLayout& dst, size_t workspace_limit_in_bytes,
            const AlgoAttribute& positive_attr,
            const AlgoAttribute& negative_attr) override;

    //! size param for kernels with non-contiguous batch
    struct NCBKernSizeParam {
        uint32_t n;  //!< batch size
        //! input / output spatial sizes
        std::array<uint32_t, MAX_SPATIAL_DIM> isz, osz;
        //! filter info; group is guaranteed to be 1
        CanonizedFilterMeta filter_meta;
        DType src_type, filter_type, dst_type;
        //! stride for batch of input, output
        ptrdiff_t inp_bs, out_bs;
        //! stride for each dim of input, output
        ptrdiff_t inp_s[4], out_s[4];
        Param::ComputeMode compute_mode;
        size_t nr_threads;
        //! weight_preprocess info
        const PreprocessedFilter* preprocessed_filter;
        //! get the data type category of the param for select the algo
        AlgoDataType deduce_algo_data_type() const;
    };

    //! memory param for kernels with non-contiguous batch
    struct NCBKernParam : public NCBKernSizeParam {
        const void* src_ptr;
        const void* filter_ptr;
        void* dst_ptr;
        void* workspace_ptr;
        size_t workspace_size;

        //! typed accessors; each asserts the ctype is compatible with the
        //! corresponding DType before casting
        template <typename T>
        const T* src() const {
            src_type.assert_is_compatible_ctype<T>();
            return static_cast<const T*>(src_ptr);
        }
        template <typename T>
        const T* filter() const {
            filter_type.assert_is_compatible_ctype<T>();
            return static_cast<const T*>(filter_ptr);
        }
        template <typename T>
        T* dst() const {
            dst_type.assert_is_compatible_ctype<T>();
            return static_cast<T*>(dst_ptr);
        }
        template <typename T>
        T* workspace() const {
            return static_cast<T*>(workspace_ptr);
        }

        //! when format is nchwxx and channel wise, multi group will pack into
        //! one group_pack_id. group_pack_size is the number of packed group
        //! together, like weight shape is {g/8, 1, 1, Fh, Fw, 8}
        //!
        //! the offsets below are computed in bytes (element count times
        //! DType::size()), then applied via integer pointer arithmetic
        template <typename T>
        T* dst(size_t batch_id, size_t group_pack_id,
               size_t group_pack_size = 1_z) const {
            size_t batch_offset = batch_id * out_bs * dst_type.size();
            size_t group_offset = group_pack_size * group_pack_id *
                                  filter_meta.ocpg * osz[0] * osz[1] *
                                  dst_type.size();
            return reinterpret_cast<T*>(reinterpret_cast<ptrdiff_t>(dst_ptr) +
                                        batch_offset + group_offset);
        }

        template <typename T>
        const T* src(size_t batch_id, size_t group_pack_id,
                     size_t group_pack_size = 1_z) const {
            size_t batch_offset = batch_id * inp_bs * src_type.size();
            size_t group_offset = group_pack_size * group_pack_id *
                                  filter_meta.icpg * isz[0] * isz[1] *
                                  src_type.size();
            return reinterpret_cast<T*>(reinterpret_cast<ptrdiff_t>(src_ptr) +
                                        batch_offset + group_offset);
        }

        template <typename T>
        const T* filter(size_t group_pack_id,
                        size_t pack_group_size = 1_z) const {
            size_t group_offset = pack_group_size * group_pack_id *
                                  filter_meta.icpg * filter_meta.ocpg *
                                  filter_meta.spatial[0] *
                                  filter_meta.spatial[1] * filter_type.size();
            return reinterpret_cast<T*>(
                    reinterpret_cast<ptrdiff_t>(filter_ptr) + group_offset);
        }
    };

    /**
     * \brief kernel run-time index; used by a kernel to locate the slice of
     * work it should process
     */
    struct NCBKernIndex {
        size_t thread_id = 0;  //!< Thread id
        CpuNDRange ndrange_id;
    };

    //! kernel entry: invoked once per NCBKernIndex within global_size
    using ncb_kern_t = thin_function<void(const NCBKernParam& param,
                                          const NCBKernIndex& ncb_index)>;
    struct NCBKern {
        ncb_kern_t kern;  //!< conv kern parallel ptr
        CpuNDRange global_size;
    };

    //! base class of all concrete fallback conv forward algorithms
    class AlgoBase : public Algorithm {
    public:
        AlgoBase() : Algorithm() {
            m_handle_type = Handle::HandleType::FALLBACK;
        }
        enum class AlgoType : uint32_t {
            //! fallback
            FB_ALGO = 1 << 0,
            FB_NAIVE,
            FB_DEFAULT,
        };

        virtual ~AlgoBase() = default;
        virtual bool usable(const NCBKernSizeParam& param,
                            AlgoSelectionStrategy) const = 0;
        virtual size_t get_workspace(const NCBKernSizeParam& param) const = 0;

        virtual SmallVector<NCBKern> dispatch_kern(
                const NCBKernSizeParam& param) const = 0;

        //! kernels that transform the weights ahead of exec; default: none
        virtual SmallVector<NCBKern> dispatch_preprocess_kern(
                const NCBKernSizeParam&) const {
            return {};
        };

        //! get the layouts of weight_preprocess dst
        virtual SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
                const NCBKernSizeParam&) const {
            return {};
        };

        //! get the workspace when weight_preprocess
        virtual size_t get_preprocess_workspace(const NCBKernSizeParam&) const {
            return 0_z;
        };

        //! Temporarily used to identify whether the matmul algorithm is
        //! preferred.
        virtual bool is_preferred(const NCBKernSizeParam&) const {
            return false;
        }

        //! usable() combined with the attribute filter: the algo must carry
        //! all of \p positive_attr and none of \p negative_attr
        bool usable_attribute(const NCBKernSizeParam& param,
                              AlgoSelectionStrategy algo_selection_strategy,
                              const AlgoAttribute& positive_attr =
                                      AlgoAttribute::REPRODUCIBLE,
                              const AlgoAttribute& negative_attr =
                                      AlgoAttribute::DEFAULT) const {
            return contain_attribute_all(positive_attr) &&
                   !contain_attribute_any(negative_attr) &&
                   usable(param, algo_selection_strategy);
        }

        //! get the type of the algo
        virtual ConvAlgoTypePack get_algo_type() const = 0;
        using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;
    };

    /**
     * \brief get all the algorithm for the opr.
     */
    virtual SmallVector<AlgoBase*> get_all_packed_algo();

    /**
     * \brief select algo according to input algo type
     */
    SmallVector<AlgoBase*> select_algo_type(ConvAlgoTypePack algo_type);

protected:
    virtual void exec_with_ncb_kern(const NCBKernParam& param, Algorithm* algo);

    virtual void exec_preprocess_with_ncb_kern(const NCBKernParam& param,
                                               Algorithm* algo);

    virtual std::vector<Algorithm*> get_all_algorithms_with_ncb(
            const NCBKernSizeParam& param);

    virtual Algorithm* get_algorithm_heuristic_with_ncb(
            const NCBKernSizeParam& param, size_t workspace_limit_in_bytes,
            const AlgoAttribute& positive_attr,
            const AlgoAttribute& negative_attr);

    const char* get_algorithm_set_name() const override;

    class AlgoFallback;
    class AlgoNaive;
    class AlgoDefault;
    class AlgoPack;

private:
    //! cache of the last algorithm selection, keyed by the size param
    NCBKernSizeParam m_prev_selected_algo_sizep;
    Algorithm* m_prev_selected_algo = nullptr;

    Algorithm* get_algorithm_from_desc(const AlgorithmDesc& desc) override;

    bool is_naive_algo(ConvolutionImpl::Algorithm* algo);

    //! get algorithm set by user or by heuristic
    Algorithm* get_algorithm(
            const NCBKernSizeParam& param,
            size_t workspace_size = std::numeric_limits<size_t>::max());

    NCBKernSizeParam make_ncb_kern_size_param(
            const TensorLayout& src, const TensorLayout& filter,
            const TensorLayout& dst,
            const PreprocessedFilter* preprocessed_filter);

    NCBKernParam make_ncb_kern_param(
            _megdnn_tensor_in src, _megdnn_tensor_in filter,
            _megdnn_tensor_out dst,
            const PreprocessedFilter* preprocessed_filter,
            _megdnn_workspace workspace);

    //! ordered list of AlgoCategory to try for the given problem size
    SmallVector<AlgoCategory> suggest_algo_category_order(
            const NCBKernSizeParam& param) const;

public:
    //! maintain all the algos of in the opr of fallback
    static const AlgoPack& algo_pack();
};
  270. class ConvolutionBackwardDataImpl : public naive::ConvolutionBackwardDataImpl {
  271. public:
  272. using naive::ConvolutionBackwardDataImpl::ConvolutionBackwardDataImpl;
  273. void exec(_megdnn_tensor_in filter, _megdnn_tensor_in diff,
  274. _megdnn_tensor_out grad, _megdnn_workspace workspace) override;
  275. size_t get_workspace_in_bytes(const TensorLayout& flter,
  276. const TensorLayout& diff,
  277. const TensorLayout& grad) override;
  278. std::vector<Algorithm*> get_all_algorithms(
  279. const TensorLayout& filter, const TensorLayout& diff,
  280. const TensorLayout& grad) override;
  281. std::vector<Algorithm*> get_all_algorithms_safe(
  282. const TensorLayout& filter, const TensorLayout& diff,
  283. const TensorLayout& grad) override;
  284. Algorithm* get_algorithm_heuristic(
  285. const TensorLayout& filter, const TensorLayout& diff,
  286. const TensorLayout& grad, size_t workspace_limit_in_bytes,
  287. const AlgoAttribute& positive_attr,
  288. const AlgoAttribute& negative_attr) override;
  289. const char* get_algorithm_set_name() const override;
  290. //! size param for kernels with non-contiguous batch
  291. struct NCBKernSizeParam {
  292. uint32_t n;
  293. std::array<uint32_t, MAX_SPATIAL_DIM> isz, osz;
  294. //! filter info; group is guaranteed to be 1
  295. CanonizedFilterMeta filter_meta;
  296. DType diff_type, filter_type, grad_type;
  297. TensorLayout diff_layout, filter_layout, grad_layout;
  298. //! stride for batch of input, output
  299. ptrdiff_t inp_bs, out_bs;
  300. //! extra_mem_size (in bytes) memory after the end of the logical
  301. //! memory block is accessible.
  302. //!
  303. //! this allows for eliminating unnecessary memory copies: e.g.
  304. //! if several bytes after the end of the tensor are
  305. //! accessible, some kernel implementations can utilize
  306. //! out-of-bound SIMD memory access, to avoid issuing
  307. //! memcpy instructions.
  308. //!
  309. //! Note that although extra_mem_size bytes are accessible by the
  310. //! kernel implementation, kernel implementation should not have any
  311. //! ``visible'' effect on any unintended memory location.
  312. //! This means reading and writing the same value to some memory
  313. //! location within extra_mem_size is allowed, but writing a
  314. //! different value is not allowed.
  315. size_t diff_extra_mem_size, filter_extra_mem_size, grad_extra_mem_size;
  316. Param::ComputeMode compute_mode;
  317. };
  318. //! memory param for kernels with non-contiguous batch
  319. struct NCBKernParam : public NCBKernSizeParam {
  320. const void* filter_ptr;
  321. const void* diff_ptr;
  322. void* grad_ptr;
  323. void* workspace_ptr;
  324. size_t workspace_size;
  325. template <typename T>
  326. const T* diff() const {
  327. diff_type.assert_is_compatible_ctype<T>();
  328. return static_cast<const T*>(diff_ptr);
  329. }
  330. template <typename T>
  331. const T* filter() const {
  332. filter_type.assert_is_compatible_ctype<T>();
  333. return static_cast<const T*>(filter_ptr);
  334. }
  335. template <typename T>
  336. T* grad() const {
  337. grad_type.assert_is_compatible_ctype<T>();
  338. return static_cast<T*>(grad_ptr);
  339. }
  340. template <typename T>
  341. T* workspace() const {
  342. return static_cast<T*>(workspace_ptr);
  343. }
  344. };
  345. protected:
  346. using ncb_kern_t = thin_function<void(const NCBKernParam& param)>;
  347. class AlgoBase : public Algorithm {
  348. protected:
  349. ~AlgoBase() = default;
  350. public:
  351. AlgoBase() : Algorithm() {
  352. m_handle_type = Handle::HandleType::FALLBACK;
  353. }
  354. enum class AlgoType : uint32_t {
  355. //! fallback
  356. FB_NAIVE = 1 << 0,
  357. FB_DIRECT,
  358. FB_MATMUL,
  359. #if MEGDNN_AARCH64 || MEGDNN_ARMV7
  360. ARM_COMMON_DIRECT_STRD1_DOT_INT8X8X32 = 1 << 8,
  361. ARM_COMMON_DIRECT_STRD2_DOT_INT8X8X32,
  362. ARM_COMMON_DIRECT_STRD1_DOT_QU8,
  363. ARM_COMMON_DIRECT_STRD2_DOT_QU8
  364. #endif
  365. };
  366. virtual bool usable(ConvolutionBackwardDataImpl* opr,
  367. const NCBKernSizeParam& param) const = 0;
  368. virtual size_t get_workspace(ConvolutionBackwardDataImpl* opr,
  369. const NCBKernSizeParam& param) const = 0;
  370. virtual ncb_kern_t dispatch_kern(
  371. ConvolutionBackwardDataImpl* opr,
  372. const NCBKernSizeParam& param) const = 0;
  373. bool usable_attribute(ConvolutionBackwardDataImpl* opr,
  374. const NCBKernSizeParam& param,
  375. const AlgoAttribute& positive_attr =
  376. AlgoAttribute::REPRODUCIBLE,
  377. const AlgoAttribute& negative_attr =
  378. AlgoAttribute::DEFAULT) const {
  379. return contain_attribute_all(positive_attr) &&
  380. !contain_attribute_any(negative_attr) && usable(opr, param);
  381. }
  382. virtual bool is_preferred(const NCBKernSizeParam&) const {
  383. return false;
  384. }
  385. //! if the algo is naive, it will not split by group
  386. virtual bool is_naive() const { return false; }
  387. using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;
  388. };
  389. protected:
  390. //! default impl calls ncb_1g_dispatch_kern()
  391. virtual void exec_with_ncb_kern(const NCBKernParam& param);
  392. //! default impl calls ncb_1g_get_workspace()
  393. virtual size_t get_workspace_with_ncb(const NCBKernSizeParam& param);
  394. //! default impl calls ncb_1g_get_all_algorithms()
  395. virtual std::vector<Algorithm*> get_all_algorithms_with_ncb(
  396. const NCBKernSizeParam& param);
  397. //! default impl calls ncb_1g_get_algorithm_heuristic()
  398. virtual Algorithm* get_algorithm_heuristic_with_ncb(
  399. const NCBKernSizeParam& param, size_t workspace_limit_in_bytes,
  400. const AlgoAttribute& positive_attr,
  401. const AlgoAttribute& negative_attr);
  402. //! get kernel pointer for float32 non-contiguous batch 1-group kernel
  403. virtual ncb_kern_t ncb_1g_dispatch_kern(Algorithm* algo,
  404. const NCBKernSizeParam& param);
  405. virtual size_t ncb_1g_get_workspace(Algorithm* algo,
  406. const NCBKernSizeParam& param);
  407. virtual std::vector<Algorithm*> ncb_1g_get_all_algorithms(
  408. const NCBKernSizeParam& param);
  409. /*!
  410. * the default impl iterates over all ncb_1g_get_all_algorithms()
  411. * and return the first one whose workspace does not exceed the limit.
  412. */
  413. virtual Algorithm* ncb_1g_get_algorithm_heuristic(
  414. const NCBKernSizeParam& param, size_t workspace_limit_in_bytes,
  415. const AlgoAttribute& positive_attr,
  416. const AlgoAttribute& negative_attr);
  417. static bool is_matrix_mul_preferred(const NCBKernSizeParam& param);
  418. /**
  419. * \brief get all the algorithm for the opr.
  420. */
  421. virtual SmallVector<AlgoBase*> get_all_packed_algo();
  422. private:
  423. NCBKernSizeParam m_prev_selected_algo_sizep;
  424. Algorithm* m_prev_selected_algo = nullptr;
  425. //! get algorithm set by user or by heuristic
  426. Algorithm* get_algorithm(const NCBKernSizeParam& param);
  427. NCBKernSizeParam make_ncb_kern_size_param(const TensorLayout& filter,
  428. const TensorLayout& diff,
  429. const TensorLayout& grad);
  430. NCBKernParam make_ncb_kern_param(_megdnn_tensor_in filter,
  431. _megdnn_tensor_in diff,
  432. _megdnn_tensor_out grad,
  433. _megdnn_workspace workspace);
  434. class AlgoNaive;
  435. class AlgoDirect;
  436. class AlgoMatrixMul;
  437. class AlgoPack;
  438. Algorithm* get_algorithm_from_desc(const AlgorithmDesc& desc) override;
  439. public:
  440. //! maintain all the algos of in the opr of fallback
  441. static const AlgoPack& algo_pack();
  442. };
  443. } // namespace fallback
  444. } // namespace megdnn
//! unpack NCBKernSizeParam into local variables (N, IC, IH, IW, ...)
//! legend: N batch; IC/OC input/output channels per group; IH/IW, OH/OW
//! input/output spatial sizes; FH/FW filter spatial; SH/SW strides;
//! PH/PW paddings (no comments inside the macro: a `//` on a continuation
//! line would swallow the trailing backslash)
#define UNPACK_CONV_F32_NCB_KERN_SIZES(_p)                                   \
    auto N = _p.n, IC = _p.filter_meta.icpg, IH = _p.isz[0], IW = _p.isz[1], \
         OC = _p.filter_meta.ocpg, OH = _p.osz[0], OW = _p.osz[1],           \
         FH = _p.filter_meta.spatial[0], FW = _p.filter_meta.spatial[1],     \
         SH = _p.filter_meta.stride[0], SW = _p.filter_meta.stride[1],       \
         PH = _p.filter_meta.padding[0], PW = _p.filter_meta.padding[1]
  452. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台