You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401
#pragma once

#include "megdnn/basic_types.h"
#include "megdnn/handle.h"
#include "src/common/utils.h"

#include <gtest/gtest.h>

#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

#if MEGDNN_ENABLE_MULTI_THREADS
#include <atomic>
#include <thread>
#endif
  13. #define megcore_check(x) \
  14. do { \
  15. auto status = (x); \
  16. if (status != megcoreSuccess) { \
  17. std::cerr << "megcore_check error: " << megcoreGetErrorName(status) \
  18. << std::endl; \
  19. megdnn_trap(); \
  20. } \
  21. } while (0)
  22. namespace megdnn {
  23. namespace test {
  24. struct TaskExecutorConfig {
  25. //! Number of threads.
  26. size_t nr_thread;
  27. //! The core id to bind. The size of affinity_core_set should be equal to
  28. //! nr_thread.
  29. std::vector<size_t> affinity_core_set;
  30. };
  31. class CpuDispatchChecker final : MegcoreCPUDispatcher {
  32. class TaskExecutor {
  33. using Task = megcore::CPUDispatcher::Task;
  34. using MultiThreadingTask = megcore::CPUDispatcher::MultiThreadingTask;
  35. #if MEGDNN_ENABLE_MULTI_THREADS
  36. #if defined(WIN32)
  37. using thread_affinity_type = DWORD;
  38. #else // not WIN32
  39. #if defined(__APPLE__)
  40. using thread_affinity_type = int;
  41. #else
  42. using thread_affinity_type = cpu_set_t;
  43. #endif
  44. #endif
  45. #endif
  46. public:
  47. TaskExecutor(TaskExecutorConfig* config = nullptr);
  48. ~TaskExecutor();
  49. /*!
  50. * Sync all workers.
  51. */
  52. void sync();
  53. /*!
  54. * Number of threads in this thread pool, including the main thread.
  55. */
  56. size_t nr_threads() const { return m_nr_threads; }
  57. void add_task(const MultiThreadingTask& task, size_t parallelism);
  58. void add_task(const Task& task);
  59. private:
  60. #if MEGDNN_ENABLE_MULTI_THREADS
  61. size_t m_all_task_iter = 0;
  62. std::atomic_int m_current_task_iter{0};
  63. //! Indicate whether the thread should work, used for main thread sync
  64. std::vector<std::atomic_bool*> m_workers_flag;
  65. //! Whether the main thread affinity has been set.
  66. bool m_main_thread_affinity = false;
  67. //! Stop the worker threads.
  68. bool m_stop{false};
  69. MultiThreadingTask m_task;
  70. //! The cpuids to be bound.
  71. //! If the m_cpu_ids is empty, then none of the threads will be bound to
  72. //! cpus, else the size of m_cpu_ids should equal to m_nr_threads.
  73. std::vector<size_t> m_cpu_ids;
  74. //! The previous affinity mask of the main thread.
  75. thread_affinity_type m_main_thread_prev_affinity_mask;
  76. std::vector<std::thread> m_workers;
  77. #endif
  78. //! Total number of threads, including main thread.
  79. size_t m_nr_threads = 1;
  80. };
  81. //! track number of CpuDispatchChecker instances to avoid leaking
  82. class InstCounter {
  83. bool m_used = false;
  84. int m_cnt = 0, m_max_cnt = 0;
  85. public:
  86. ~InstCounter() {
  87. auto check = [this]() {
  88. ASSERT_NE(0, m_max_cnt) << "no kernel dispatched on CPU";
  89. ASSERT_EQ(0, m_cnt) << "leaked CpuDispatchChecker object";
  90. };
  91. if (m_used) {
  92. check();
  93. }
  94. }
  95. int& cnt() {
  96. m_used = true;
  97. m_max_cnt = std::max(m_cnt, m_max_cnt);
  98. return m_cnt;
  99. }
  100. };
  101. static InstCounter sm_inst_counter;
  102. bool m_recursive_dispatch = false;
  103. #if MEGDNN_ENABLE_MULTI_THREADS
  104. std::atomic_size_t m_nr_call{0};
  105. #else
  106. size_t m_nr_call = 0;
  107. #endif
  108. std::unique_ptr<TaskExecutor> m_task_executor;
  109. CpuDispatchChecker(TaskExecutorConfig* config) {
  110. ++sm_inst_counter.cnt();
  111. megdnn_assert(sm_inst_counter.cnt() < 10);
  112. m_task_executor = std::make_unique<TaskExecutor>(config);
  113. }
  114. void dispatch(Task&& task) override {
  115. megdnn_assert(!m_recursive_dispatch);
  116. m_recursive_dispatch = true;
  117. ++m_nr_call;
  118. m_task_executor->add_task(std::move(task));
  119. m_recursive_dispatch = false;
  120. }
  121. void dispatch(MultiThreadingTask&& task, size_t parallelism) override {
  122. megdnn_assert(!m_recursive_dispatch);
  123. m_recursive_dispatch = true;
  124. ++m_nr_call;
  125. m_task_executor->add_task(std::move(task), parallelism);
  126. m_recursive_dispatch = false;
  127. }
  128. size_t nr_threads() override { return m_task_executor->nr_threads(); }
  129. CpuDispatchChecker() {
  130. ++sm_inst_counter.cnt();
  131. megdnn_assert(sm_inst_counter.cnt() < 10);
  132. }
  133. void sync() override {}
  134. public:
  135. ~CpuDispatchChecker() {
  136. if (!std::uncaught_exception()) {
  137. megdnn_assert(!m_recursive_dispatch);
  138. } else {
  139. if (m_recursive_dispatch) {
  140. fprintf(stderr,
  141. "CpuDispatchChecker: "
  142. "detected recursive dispatch\n");
  143. }
  144. if (!m_nr_call) {
  145. fprintf(stderr, "CpuDispatchChecker: dispatch not called\n");
  146. }
  147. }
  148. --sm_inst_counter.cnt();
  149. }
  150. static std::unique_ptr<MegcoreCPUDispatcher> make(TaskExecutorConfig* config) {
  151. return std::unique_ptr<MegcoreCPUDispatcher>(new CpuDispatchChecker(config));
  152. }
  153. };
  154. std::unique_ptr<Handle> create_cpu_handle(
  155. int debug_level, bool check_dispatch = true,
  156. TaskExecutorConfig* config = nullptr);
  157. std::unique_ptr<Handle> create_cpu_handle_with_dispatcher(
  158. int debug_level, const std::shared_ptr<MegcoreCPUDispatcher>& dispatcher);
  159. static inline dt_float32 diff(dt_float32 x, dt_float32 y) {
  160. auto numerator = x - y;
  161. auto denominator = std::max(std::max(std::abs(x), std::abs(y)), 1.f);
  162. return numerator / denominator;
  163. }
  164. static inline int diff(int x, int y) {
  165. return x - y;
  166. }
  167. static inline int diff(dt_quint8 x, dt_quint8 y) {
  168. return x.as_uint8() - y.as_uint8();
  169. }
  170. static inline int diff(dt_qint32 x, dt_qint32 y) {
  171. return x.as_int32() - y.as_int32();
  172. }
  173. static inline int diff(dt_qint16 x, dt_qint16 y) {
  174. return x.as_int16() - y.as_int16();
  175. }
  176. static inline int diff(dt_qint8 x, dt_qint8 y) {
  177. return x.as_int8() - y.as_int8();
  178. }
  179. static inline int diff(dt_qint4 x, dt_qint4 y) {
  180. return x.as_int8() - y.as_int8();
  181. }
  182. static inline int diff(dt_qint1 x, dt_qint1 y) {
  183. return x.as_int8() - y.as_int8();
  184. }
  185. static inline int diff(dt_quint4 x, dt_quint4 y) {
  186. return x.as_uint8() - y.as_uint8();
  187. }
  188. inline TensorShape cvt_src_or_dst_nchw2nhwc(const TensorShape& shape) {
  189. megdnn_assert(shape.ndim == 4);
  190. auto N = shape[0], C = shape[1], H = shape[2], W = shape[3];
  191. return TensorShape{N, H, W, C};
  192. }
  193. inline TensorShape cvt_src_or_dst_ncdhw2ndhwc(const TensorShape& shape) {
  194. megdnn_assert(shape.ndim == 5);
  195. auto N = shape[0], C = shape[1], D = shape[2], H = shape[3], W = shape[4];
  196. return TensorShape{N, D, H, W, C};
  197. }
  198. inline TensorShape cvt_filter_nchw2nhwc(const TensorShape& shape) {
  199. if (shape.ndim == 4) {
  200. auto OC = shape[0], IC = shape[1], FH = shape[2], FW = shape[3];
  201. return TensorShape{OC, FH, FW, IC};
  202. } else {
  203. megdnn_assert(shape.ndim == 5);
  204. auto G = shape[0], OC = shape[1], IC = shape[2], FH = shape[3], FW = shape[4];
  205. return TensorShape{G, OC, FH, FW, IC};
  206. }
  207. }
  208. inline TensorShape cvt_filter_ncdhw2ndhwc(const TensorShape& shape) {
  209. if (shape.ndim == 5) {
  210. auto OC = shape[0], IC = shape[1], FD = shape[2], FH = shape[3], FW = shape[4];
  211. return TensorShape{OC, FD, FH, FW, IC};
  212. } else {
  213. megdnn_assert(shape.ndim == 6);
  214. auto G = shape[0], OC = shape[1], IC = shape[2], FD = shape[3], FH = shape[4],
  215. FW = shape[5];
  216. return TensorShape{G, OC, FD, FH, FW, IC};
  217. }
  218. }
  219. void megdnn_sync(Handle* handle);
  220. void* megdnn_malloc(Handle* handle, size_t size_in_bytes);
  221. void megdnn_free(Handle* handle, void* ptr);
  222. void megdnn_memcpy_D2H(
  223. Handle* handle, void* dst, const void* src, size_t size_in_bytes);
  224. void megdnn_memcpy_H2D(
  225. Handle* handle, void* dst, const void* src, size_t size_in_bytes);
  226. void megdnn_memcpy_D2D(
  227. Handle* handle, void* dst, const void* src, size_t size_in_bytes);
  228. //! default implementation for DynOutMallocPolicy
  229. class DynOutMallocPolicyImpl final : public DynOutMallocPolicy {
  230. Handle* m_handle;
  231. public:
  232. DynOutMallocPolicyImpl(Handle* handle) : m_handle{handle} {}
  233. TensorND alloc_output(
  234. size_t id, DType dtype, const TensorShape& shape, void* user_data) override;
  235. void* alloc_workspace(size_t sz, void* user_data) override;
  236. void free_workspace(void* ptr, void* user_data) override;
  237. /*!
  238. * \brief make a shared_ptr which would release output memory when
  239. * deleted
  240. * \param out output tensor allocated by alloc_output()
  241. */
  242. std::shared_ptr<void> make_output_refholder(const TensorND& out);
  243. };
  244. //! replace ErrorHandler::on_megdnn_error
  245. class MegDNNError : public std::exception {
  246. std::string m_msg;
  247. public:
  248. MegDNNError(const std::string& msg) : m_msg{msg} {}
  249. const char* what() const noexcept { return m_msg.c_str(); }
  250. };
  251. class TensorReshapeError : public MegDNNError {
  252. public:
  253. using MegDNNError::MegDNNError;
  254. };
  255. size_t get_cpu_count();
  256. static inline bool good_float(float val) {
  257. return std::isfinite(val);
  258. }
  259. static inline bool good_float(int) {
  260. return true;
  261. }
  262. static inline bool good_float(dt_qint8) {
  263. return true;
  264. }
  265. static inline bool good_float(dt_qint16) {
  266. return true;
  267. }
  268. static inline bool good_float(dt_quint8) {
  269. return true;
  270. }
  271. static inline bool good_float(dt_qint32) {
  272. return true;
  273. }
  274. static inline bool good_float(dt_qint4) {
  275. return true;
  276. }
  277. static inline bool good_float(dt_qint1) {
  278. return true;
  279. }
  280. static inline bool good_float(dt_quint4) {
  281. return true;
  282. }
  283. // A hack for the (x+0) promote to int trick on dt_quint8.
  284. static inline int operator+(dt_quint8 lhs, int rhs) {
  285. megdnn_assert(rhs == 0, "unexpected rhs");
  286. return lhs.as_uint8();
  287. }
  288. static inline int operator+(dt_qint32 lhs, int rhs) {
  289. megdnn_assert(rhs == 0, "unexpected rhs");
  290. return lhs.as_int32();
  291. }
  292. static inline int operator+(dt_qint8 lhs, int rhs) {
  293. megdnn_assert(rhs == 0, "unexpected rhs");
  294. return int8_t(lhs);
  295. }
  296. static inline int operator+(dt_qint16 lhs, int rhs) {
  297. megdnn_assert(rhs == 0, "unexpected rhs");
  298. return lhs.as_int16();
  299. }
  300. static inline int operator+(dt_quint4 lhs, int rhs) {
  301. megdnn_assert(rhs == 0, "unexpected rhs");
  302. return lhs.as_uint8();
  303. }
  304. static inline int operator+(dt_qint4 lhs, int rhs) {
  305. megdnn_assert(rhs == 0, "unexpected rhs");
  306. return lhs.as_int8();
  307. }
  308. static inline int operator+(dt_qint1 lhs, int rhs) {
  309. megdnn_assert(rhs == 0, "unexpected rhs");
  310. return lhs.as_int8();
  311. }
  312. } // namespace test
  313. static inline bool operator==(const TensorLayout& a, const TensorLayout& b) {
  314. return a.eq_layout(b);
  315. }
  316. static inline std::ostream& operator<<(std::ostream& ostr, const TensorLayout& layout) {
  317. return ostr << layout.to_string();
  318. }
  319. //! change the image2d_pitch_alignment of naive handle in this scope
  320. class NaivePitchAlignmentScope {
  321. size_t m_orig_val, m_new_val;
  322. megdnn::Handle::HandleVendorType m_orig_vendor, m_new_vendor;
  323. public:
  324. NaivePitchAlignmentScope(size_t alignment, megdnn::Handle::HandleVendorType vendor);
  325. ~NaivePitchAlignmentScope();
  326. };
  327. } // namespace megdnn
  328. // vim: syntax=cpp.doxygen