GitOrigin-RevId: 1c6c4a7a16
release-1.2
@@ -995,7 +995,7 @@ bool CpuCompNode::CpuDispatchableBase::EventImpl::do_finished() {
 }

 void CpuCompNode::CpuDispatchableBase::EventImpl::host_wait_cv() {
-    for (size_t i = 0, it = SCQueueSynchronizer::max_spin() / 20; i < it; ++i) {
+    for (size_t i = 0, it = SCQueueSynchronizer::get_default_max_spin() / 20; i < it; ++i) {
         if (finished()) {
             return;
         }
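A note on the divisor above: get_default_max_spin() is calibrated (see the async_worker hunks below) so that the full spin budget costs roughly 5ms, so host_wait_cv() spins for roughly 5 / 20 = 0.25ms before falling back to the condition-variable wait. The 0.25ms figure is simple arithmetic from the calibration heuristic, not a number stated in the patch.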
@@ -73,7 +73,7 @@ CompNodeSyncManager& CompNodeSyncManager::busy_wait_set_ready() {
             "before actually waiting on a tensor,"
             " you must call set_has_waiter first");
-    size_t spin = 0, max_spin = SCQueueSynchronizer::max_spin();
+    size_t spin = 0, max_spin = SCQueueSynchronizer::get_default_max_spin();
     while (!m_nr_ready.load()) {
         ++spin;
         if (spin >= max_spin) {
@@ -72,26 +72,28 @@ namespace {
 }

 /* =============== SCQueueSynchronizer =============== */
-size_t SCQueueSynchronizer::cached_max_spin = 0;
+size_t SCQueueSynchronizer::cached_default_max_spin = 0;

 #ifdef WIN32
 bool SCQueueSynchronizer::is_into_atexit = false;
 #endif

-size_t SCQueueSynchronizer::max_spin() {
-    if (cached_max_spin)
-        return cached_max_spin;
+size_t SCQueueSynchronizer::get_default_max_spin() {
+    if (cached_default_max_spin)
+        return cached_default_max_spin;
     if (MGB_GETENV("MGB_WORKER_NO_SLEEP")) {
         mgb_log_warn("worker would not sleep");
-        return cached_max_spin = std::numeric_limits<size_t>::max();
+        return cached_default_max_spin = std::numeric_limits<size_t>::max();
     }
     if (auto spin_string = MGB_GETENV("MGB_WORKER_MAX_SPIN")) {
         auto spin = std::stoi(spin_string);
         mgb_log_warn("worker would execute with spin of %d", spin);
-        return cached_max_spin = spin;
+        return cached_default_max_spin = spin;
     }
+    // heuristically, let the CPU spin for around 5ms at most before yielding;
+    // we measure how many spins take 5ms on the current platform.
     std::atomic_bool start{false}, stop{false};
     size_t cnt;
     double cnt_time;
@@ -115,11 +117,13 @@ size_t SCQueueSynchronizer::max_spin() {
     }
     stop.store(true);
     worker.join();
-    cached_max_spin = std::max<size_t>(cnt * (5 / cnt_time), 100000);
-    return cached_max_spin;
+    cached_default_max_spin = std::max<size_t>(cnt * (5 / cnt_time), 100000);
+    return cached_default_max_spin;
 }

-SCQueueSynchronizer::SCQueueSynchronizer() = default;
+SCQueueSynchronizer::SCQueueSynchronizer(size_t max_spin) {
+    m_max_spin = max_spin;
+}

 SCQueueSynchronizer::~SCQueueSynchronizer() noexcept {
     if (!m_worker_started)
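The cached_default_max_spin assignment above scales the measured spin rate: if cnt spins took cnt_time milliseconds, a 5ms budget corresponds to cnt * 5 / cnt_time spins, floored at 100000. A minimal self-contained sketch of this calibration heuristic (illustrative only; the variable names and the 10ms measurement window are assumptions, not the patch's exact code):

#include <algorithm>
#include <atomic>
#include <chrono>
#include <cstdio>
#include <thread>

int main() {
    std::atomic_bool stop{false};
    size_t cnt = 0;
    double cnt_time = 0;  // milliseconds
    std::thread worker([&] {
        auto t0 = std::chrono::steady_clock::now();
        while (!stop.load(std::memory_order_relaxed))
            ++cnt;  // one "spin"
        cnt_time = std::chrono::duration<double, std::milli>(
                           std::chrono::steady_clock::now() - t0)
                           .count();
    });
    // let the worker spin for a while before stopping it
    std::this_thread::sleep_for(std::chrono::milliseconds(10));
    stop.store(true);
    worker.join();
    // cnt spins took cnt_time ms, so ~5ms of spinning is cnt * 5 / cnt_time
    // iterations; 100000 is the lower bound used by the patch
    size_t max_spin = std::max<size_t>(cnt * (5 / cnt_time), 100000);
    std::printf("calibrated max_spin: %zu\n", max_spin);
}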
@@ -203,13 +207,13 @@ void SCQueueSynchronizer::producer_wait() {
 size_t SCQueueSynchronizer::consumer_fetch(size_t max, size_t min) {
     mgb_assert(max >= min && min >= 1);
-    size_t spin = 0, max_spin = SCQueueSynchronizer::max_spin(),
+    size_t spin = 0,
           cur_finished = m_finished_task.load(std::memory_order_relaxed);
     // relaxed mem order suffices because acquire would be called for ret
     while (m_tot_task.load(std::memory_order_relaxed) < cur_finished + min) {
         ++ spin;
-        if (spin >= max_spin) {
+        if (spin >= m_max_spin) {
             while (m_consumer_waiting.test_and_set(std::memory_order_relaxed));
             SpinlockReleaser releaser(m_consumer_waiting);
@@ -46,15 +46,18 @@ namespace mgb { | |||
class SCQueueSynchronizer { | |||
public: | |||
static size_t max_spin() { | |||
return 0; | |||
} | |||
SCQueueSynchronizer(size_t max_spin) {} | |||
static size_t get_default_max_spin() { return 0; } | |||
}; | |||
// tasks would be dispatched inplace | |||
template<typename Param, class TaskImpl> | |||
class AsyncQueueSC: public NonCopyableObj { | |||
public: | |||
AsyncQueueSC() {} | |||
AsyncQueueSC(size_t max_spin) {} | |||
virtual ~AsyncQueueSC() = default; | |||
void add_task(const Param ¶m) { | |||
@@ -50,7 +50,11 @@ namespace mgb {
  * wrap around within a practical time, which would crash the system.
  */
 class SCQueueSynchronizer {
-    static size_t cached_max_spin;
+    //! cached value of the global default max spin, read and stored by get_default_max_spin()
+    static size_t cached_default_max_spin;
+
+    //! the synchronizer spins at most m_max_spin times before yielding the CPU
+    size_t m_max_spin;
     std::atomic_flag m_consumer_waiting = ATOMIC_FLAG_INIT;
     std::atomic_bool m_should_exit{false};
     bool m_worker_started = false, m_wait_finish_called = false;
@@ -65,7 +69,8 @@ namespace mgb {
     std::thread m_worker_thread;

 public:
-    SCQueueSynchronizer();
+    SCQueueSynchronizer(size_t max_spin);

     ~SCQueueSynchronizer() noexcept;

     bool worker_started() const {
@@ -79,7 +84,8 @@ namespace mgb { | |||
} | |||
#endif | |||
static size_t max_spin(); | |||
//! get global default max spin from env | |||
static size_t get_default_max_spin(); | |||
void start_worker(std::thread thread); | |||
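Given the consumer_fetch() loop shown earlier (spin is incremented before the spin >= m_max_spin test), constructing a synchronizer with max_spin == 0 makes the consumer block on the very first iteration instead of busy-waiting. A minimal sketch of the two extremes, assuming the patched header is on the include path (illustrative, not from the patch):

// zero budget: spin >= m_max_spin holds immediately, so no busy-wait at all
SCQueueSynchronizer no_spin(0);
// calibrated budget: spin for ~5ms before falling back to a blocking wait
SCQueueSynchronizer calibrated(SCQueueSynchronizer::get_default_max_spin());

The test hunks below use exactly the zero-spin form.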
@@ -150,6 +156,11 @@ namespace mgb {
     };

 public:
+    AsyncQueueSC() : m_synchronizer(SCQueueSynchronizer::get_default_max_spin()) {}
+
+    //! specify max spin manually; the caller must ensure the given value is
+    //! optimal, otherwise the value should be left adjustable by the user
+    //! (see the usage sketch after this hunk)
+    AsyncQueueSC(size_t max_spin) : m_synchronizer(max_spin) {}
 #ifdef WIN32
     bool check_is_into_atexit() {
         if (SCQueueSynchronizer::is_into_atexit) {
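A hypothetical usage sketch of the two constructors added above; the QueueImpl name, the header path, and the process_one_task interface are assumptions for illustration rather than facts stated in this diff:

#include "megbrain/utils/async_worker.h"  // assumed header location
#include <cstdio>

// assumed TaskImpl interface: AsyncQueueSC<Param, TaskImpl> invokes
// process_one_task on the worker thread for each queued Param
struct QueueImpl final : mgb::AsyncQueueSC<int, QueueImpl> {
    QueueImpl() = default;  // worker spins get_default_max_spin() times before sleeping
    explicit QueueImpl(size_t max_spin) : AsyncQueueSC(max_spin) {}
    void process_one_task(int& v) { std::printf("task %d\n", v); }
};

As the new doc comment says, hard-coding a max_spin is only appropriate when the value is known to be good for the target platform; otherwise it should stay user-adjustable (e.g. via MGB_WORKER_MAX_SPIN).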
@@ -43,7 +43,7 @@ namespace {
 template<int producer_sleep, int consumer_sleep>
 void test_scq_sync_multi_producer() {
     size_t nr_worker_call = 0;
-    SCQueueSynchronizer sync;
+    SCQueueSynchronizer sync(0);
     auto worker = [&]() {
         RNGxorshf rng{next_rand_seed()};
         while (auto nr = sync.consumer_fetch(1)) {
@@ -87,7 +87,7 @@ namespace {
 TEST(TestAsyncQueue, Synchronizer) {
     size_t nr_worker_call = 0;
-    SCQueueSynchronizer sync;
+    SCQueueSynchronizer sync(0);
     auto worker = [&]() {
         for (; ;) {
             auto nr = sync.consumer_fetch(1);
@@ -115,7 +115,7 @@ TEST(TestAsyncQueue, Synchronizer) {
 TEST(TestAsyncQueue, SynchronizerWaitOverhead) {
     {
         size_t nr_worker_call = 0;
-        SCQueueSynchronizer sync;
+        SCQueueSynchronizer sync(0);
         auto worker = [&]() {
             for (;;) {
                 auto nr = sync.consumer_fetch(1);
@@ -141,7 +141,7 @@ TEST(TestAsyncQueue, SynchronizerWaitOverhead) {
     double worker_time = 0, avg_await;
     {
         size_t nr_worker_call = 0;
-        SCQueueSynchronizer sync;
+        SCQueueSynchronizer sync(0);
         auto worker = [&]() {
             for (;;) {
                 auto nr = sync.consumer_fetch(1);
@@ -188,7 +188,7 @@ TEST(TestAsyncQueue, SynchronizerMultiProducer3) {
 }

 TEST(TestAsyncQueue, SynchronizerWaiterStarving) {
-    SCQueueSynchronizer sync;
+    SCQueueSynchronizer sync(0);
     std::atomic_size_t processed{0};
     auto worker = [&]() {
         while (sync.consumer_fetch(1)) {