GitOrigin-RevId: 36787a08a5
tags/v1.0.0-rc1
@@ -63,13 +63,12 @@ class CpuCompNode::WorkerQueue final
#endif
        }
        sys::set_thread_name(m_locator.to_string());
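        //! activate the shared thread pool when this worker thread starts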
        if (m_thread_pool)
            m_thread_pool->active();
    }
    void on_sync_all_task_finish() override {
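        //! all queued tasks are done; put the pool workers back to sleep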
        if (m_thread_pool)
        if (m_thread_pool) {
            m_thread_pool->deactive();
        }
    }
public:
@@ -436,6 +435,8 @@ class CpuCompNode::CompNodeImpl final: public CpuDispatchableBase {
        }
    }
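    //! expose the pool so EventImpl can deactivate it after host-side waits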
    ThreadPool* get_thread_pool() const { return m_thread_pool.get(); }
    void* mgb_aligned_alloc(size_t size) {
        auto alignment = get_mem_addr_alignment();
#ifdef WIN32
@@ -546,6 +547,9 @@ class CpuCompNode::CompNodeImpl final: public CpuDispatchableBase {
        } else if (m_worker_queue) {
            m_worker_queue->wait_all_task_finish();
        }
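        //! after synchronization, also put the thread-pool workers to sleep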
        if (m_thread_pool) {
            m_thread_pool->deactive();
        }
    }
    void dispatch(Task &&task) override {
@@ -893,6 +897,11 @@ bool CpuCompNode::CpuDispatchableBase::EventImpl::do_finished() {
void CpuCompNode::CpuDispatchableBase::EventImpl::host_wait_cv() {
    for (size_t i = 0, it = SCQueueSynchronizer::max_spin() / 20; i < it; ++i) {
        if (finished()) {
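            //! the event finished during the spin wait: deactivate the comp
            //! node's thread pool before returning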
            auto thread_pool = static_cast<CpuCompNodeImpl*>(m_comp_node_impl)
                                       ->get_thread_pool();
            if (thread_pool) {
                thread_pool->deactive();
            }
            return;
        }
    }
@@ -906,6 +915,11 @@ void CpuCompNode::CpuDispatchableBase::EventImpl::host_wait_cv() {
        m_dev_wait_cv.wait(lock);
    }
    m_dev_wait_nr_waiter.fetch_sub(1, std::memory_order_release);
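    //! the blocking wait is over; deactivate the thread pool here as well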
    auto thread_pool =
            static_cast<CpuCompNodeImpl*>(m_comp_node_impl)->get_thread_pool();
    if (thread_pool) {
        thread_pool->deactive();
    }
}
CpuCompNode::CpuDispatchableBase::EventImpl::~EventImpl() noexcept {
@@ -74,6 +74,7 @@ void ThreadPool::add_task(const TaskElem& task_elem) {
    //! Make sure the main thread has been bound
    if (m_main_affinity_flag &&
        m_core_binding_function != nullptr) {
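        //! hold m_mutex_task so the binding state cannot race with a
        //! concurrent set_affinity() call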
        std::lock_guard<std::mutex> lock(m_mutex_task);
        m_core_binding_function(m_nr_threads - 1);
        m_main_affinity_flag = false;
    }
@@ -85,10 +86,10 @@ void ThreadPool::add_task(const TaskElem& task_elem) {
        }
        return;
    } else {
        std::lock_guard<std::mutex> lock(m_mutex_task);
        mgb_assert(m_task_iter.load(std::memory_order_acquire) <= 0,
                   "The init value of m_all_sub_task is not zero.");
        active();
        std::lock_guard<std::mutex> lock(m_mutex_task);
        //! Set the task number, task iter and task
        m_nr_parallelism = parallelism;
        m_task_iter.exchange(parallelism, std::memory_order_relaxed);
@@ -113,6 +114,7 @@ void ThreadPool::add_task(const TaskElem& task_elem) {
void ThreadPool::set_affinity(AffinityCallBack affinity_cb) {
    mgb_assert(affinity_cb, "The affinity callback must not be nullptr");
    std::lock_guard<std::mutex> lock(m_mutex_task);
    m_core_binding_function = affinity_cb;
    for (size_t i = 0; i < m_nr_threads - 1; i++) {
        m_workers[i]->affinity_flag = true;
@@ -147,10 +149,12 @@ void ThreadPool::active() {
    }
}
void ThreadPool::deactive() {
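    //! take m_mutex_task first so deactivation cannot interleave with an
    //! in-flight add_task() call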
    std::lock_guard<std::mutex> lock_task(m_mutex_task);
    std::unique_lock<std::mutex> lock(m_mutex);
    m_active = false;
}
ThreadPool::~ThreadPool() {
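    //! likewise, serialize destruction against concurrent add_task() calls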
    std::lock_guard<std::mutex> lock_task(m_mutex_task);
    {
        std::unique_lock<std::mutex> lock(m_mutex);
        m_stop = true;
@@ -80,7 +80,7 @@ public:
    ~ThreadPool();
private:
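    //! total thread count (worker threads plus the calling thread), fixed
    //! once the pool is constructed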
    size_t m_nr_threads = 0;
    const size_t m_nr_threads = 0;
    //! Indicate whether the main thread has been bound
    bool m_main_affinity_flag;
    //! The callback binding the threads to cores
@@ -12,6 +12,8 @@
#include "megbrain/comp_node.h"
#include "megbrain/system.h"
#include "megbrain/test/helper.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/utility.h"
#include <atomic>
#include <random>
@@ -59,6 +61,73 @@ TEST(TestThreadPool, BASIC) {
        ASSERT_EQ(dst1[i], truth[i]);
    }
}
TEST(TestGraph, ParallelRunMultithreadMode) {
    // check race conditions when graphs are executed on multiple threads
    std::atomic_size_t sync_counter{0};
    constexpr size_t NR_RUN = 50;
    size_t nr_worker = std::max<size_t>(4, sys::get_cpu_count() / 4);
    if (auto setting = MGB_GETENV("TestGraphParallelRun_nr_worker")) {
        nr_worker = std::stoul(setting);
    }
    mgb_log("use %zu workers", nr_worker);
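    // simple spin barrier: each worker bumps the shared counter and then
    // busy-waits until all nr_worker threads have reached the same step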
    auto sync_barrier = [&sync_counter, nr_worker](size_t& cnt) {
        ++sync_counter;
        ++cnt;
        while (sync_counter < cnt * nr_worker)
            ;
    };
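    // one round of work: build a small graph that applies y = y * 2 + 3 five
    // times to the input and compare it against the host-side reference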
    auto do_worker = [&sync_barrier](size_t& sync_cnt) {
        auto cn = CompNode::load("multithread2:0");
        HostTensorGenerator<> gen;
        auto host_x = gen({23}, cn);
        HostTensorND host_y, y_expect;
        y_expect.copy_from(*host_x);
        {
            auto py = y_expect.ptr<float>();
            for (int i = 0; i < 23; ++i) {
                for (int j = 0; j < 5; ++j) {
                    py[i] = py[i] * 2 + 3;
                }
            }
        }
        sync_barrier(sync_cnt);
        auto graph = ComputingGraph::make();
        auto x = opr::Host2DeviceCopy::make(*graph, host_x), y = x;
        for (int i = 0; i < 5; ++i) {
            y = y * 2 + 3;
        }
        sync_barrier(sync_cnt);
        auto func = graph->compile({make_callback_copy(y, host_y)});
        sync_barrier(sync_cnt);
        func->execute();
        MGB_ASSERT_TENSOR_EQ(y_expect, host_y);
        memset(host_y.raw_ptr(), -1, 23 * sizeof(float));
        sync_barrier(sync_cnt);
        func->execute();
        MGB_ASSERT_TENSOR_EQ(y_expect, host_y);
        func->wait();
    };
    auto worker = [&]() {
        size_t scnt = 0;
        for (size_t run_id = 0; run_id < NR_RUN; ++run_id) {
            do_worker(scnt);
        }
    };
    std::vector<std::thread> workers;
    for (size_t i = 0; i < nr_worker; ++i)
        workers.emplace_back(worker);
    for (auto&& i : workers)
        i.join();
}
#else
#pragma message "tests are disabled as thread is not enabled."
#endif // MGB_HAVE_THREAD