Browse Source

refactor(mgb/utils): allow user set different spin time for ASyncSCQueue

GitOrigin-RevId: 1c6c4a7a16
release-1.2
Megvii Engine Team 4 years ago
parent
commit
62637fc46e
6 changed files with 42 additions and 24 deletions
  1. +1
    -1
      src/core/impl/comp_node/cpu/comp_node.cpp
  2. +1
    -1
      src/core/impl/utils/comp_node_sync_manager.cpp
  3. +15
    -11
      src/core/impl/utils/thread.cpp
  4. +6
    -3
      src/core/include/megbrain/utils/thread_impl_0.h
  5. +14
    -3
      src/core/include/megbrain/utils/thread_impl_1.h
  6. +5
    -5
      src/core/test/utils/thread.cpp

+ 1
- 1
src/core/impl/comp_node/cpu/comp_node.cpp View File

@@ -995,7 +995,7 @@ bool CpuCompNode::CpuDispatchableBase::EventImpl::do_finished() {
}

void CpuCompNode::CpuDispatchableBase::EventImpl::host_wait_cv() {
for (size_t i = 0, it = SCQueueSynchronizer::max_spin() / 20; i < it; ++i) {
for (size_t i = 0, it = SCQueueSynchronizer::get_default_max_spin() / 20; i < it; ++i) {
if (finished()) {
return;
}


+ 1
- 1
src/core/impl/utils/comp_node_sync_manager.cpp View File

@@ -73,7 +73,7 @@ CompNodeSyncManager& CompNodeSyncManager::busy_wait_set_ready() {
"before actually waiting on a tensor,"
" you must call set_has_waiter first");

size_t spin = 0, max_spin = SCQueueSynchronizer::max_spin();
size_t spin = 0, max_spin = SCQueueSynchronizer::get_default_max_spin();
while (!m_nr_ready.load()) {
++spin;
if (spin >= max_spin) {


+ 15
- 11
src/core/impl/utils/thread.cpp View File

@@ -72,26 +72,28 @@ namespace {
}

/* =============== SCQueueSynchronizer =============== */
size_t SCQueueSynchronizer::cached_max_spin = 0;
size_t SCQueueSynchronizer::cached_default_max_spin = 0;
#ifdef WIN32
bool SCQueueSynchronizer::is_into_atexit = false;
#endif

size_t SCQueueSynchronizer::max_spin() {
if (cached_max_spin)
return cached_max_spin;
size_t SCQueueSynchronizer::get_default_max_spin() {
if (cached_default_max_spin)
return cached_default_max_spin;

if (MGB_GETENV("MGB_WORKER_NO_SLEEP")) {
mgb_log_warn("worker would not sleep");
return cached_max_spin = std::numeric_limits<size_t>::max();
return cached_default_max_spin = std::numeric_limits<size_t>::max();
}

if (auto spin_string = MGB_GETENV("MGB_WORKER_MAX_SPIN")) {
auto spin = std::stoi(spin_string);
mgb_log_warn("worker would execute with spin of %d", spin);
return cached_max_spin = spin;
return cached_default_max_spin = spin;
}

// heuristically, let CPU spinning around 5ms at most before CPU yield.
// we are going to measure how many spins will spent 5ms on current platform.
std::atomic_bool start{false}, stop{false};
size_t cnt;
double cnt_time;
@@ -115,11 +117,13 @@ size_t SCQueueSynchronizer::max_spin() {
}
stop.store(true);
worker.join();
cached_max_spin = std::max<size_t>(cnt * (5 / cnt_time), 100000);
return cached_max_spin;
cached_default_max_spin = std::max<size_t>(cnt * (5 / cnt_time), 100000);
return cached_default_max_spin;
}

SCQueueSynchronizer::SCQueueSynchronizer() = default;
SCQueueSynchronizer::SCQueueSynchronizer(size_t max_spin) {
m_max_spin = max_spin;
}

SCQueueSynchronizer::~SCQueueSynchronizer() noexcept {
if (!m_worker_started)
@@ -203,13 +207,13 @@ void SCQueueSynchronizer::producer_wait() {

size_t SCQueueSynchronizer::consumer_fetch(size_t max, size_t min) {
mgb_assert(max >= min && min >= 1);
size_t spin = 0, max_spin = SCQueueSynchronizer::max_spin(),
size_t spin = 0,
cur_finished = m_finished_task.load(std::memory_order_relaxed);

// relaxed mem order suffices because acquire would be called for ret
while (m_tot_task.load(std::memory_order_relaxed) < cur_finished + min) {
++ spin;
if (spin >= max_spin) {
if (spin >= m_max_spin) {
while (m_consumer_waiting.test_and_set(std::memory_order_relaxed));
SpinlockReleaser releaser(m_consumer_waiting);



+ 6
- 3
src/core/include/megbrain/utils/thread_impl_0.h View File

@@ -46,15 +46,18 @@ namespace mgb {

class SCQueueSynchronizer {
public:
static size_t max_spin() {
return 0;
}
SCQueueSynchronizer(size_t max_spin) {}
static size_t get_default_max_spin() { return 0; }
};

// tasks would be dispatched inplace
template<typename Param, class TaskImpl>
class AsyncQueueSC: public NonCopyableObj {
public:
AsyncQueueSC() {}
AsyncQueueSC(size_t max_spin) {}

virtual ~AsyncQueueSC() = default;

void add_task(const Param &param) {


+ 14
- 3
src/core/include/megbrain/utils/thread_impl_1.h View File

@@ -50,7 +50,11 @@ namespace mgb {
* wrap around within a practical time, which would crash the system.
*/
class SCQueueSynchronizer {
static size_t cached_max_spin;
//! cached value for global default max spin, read and stored by get_default_max_spin
static size_t cached_default_max_spin;

//! synchronizer wait at most m_max_spin before CPU yield
size_t m_max_spin;
std::atomic_flag m_consumer_waiting = ATOMIC_FLAG_INIT;
std::atomic_bool m_should_exit{false};
bool m_worker_started = false, m_wait_finish_called = false;
@@ -65,7 +69,8 @@ namespace mgb {
std::thread m_worker_thread;

public:
SCQueueSynchronizer();
SCQueueSynchronizer(size_t max_spin);

~SCQueueSynchronizer() noexcept;

bool worker_started() const {
@@ -79,7 +84,8 @@ namespace mgb {
}
#endif

static size_t max_spin();
//! get global default max spin from env
static size_t get_default_max_spin();

void start_worker(std::thread thread);

@@ -150,6 +156,11 @@ namespace mgb {
};

public:
AsyncQueueSC() : m_synchronizer(SCQueueSynchronizer::get_default_max_spin()) {}

//! specify max spin manually, caller must ensure the given value is optimal,
//! otherwise caller should leave the value adjustable by user.
AsyncQueueSC(size_t max_spin) : m_synchronizer(max_spin) {}
#ifdef WIN32
bool check_is_into_atexit() {
if (SCQueueSynchronizer::is_into_atexit) {


+ 5
- 5
src/core/test/utils/thread.cpp View File

@@ -43,7 +43,7 @@ namespace {
template<int producer_sleep, int consumer_sleep>
void test_scq_sync_multi_producer() {
size_t nr_worker_call = 0;
SCQueueSynchronizer sync;
SCQueueSynchronizer sync(0);
auto worker = [&]() {
RNGxorshf rng{next_rand_seed()};
while (auto nr = sync.consumer_fetch(1)) {
@@ -87,7 +87,7 @@ namespace {

TEST(TestAsyncQueue, Synchronizer) {
size_t nr_worker_call = 0;
SCQueueSynchronizer sync;
SCQueueSynchronizer sync(0);
auto worker = [&]() {
for (; ;) {
auto nr = sync.consumer_fetch(1);
@@ -115,7 +115,7 @@ TEST(TestAsyncQueue, Synchronizer) {
TEST(TestAsyncQueue, SynchronizerWaitOverhead) {
{
size_t nr_worker_call = 0;
SCQueueSynchronizer sync;
SCQueueSynchronizer sync(0);
auto worker = [&]() {
for (;;) {
auto nr = sync.consumer_fetch(1);
@@ -141,7 +141,7 @@ TEST(TestAsyncQueue, SynchronizerWaitOverhead) {
double worker_time = 0, avg_await;
{
size_t nr_worker_call = 0;
SCQueueSynchronizer sync;
SCQueueSynchronizer sync(0);
auto worker = [&]() {
for (;;) {
auto nr = sync.consumer_fetch(1);
@@ -188,7 +188,7 @@ TEST(TestAsyncQueue, SynchronizerMultiProducer3) {
}

TEST(TestAsyncQueue, SynchronizerWaiterStarving) {
SCQueueSynchronizer sync;
SCQueueSynchronizer sync(0);
std::atomic_size_t processed{0};
auto worker = [&]() {
while (sync.consumer_fetch(1)) {


Loading…
Cancel
Save