GitOrigin-RevId: 1c6c4a7a16
release-1.2
@@ -995,7 +995,7 @@ bool CpuCompNode::CpuDispatchableBase::EventImpl::do_finished() {
 }

 void CpuCompNode::CpuDispatchableBase::EventImpl::host_wait_cv() {
-    for (size_t i = 0, it = SCQueueSynchronizer::max_spin() / 20; i < it; ++i) {
+    for (size_t i = 0, it = SCQueueSynchronizer::get_default_max_spin() / 20; i < it; ++i) {
         if (finished()) {
             return;
         }
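A note on the divisor above: get_default_max_spin() is calibrated (see the async_worker hunks below) so that the full spin budget costs roughly 5ms, so host_wait_cv() spins for roughly 5 / 20 = 0.25ms before falling back to the condition-variable wait. The 0.25ms figure is simple arithmetic from the calibration heuristic, not a number stated in the patch.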
@@ -73,7 +73,7 @@ CompNodeSyncManager& CompNodeSyncManager::busy_wait_set_ready() {
             "before actually waiting on a tensor,"
             " you must call set_has_waiter first");
-    size_t spin = 0, max_spin = SCQueueSynchronizer::max_spin();
+    size_t spin = 0, max_spin = SCQueueSynchronizer::get_default_max_spin();
     while (!m_nr_ready.load()) {
         ++spin;
         if (spin >= max_spin) {
@@ -72,26 +72,28 @@ namespace {
 }

 /* =============== SCQueueSynchronizer =============== */
-size_t SCQueueSynchronizer::cached_max_spin = 0;
+size_t SCQueueSynchronizer::cached_default_max_spin = 0;

 #ifdef WIN32
 bool SCQueueSynchronizer::is_into_atexit = false;
 #endif

-size_t SCQueueSynchronizer::max_spin() {
-    if (cached_max_spin)
-        return cached_max_spin;
+size_t SCQueueSynchronizer::get_default_max_spin() {
+    if (cached_default_max_spin)
+        return cached_default_max_spin;
     if (MGB_GETENV("MGB_WORKER_NO_SLEEP")) {
         mgb_log_warn("worker would not sleep");
-        return cached_max_spin = std::numeric_limits<size_t>::max();
+        return cached_default_max_spin = std::numeric_limits<size_t>::max();
     }
     if (auto spin_string = MGB_GETENV("MGB_WORKER_MAX_SPIN")) {
         auto spin = std::stoi(spin_string);
         mgb_log_warn("worker would execute with spin of %d", spin);
-        return cached_max_spin = spin;
+        return cached_default_max_spin = spin;
     }
+    // heuristically, let the CPU spin for around 5ms at most before yielding;
+    // we measure how many spins take 5ms on the current platform.
     std::atomic_bool start{false}, stop{false};
     size_t cnt;
     double cnt_time;
@@ -115,11 +117,13 @@ size_t SCQueueSynchronizer::max_spin() {
     }
     stop.store(true);
     worker.join();
-    cached_max_spin = std::max<size_t>(cnt * (5 / cnt_time), 100000);
-    return cached_max_spin;
+    cached_default_max_spin = std::max<size_t>(cnt * (5 / cnt_time), 100000);
+    return cached_default_max_spin;
 }

-SCQueueSynchronizer::SCQueueSynchronizer() = default;
+SCQueueSynchronizer::SCQueueSynchronizer(size_t max_spin) {
+    m_max_spin = max_spin;
+}

 SCQueueSynchronizer::~SCQueueSynchronizer() noexcept {
     if (!m_worker_started)
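The cached_default_max_spin assignment above scales the measured spin rate: if cnt spins took cnt_time milliseconds, a 5ms budget corresponds to cnt * 5 / cnt_time spins, floored at 100000. A minimal self-contained sketch of this calibration heuristic (illustrative only; the variable names and the 10ms measurement window are assumptions, not the patch's exact code):

#include <algorithm>
#include <atomic>
#include <chrono>
#include <cstdio>
#include <thread>

int main() {
    std::atomic_bool stop{false};
    size_t cnt = 0;
    double cnt_time = 0;  // milliseconds
    std::thread worker([&] {
        auto t0 = std::chrono::steady_clock::now();
        while (!stop.load(std::memory_order_relaxed))
            ++cnt;  // one "spin"
        cnt_time = std::chrono::duration<double, std::milli>(
                           std::chrono::steady_clock::now() - t0)
                           .count();
    });
    // let the worker spin for a while before stopping it
    std::this_thread::sleep_for(std::chrono::milliseconds(10));
    stop.store(true);
    worker.join();
    // cnt spins took cnt_time ms, so ~5ms of spinning is cnt * 5 / cnt_time
    // iterations; 100000 is the lower bound used by the patch
    size_t max_spin = std::max<size_t>(cnt * (5 / cnt_time), 100000);
    std::printf("calibrated max_spin: %zu\n", max_spin);
}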
@@ -203,13 +207,13 @@ void SCQueueSynchronizer::producer_wait() {
 size_t SCQueueSynchronizer::consumer_fetch(size_t max, size_t min) {
     mgb_assert(max >= min && min >= 1);
-    size_t spin = 0, max_spin = SCQueueSynchronizer::max_spin(),
+    size_t spin = 0,
           cur_finished = m_finished_task.load(std::memory_order_relaxed);
     // relaxed mem order suffices because acquire would be called for ret
     while (m_tot_task.load(std::memory_order_relaxed) < cur_finished + min) {
         ++ spin;
-        if (spin >= max_spin) {
+        if (spin >= m_max_spin) {
             while (m_consumer_waiting.test_and_set(std::memory_order_relaxed));
             SpinlockReleaser releaser(m_consumer_waiting);
@@ -46,15 +46,18 @@ namespace mgb { | |||
class SCQueueSynchronizer { | |||
public: | |||
static size_t max_spin() { | |||
return 0; | |||
} | |||
SCQueueSynchronizer(size_t max_spin) {} | |||
static size_t get_default_max_spin() { return 0; } | |||
}; | |||
// tasks would be dispatched inplace | |||
template<typename Param, class TaskImpl> | |||
class AsyncQueueSC: public NonCopyableObj { | |||
public: | |||
AsyncQueueSC() {} | |||
AsyncQueueSC(size_t max_spin) {} | |||
virtual ~AsyncQueueSC() = default; | |||
void add_task(const Param ¶m) { | |||
@@ -50,7 +50,11 @@ namespace mgb {
  * wrap around within a practical time, which would crash the system.
  */
 class SCQueueSynchronizer {
-    static size_t cached_max_spin;
+    //! cached value of the global default max spin, read and stored by get_default_max_spin()
+    static size_t cached_default_max_spin;
+
+    //! the synchronizer spins at most m_max_spin times before yielding the CPU
+    size_t m_max_spin;
     std::atomic_flag m_consumer_waiting = ATOMIC_FLAG_INIT;
     std::atomic_bool m_should_exit{false};
     bool m_worker_started = false, m_wait_finish_called = false;
@@ -65,7 +69,8 @@ namespace mgb {
     std::thread m_worker_thread;

 public:
-    SCQueueSynchronizer();
+    SCQueueSynchronizer(size_t max_spin);

     ~SCQueueSynchronizer() noexcept;

     bool worker_started() const {
@@ -79,7 +84,8 @@ namespace mgb { | |||
} | |||
#endif | |||
static size_t max_spin(); | |||
//! get global default max spin from env | |||
static size_t get_default_max_spin(); | |||
void start_worker(std::thread thread); | |||
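Given the consumer_fetch() loop shown earlier (spin is incremented before the spin >= m_max_spin test), constructing a synchronizer with max_spin == 0 makes the consumer block on the very first iteration instead of busy-waiting. A minimal sketch of the two extremes, assuming the patched header is on the include path (illustrative, not from the patch):

// zero budget: spin >= m_max_spin holds immediately, so no busy-wait at all
SCQueueSynchronizer no_spin(0);
// calibrated budget: spin for ~5ms before falling back to a blocking wait
SCQueueSynchronizer calibrated(SCQueueSynchronizer::get_default_max_spin());

The test hunks below use exactly the zero-spin form.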
@@ -150,6 +156,11 @@ namespace mgb {
     };

 public:
+    AsyncQueueSC() : m_synchronizer(SCQueueSynchronizer::get_default_max_spin()) {}
+
+    //! specify max spin manually; the caller must ensure the given value is
+    //! optimal, otherwise the value should be left adjustable by the user
+    //! (see the usage sketch after this hunk)
+    AsyncQueueSC(size_t max_spin) : m_synchronizer(max_spin) {}
 #ifdef WIN32
     bool check_is_into_atexit() {
         if (SCQueueSynchronizer::is_into_atexit) {
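A hypothetical usage sketch of the two constructors added above; the QueueImpl name, the header path, and the process_one_task interface are assumptions for illustration rather than facts stated in this diff:

#include "megbrain/utils/async_worker.h"  // assumed header location
#include <cstdio>

// assumed TaskImpl interface: AsyncQueueSC<Param, TaskImpl> invokes
// process_one_task on the worker thread for each queued Param
struct QueueImpl final : mgb::AsyncQueueSC<int, QueueImpl> {
    QueueImpl() = default;  // worker spins get_default_max_spin() times before sleeping
    explicit QueueImpl(size_t max_spin) : AsyncQueueSC(max_spin) {}
    void process_one_task(int& v) { std::printf("task %d\n", v); }
};

As the new doc comment says, hard-coding a max_spin is only appropriate when the value is known to be good for the target platform; otherwise it should stay user-adjustable (e.g. via MGB_WORKER_MAX_SPIN).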
@@ -43,7 +43,7 @@ namespace {
 template<int producer_sleep, int consumer_sleep>
 void test_scq_sync_multi_producer() {
     size_t nr_worker_call = 0;
-    SCQueueSynchronizer sync;
+    SCQueueSynchronizer sync(0);
     auto worker = [&]() {
         RNGxorshf rng{next_rand_seed()};
         while (auto nr = sync.consumer_fetch(1)) {
@@ -87,7 +87,7 @@ namespace {
 TEST(TestAsyncQueue, Synchronizer) {
     size_t nr_worker_call = 0;
-    SCQueueSynchronizer sync;
+    SCQueueSynchronizer sync(0);
     auto worker = [&]() {
         for (; ;) {
             auto nr = sync.consumer_fetch(1);
@@ -115,7 +115,7 @@ TEST(TestAsyncQueue, Synchronizer) {
 TEST(TestAsyncQueue, SynchronizerWaitOverhead) {
     {
         size_t nr_worker_call = 0;
-        SCQueueSynchronizer sync;
+        SCQueueSynchronizer sync(0);
         auto worker = [&]() {
             for (;;) {
                 auto nr = sync.consumer_fetch(1);
@@ -141,7 +141,7 @@ TEST(TestAsyncQueue, SynchronizerWaitOverhead) {
     double worker_time = 0, avg_await;
     {
         size_t nr_worker_call = 0;
-        SCQueueSynchronizer sync;
+        SCQueueSynchronizer sync(0);
         auto worker = [&]() {
             for (;;) {
                 auto nr = sync.consumer_fetch(1);
@@ -188,7 +188,7 @@ TEST(TestAsyncQueue, SynchronizerMultiProducer3) {
 }

 TEST(TestAsyncQueue, SynchronizerWaiterStarving) {
-    SCQueueSynchronizer sync;
+    SCQueueSynchronizer sync(0);
     std::atomic_size_t processed{0};
     auto worker = [&]() {
         while (sync.consumer_fetch(1)) {