GitOrigin-RevId: 1c6c4a7a16
release-1.2
@@ -995,7 +995,7 @@ bool CpuCompNode::CpuDispatchableBase::EventImpl::do_finished() { | |||||
} | } | ||||
void CpuCompNode::CpuDispatchableBase::EventImpl::host_wait_cv() { | void CpuCompNode::CpuDispatchableBase::EventImpl::host_wait_cv() { | ||||
for (size_t i = 0, it = SCQueueSynchronizer::max_spin() / 20; i < it; ++i) { | |||||
for (size_t i = 0, it = SCQueueSynchronizer::get_default_max_spin() / 20; i < it; ++i) { | |||||
if (finished()) { | if (finished()) { | ||||
return; | return; | ||||
} | } | ||||
@@ -73,7 +73,7 @@ CompNodeSyncManager& CompNodeSyncManager::busy_wait_set_ready() { | |||||
"before actually waiting on a tensor," | "before actually waiting on a tensor," | ||||
" you must call set_has_waiter first"); | " you must call set_has_waiter first"); | ||||
size_t spin = 0, max_spin = SCQueueSynchronizer::max_spin(); | |||||
size_t spin = 0, max_spin = SCQueueSynchronizer::get_default_max_spin(); | |||||
while (!m_nr_ready.load()) { | while (!m_nr_ready.load()) { | ||||
++spin; | ++spin; | ||||
if (spin >= max_spin) { | if (spin >= max_spin) { | ||||
@@ -72,26 +72,28 @@ namespace { | |||||
} | } | ||||
/* =============== SCQueueSynchronizer =============== */ | /* =============== SCQueueSynchronizer =============== */ | ||||
size_t SCQueueSynchronizer::cached_max_spin = 0; | |||||
size_t SCQueueSynchronizer::cached_default_max_spin = 0; | |||||
#ifdef WIN32 | #ifdef WIN32 | ||||
bool SCQueueSynchronizer::is_into_atexit = false; | bool SCQueueSynchronizer::is_into_atexit = false; | ||||
#endif | #endif | ||||
size_t SCQueueSynchronizer::max_spin() { | |||||
if (cached_max_spin) | |||||
return cached_max_spin; | |||||
size_t SCQueueSynchronizer::get_default_max_spin() { | |||||
if (cached_default_max_spin) | |||||
return cached_default_max_spin; | |||||
if (MGB_GETENV("MGB_WORKER_NO_SLEEP")) { | if (MGB_GETENV("MGB_WORKER_NO_SLEEP")) { | ||||
mgb_log_warn("worker would not sleep"); | mgb_log_warn("worker would not sleep"); | ||||
return cached_max_spin = std::numeric_limits<size_t>::max(); | |||||
return cached_default_max_spin = std::numeric_limits<size_t>::max(); | |||||
} | } | ||||
if (auto spin_string = MGB_GETENV("MGB_WORKER_MAX_SPIN")) { | if (auto spin_string = MGB_GETENV("MGB_WORKER_MAX_SPIN")) { | ||||
auto spin = std::stoi(spin_string); | auto spin = std::stoi(spin_string); | ||||
mgb_log_warn("worker would execute with spin of %d", spin); | mgb_log_warn("worker would execute with spin of %d", spin); | ||||
return cached_max_spin = spin; | |||||
return cached_default_max_spin = spin; | |||||
} | } | ||||
// heuristically, let the CPU spin for around 5ms at most before yielding. | |||||
// we measure how many spins take 5ms on the current platform. | |||||
std::atomic_bool start{false}, stop{false}; | std::atomic_bool start{false}, stop{false}; | ||||
size_t cnt; | size_t cnt; | ||||
double cnt_time; | double cnt_time; | ||||
@@ -115,11 +117,13 @@ size_t SCQueueSynchronizer::max_spin() { | |||||
} | } | ||||
stop.store(true); | stop.store(true); | ||||
worker.join(); | worker.join(); | ||||
cached_max_spin = std::max<size_t>(cnt * (5 / cnt_time), 100000); | |||||
return cached_max_spin; | |||||
cached_default_max_spin = std::max<size_t>(cnt * (5 / cnt_time), 100000); | |||||
return cached_default_max_spin; | |||||
} | } | ||||
SCQueueSynchronizer::SCQueueSynchronizer() = default; | |||||
//! Construct a synchronizer with an explicit spin budget.
//! \param max_spin number of spins consumer_fetch() performs while waiting
//!     for tasks before it leaves the pure spin loop and takes its slow path
SCQueueSynchronizer::SCQueueSynchronizer(size_t max_spin)
        : m_max_spin(max_spin) {}
SCQueueSynchronizer::~SCQueueSynchronizer() noexcept { | SCQueueSynchronizer::~SCQueueSynchronizer() noexcept { | ||||
if (!m_worker_started) | if (!m_worker_started) | ||||
@@ -203,13 +207,13 @@ void SCQueueSynchronizer::producer_wait() { | |||||
size_t SCQueueSynchronizer::consumer_fetch(size_t max, size_t min) { | size_t SCQueueSynchronizer::consumer_fetch(size_t max, size_t min) { | ||||
mgb_assert(max >= min && min >= 1); | mgb_assert(max >= min && min >= 1); | ||||
size_t spin = 0, max_spin = SCQueueSynchronizer::max_spin(), | |||||
size_t spin = 0, | |||||
cur_finished = m_finished_task.load(std::memory_order_relaxed); | cur_finished = m_finished_task.load(std::memory_order_relaxed); | ||||
// relaxed mem order suffices because acquire would be called for ret | // relaxed mem order suffices because acquire would be called for ret | ||||
while (m_tot_task.load(std::memory_order_relaxed) < cur_finished + min) { | while (m_tot_task.load(std::memory_order_relaxed) < cur_finished + min) { | ||||
++ spin; | ++ spin; | ||||
if (spin >= max_spin) { | |||||
if (spin >= m_max_spin) { | |||||
while (m_consumer_waiting.test_and_set(std::memory_order_relaxed)); | while (m_consumer_waiting.test_and_set(std::memory_order_relaxed)); | ||||
SpinlockReleaser releaser(m_consumer_waiting); | SpinlockReleaser releaser(m_consumer_waiting); | ||||
@@ -46,15 +46,18 @@ namespace mgb { | |||||
class SCQueueSynchronizer { | class SCQueueSynchronizer { | ||||
public: | public: | ||||
static size_t max_spin() { | |||||
return 0; | |||||
} | |||||
SCQueueSynchronizer(size_t max_spin) {} | |||||
static size_t get_default_max_spin() { return 0; } | |||||
}; | }; | ||||
// tasks would be dispatched inplace | // tasks would be dispatched inplace | ||||
template<typename Param, class TaskImpl> | template<typename Param, class TaskImpl> | ||||
class AsyncQueueSC: public NonCopyableObj { | class AsyncQueueSC: public NonCopyableObj { | ||||
public: | public: | ||||
AsyncQueueSC() {} | |||||
AsyncQueueSC(size_t max_spin) {} | |||||
virtual ~AsyncQueueSC() = default; | virtual ~AsyncQueueSC() = default; | ||||
void add_task(const Param ¶m) { | void add_task(const Param ¶m) { | ||||
@@ -50,7 +50,11 @@ namespace mgb { | |||||
* wrap around within a practical time, which would crash the system. | * wrap around within a practical time, which would crash the system. | ||||
*/ | */ | ||||
class SCQueueSynchronizer { | class SCQueueSynchronizer { | ||||
static size_t cached_max_spin; | |||||
//! cached value for global default max spin, read and stored by get_default_max_spin | |||||
static size_t cached_default_max_spin; | |||||
//! the synchronizer spins at most m_max_spin times before yielding the CPU | |||||
size_t m_max_spin; | |||||
std::atomic_flag m_consumer_waiting = ATOMIC_FLAG_INIT; | std::atomic_flag m_consumer_waiting = ATOMIC_FLAG_INIT; | ||||
std::atomic_bool m_should_exit{false}; | std::atomic_bool m_should_exit{false}; | ||||
bool m_worker_started = false, m_wait_finish_called = false; | bool m_worker_started = false, m_wait_finish_called = false; | ||||
@@ -65,7 +69,8 @@ namespace mgb { | |||||
std::thread m_worker_thread; | std::thread m_worker_thread; | ||||
public: | public: | ||||
SCQueueSynchronizer(); | |||||
SCQueueSynchronizer(size_t max_spin); | |||||
~SCQueueSynchronizer() noexcept; | ~SCQueueSynchronizer() noexcept; | ||||
bool worker_started() const { | bool worker_started() const { | ||||
@@ -79,7 +84,8 @@ namespace mgb { | |||||
} | } | ||||
#endif | #endif | ||||
static size_t max_spin(); | |||||
//! get global default max spin: read from env if set, otherwise measured once and cached | |||||
static size_t get_default_max_spin(); | |||||
void start_worker(std::thread thread); | void start_worker(std::thread thread); | ||||
@@ -150,6 +156,11 @@ namespace mgb { | |||||
}; | }; | ||||
public: | public: | ||||
AsyncQueueSC() : m_synchronizer(SCQueueSynchronizer::get_default_max_spin()) {} | |||||
//! specify max spin manually, caller must ensure the given value is optimal, | |||||
//! otherwise caller should leave the value adjustable by user. | |||||
AsyncQueueSC(size_t max_spin) : m_synchronizer(max_spin) {} | |||||
#ifdef WIN32 | #ifdef WIN32 | ||||
bool check_is_into_atexit() { | bool check_is_into_atexit() { | ||||
if (SCQueueSynchronizer::is_into_atexit) { | if (SCQueueSynchronizer::is_into_atexit) { | ||||
@@ -43,7 +43,7 @@ namespace { | |||||
template<int producer_sleep, int consumer_sleep> | template<int producer_sleep, int consumer_sleep> | ||||
void test_scq_sync_multi_producer() { | void test_scq_sync_multi_producer() { | ||||
size_t nr_worker_call = 0; | size_t nr_worker_call = 0; | ||||
SCQueueSynchronizer sync; | |||||
SCQueueSynchronizer sync(0); | |||||
auto worker = [&]() { | auto worker = [&]() { | ||||
RNGxorshf rng{next_rand_seed()}; | RNGxorshf rng{next_rand_seed()}; | ||||
while (auto nr = sync.consumer_fetch(1)) { | while (auto nr = sync.consumer_fetch(1)) { | ||||
@@ -87,7 +87,7 @@ namespace { | |||||
TEST(TestAsyncQueue, Synchronizer) { | TEST(TestAsyncQueue, Synchronizer) { | ||||
size_t nr_worker_call = 0; | size_t nr_worker_call = 0; | ||||
SCQueueSynchronizer sync; | |||||
SCQueueSynchronizer sync(0); | |||||
auto worker = [&]() { | auto worker = [&]() { | ||||
for (; ;) { | for (; ;) { | ||||
auto nr = sync.consumer_fetch(1); | auto nr = sync.consumer_fetch(1); | ||||
@@ -115,7 +115,7 @@ TEST(TestAsyncQueue, Synchronizer) { | |||||
TEST(TestAsyncQueue, SynchronizerWaitOverhead) { | TEST(TestAsyncQueue, SynchronizerWaitOverhead) { | ||||
{ | { | ||||
size_t nr_worker_call = 0; | size_t nr_worker_call = 0; | ||||
SCQueueSynchronizer sync; | |||||
SCQueueSynchronizer sync(0); | |||||
auto worker = [&]() { | auto worker = [&]() { | ||||
for (;;) { | for (;;) { | ||||
auto nr = sync.consumer_fetch(1); | auto nr = sync.consumer_fetch(1); | ||||
@@ -141,7 +141,7 @@ TEST(TestAsyncQueue, SynchronizerWaitOverhead) { | |||||
double worker_time = 0, avg_await; | double worker_time = 0, avg_await; | ||||
{ | { | ||||
size_t nr_worker_call = 0; | size_t nr_worker_call = 0; | ||||
SCQueueSynchronizer sync; | |||||
SCQueueSynchronizer sync(0); | |||||
auto worker = [&]() { | auto worker = [&]() { | ||||
for (;;) { | for (;;) { | ||||
auto nr = sync.consumer_fetch(1); | auto nr = sync.consumer_fetch(1); | ||||
@@ -188,7 +188,7 @@ TEST(TestAsyncQueue, SynchronizerMultiProducer3) { | |||||
} | } | ||||
TEST(TestAsyncQueue, SynchronizerWaiterStarving) { | TEST(TestAsyncQueue, SynchronizerWaiterStarving) { | ||||
SCQueueSynchronizer sync; | |||||
SCQueueSynchronizer sync(0); | |||||
std::atomic_size_t processed{0}; | std::atomic_size_t processed{0}; | ||||
auto worker = [&]() { | auto worker = [&]() { | ||||
while (sync.consumer_fetch(1)) { | while (sync.consumer_fetch(1)) { | ||||