@@ -51,6 +51,7 @@ void AtlasComputingContext::memcpy(void* dst, const void* src, | |||||
ACL_MEMCPY_HOST_TO_DEVICE)); | ACL_MEMCPY_HOST_TO_DEVICE)); | ||||
break; | break; | ||||
case megcoreMemcpyDeviceToDevice: | case megcoreMemcpyDeviceToDevice: | ||||
// async d2d is always faster than sync d2d because of SDMA | |||||
acl_check(aclrtMemcpyAsync(dst, size_in_bytes, src, size_in_bytes, | acl_check(aclrtMemcpyAsync(dst, size_in_bytes, src, size_in_bytes, | ||||
ACL_MEMCPY_DEVICE_TO_DEVICE, m_ctx.stream)); | ACL_MEMCPY_DEVICE_TO_DEVICE, m_ctx.stream)); | ||||
break; | break; | ||||
@@ -230,14 +230,10 @@ void AtlasCompNodeImpl::peer_copy_to(Impl* dest_impl, void* dest, | |||||
auto&& src_env = m_env.atlas_env(); | auto&& src_env = m_env.atlas_env(); | ||||
activate(); | activate(); | ||||
if (dst_env.device == src_env.device) { | if (dst_env.device == src_env.device) { | ||||
#if 1 | |||||
// async d2d use SDMA which is faster than sync ctrl cpu d2d | |||||
MGB_ATLAS_CHECK(aclrtMemcpyAsync(dest, size, src, size, | MGB_ATLAS_CHECK(aclrtMemcpyAsync(dest, size, src, size, | ||||
ACL_MEMCPY_DEVICE_TO_DEVICE, | ACL_MEMCPY_DEVICE_TO_DEVICE, | ||||
dst_env.stream)); | dst_env.stream)); | ||||
#else | |||||
MGB_ATLAS_CHECK(aclrtMemcpy(dest, size, src, size, | |||||
ACL_MEMCPY_DEVICE_TO_DEVICE)); | |||||
#endif | |||||
} else { | } else { | ||||
mgb_throw(MegBrainError, | mgb_throw(MegBrainError, | ||||
"Atlas does not support peer copy between differents " | "Atlas does not support peer copy between differents " | ||||
@@ -361,7 +361,6 @@ void AtlasRuntimeOpr::scn_do_execute() { | |||||
i, output(i)->cname()); | i, output(i)->cname()); | ||||
aclmdlAddDatasetBuffer(model_outputs, output_db); | aclmdlAddDatasetBuffer(model_outputs, output_db); | ||||
} | } | ||||
MGB_ATLAS_CHECK(aclmdlExecute(m_model_id, model_inputs, model_outputs)); | MGB_ATLAS_CHECK(aclmdlExecute(m_model_id, model_inputs, model_outputs)); | ||||
for (size_t i = 0; i < nr_inputs; ++i) { | for (size_t i = 0; i < nr_inputs; ++i) { | ||||