GitOrigin-RevId: d56f4ebf1f
tags/v0.4.0
@@ -836,9 +836,10 @@ void CpuCompNode::CpuDispatchableBase::EventImpl::do_device_wait_by(
 {
     auto type = cn_impl->env().property().type;
     mgb_throw_if(type != CompNode::DeviceType::CPU
+                    && type != CompNode::DeviceType::CUDA
             ,
             MegBrainError,
-            "currently CPU can only wait for CPU"
+            "currently CPU can only wait for CPU, CUDA"
             );
 }
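The relaxed check above lets a CUDA comp node device-wait on an event recorded by a CPU comp node, which the new CpuCudaD2DCopy test at the bottom of this diff relies on. A minimal usage sketch built only from the CompNode calls that appear in that test; the header path and helper name are illustrative, not part of this change:

    #include "megbrain/comp_node.h"
    using namespace mgb;

    // hypothetical helper, for illustration only
    void cuda_waits_on_cpu_event() {
        auto cn_cpu = CompNode::load("cpu0");
        auto cn_gpu = CompNode::load("gpu0");
        auto event = cn_cpu.create_event();
        event->record();                   // completion point of work queued on cpu0
        cn_gpu.device_wait_event(*event);  // gpu0's queue now waits for that work
    }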
@@ -40,6 +40,16 @@ namespace {
             return std::max<size_t>(300 * 1024 * 1024, available / 20);
         }
     }
+    using CudaHostFunc = megdnn::thin_function<void()>;
+    void CUDART_CB cuda_host_func_caller(void* ud) {
+        mgb_assert(ud);
+        CudaHostFunc* func_ptr = reinterpret_cast<CudaHostFunc*>(ud);
+        MGB_TRY {
+            (*func_ptr)();
+        } MGB_FINALLY(
+            delete func_ptr;
+        );
+    }
 } // anonymous namespace
 namespace mgb {
@@ -223,6 +233,18 @@ class CudaCompNode::CompNodeImpl final: public CompNode::Impl {
         Locator locator_logical() override {
            return m_locator_logical;
         }
+        void add_callback(CudaHostFunc&& cb) override {
+            activate();
+            CudaHostFunc* func_ptr = new CudaHostFunc(std::move(cb));
+            MGB_TRY {
+                MGB_CUDA_CHECK(cudaLaunchHostFunc(m_env.cuda_env().stream,
+                        cuda_host_func_caller, static_cast<void*>(func_ptr)));
+            } MGB_CATCH(..., {
+                delete func_ptr;
+                throw;
+            });
+        }
 };
 MGB_DYN_TYPE_OBJ_FINAL_IMPL(CudaCompNode::CompNodeImpl);
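Taken together, the two CUDA comp node hunks above implement a fire-and-forget host callback: add_callback heap-allocates the functor, cudaLaunchHostFunc (available since CUDA 10.0) enqueues cuda_host_func_caller on the comp node's stream, and the trampoline deletes the functor after running it, while the enqueuing side deletes it if the launch itself throws. A standalone sketch of the same ownership pattern, assuming plain CUDA runtime calls and std::function in place of the MGB_* macros and megdnn::thin_function; all names here are made up:

    #include <cstdio>
    #include <functional>
    #include <cuda_runtime.h>

    using HostFunc = std::function<void()>;

    // runs on a CUDA driver thread once all prior work on the stream is done
    void CUDART_CB host_func_caller(void* ud) {
        auto* func = static_cast<HostFunc*>(ud);
        try {
            (*func)();   // host functions must not call CUDA APIs
        } catch (...) {
            // exceptions must not propagate into the CUDA runtime
        }
        delete func;     // trampoline owns the functor after a successful launch
    }

    void enqueue_callback(cudaStream_t stream, HostFunc cb) {
        auto* func = new HostFunc(std::move(cb));
        if (cudaLaunchHostFunc(stream, host_func_caller, func) != cudaSuccess) {
            delete func; // launch failed: ownership stays with the caller
        }
    }

    int main() {
        cudaStream_t stream;
        cudaStreamCreate(&stream);
        enqueue_callback(stream, [] { std::printf("stream drained\n"); });
        cudaStreamSynchronize(stream);
        cudaStreamDestroy(stream);
        return 0;
    }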
@@ -28,15 +28,32 @@ namespace {
     //! implement non-contiguous d2d copy
     void noncont_tensor_copy(
-            const DeviceTensorND &dest, const DeviceTensorND &src, bool, bool) {
-        auto &&src_env = CompNodeEnv::from_comp_node(src.comp_node());
+            const DeviceTensorND &dest, const DeviceTensorND &src,
+            bool contig_dest, bool contig_src) {
+        auto src_cn = src.comp_node();
         auto dst_cn = dest.comp_node();
-        auto relayout = opr::intl::get_megdnn_global_opr<megdnn::Relayout>(
-                dst_cn);
-        dst_cn.activate();
-        relayout->exec(
-                const_cast<DeviceTensorND&>(src).as_megdnn(),
-                dest.as_megdnn(), MegDNNHandle::get(src_env).handle());
+        if (src_cn.device_type() == dst_cn.device_type()) {
+            // perform relayout op for better performance when src and dst are
+            // placed on comp nodes with the same device type
+            auto &&src_env = CompNodeEnv::from_comp_node(src.comp_node());
+            auto relayout = opr::intl::get_megdnn_global_opr<megdnn::Relayout>(
+                    dst_cn);
+            dst_cn.activate();
+            relayout->exec(
+                    const_cast<DeviceTensorND&>(src).as_megdnn(),
+                    dest.as_megdnn(), MegDNNHandle::get(src_env).handle());
+        } else {
+            if (contig_src) {
+                mgb_assert(!contig_dest);
+                DeviceTensorND tmp{dst_cn};
+                tmp.copy_from(src);
+                dest.copy_from_fixlayout(tmp);
+                return;
+            }
+            DeviceTensorND tmp;
+            tmp.copy_from(src);
+            dest.copy_from_fixlayout(tmp);
+        }
     }
     //! implement non-contiguous h2h copy
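In the d2d hunk above, the fast Relayout path is kept only when src and dest sit on comp nodes of the same device type; across device types the copy is staged through a contiguous temporary, since copy_from relayouts into freshly allocated contiguous storage while copy_from_fixlayout writes through the destination's existing (possibly strided) layout. A schematic sketch of that fallback, assuming only the DeviceTensorND API used in the hunk; the helper name is hypothetical and this is not the exact code path:

    #include "megbrain/tensor.h"
    using namespace mgb;

    // Hypothetical helper: stage a cross-device-type copy of non-contiguous
    // tensors through a contiguous temporary. When only dest is strided
    // (contig_src == true) the temporary lives on dest's comp node so the
    // strided write stays local, mirroring the branch above.
    void staged_cross_device_copy(const DeviceTensorND& dest,
                                  const DeviceTensorND& src, bool contig_src) {
        DeviceTensorND tmp = contig_src ? DeviceTensorND{dest.comp_node()}
                                        : DeviceTensorND{};
        tmp.copy_from(src);             // contiguous gather, possibly cross-device
        dest.copy_from_fixlayout(tmp);  // layout-preserving write into dest
    }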
@@ -346,7 +363,28 @@ template<> template<>
 void TensorStorage<DeviceTensorStorageTrait>::copy_from(
         const TensorStorage<DeviceTensorStorageTrait> &src, size_t size) const {
     mgb_assert(size <= this->size() && size <= src.size());
-    src.comp_node().peer_copy_to(m_comp_node, ptr(), src.ptr(), size);
+    if (src.comp_node().device_type() == CompNode::DeviceType::CPU &&
+            comp_node().device_type() == CompNode::DeviceType::CUDA) {
+        // the current thread (i.e. the cuda dispatcher thread) should wait
+        // for all operations on src's comp_node to finish; otherwise a race
+        // condition might occur between the worker thread of src's comp_node
+        // and the thread responsible for copying the pageable memory in
+        // \p src to a pinned buffer; see
+        // https://docs.nvidia.com/cuda/cuda-runtime-api/api-sync-behavior.html
+        //
+        // Note: it is highly recommended to copy tensors from cpu to cuda
+        // with asynchronous dispatching (see the graph option
+        // async_exec_level); otherwise the main thread might be blocked by
+        // the worker thread of src's comp_node, resulting in bad performance
+        //
+        // TODO: consider using cudaMallocHost or cudaHostRegister to pin the
+        // memory of the src tensor, so the copy does not require
+        // synchronization and is more efficient
+        src.comp_node().sync();
+        comp_node().copy_to_device(ptr(), src.ptr(), size);
+    } else {
+        src.comp_node().peer_copy_to(m_comp_node, ptr(), src.ptr(), size);
+    }
 }
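A standalone sketch of the race that comment describes, using plain CUDA and std::async in place of MegBrain comp nodes (everything here is illustrative): the pageable host buffer is still being filled by a producer thread, so the copying thread must wait for the producer, the counterpart of src.comp_node().sync(), before issuing the host-to-device copy.

    #include <future>
    #include <vector>
    #include <cuda_runtime.h>

    int main() {
        constexpr size_t len = 1 << 20;
        std::vector<float> host_buf(len);   // pageable host memory

        // counterpart of the worker thread of src's comp_node
        auto producer = std::async(std::launch::async, [&] {
            for (size_t i = 0; i < len; ++i)
                host_buf[i] = float(i);
        });

        float* dev_ptr = nullptr;
        cudaMalloc(&dev_ptr, len * sizeof(float));

        // counterpart of src.comp_node().sync(): without this wait the memcpy
        // below may read host_buf while the producer is still writing it
        producer.wait();

        cudaMemcpy(dev_ptr, host_buf.data(), len * sizeof(float),
                   cudaMemcpyHostToDevice);
        cudaFree(dev_ptr);
        return 0;
    }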
@@ -1733,22 +1733,25 @@ TEST(TestGraph, UpdateStaticAllocPlan) {
 TEST(TestGraph, CPUGPUHybrid) {
     REQUIRE_GPU(1);
-    auto cn_cpu = CompNode::load("cpu:default"),
-         cn_gpu = CompNode::load("gpu0");
-    auto graph = ComputingGraph::make();
-    HostTensorGenerator<> gen;
-    auto host_x = gen({42});
-    auto x = opr::Host2DeviceCopy::make(*graph, host_x, {cn_cpu}),
-         y = x * 2,
-         z = opr::Copy::make(y, cn_gpu) + 1;
-    HostTensorND host_z;
-    auto func = graph->compile({make_callback_copy(z, host_z)});
-    func->execute();
-    for (size_t i = 0; i < 42; ++ i) {
-        MGB_ASSERT_FLOAT_EQ(host_x->ptr<float>()[i] * 2 + 1,
-                host_z.ptr<float>()[i]);
+    auto cn_gpu = CompNode::load("gpu0");
+    for (auto&& cn_cpu : {CompNode::load("cpu0"), CompNode::default_cpu()}) {
+        auto graph = ComputingGraph::make();
+        HostTensorGenerator<> gen;
+        constexpr size_t length = 23333;
+        auto host_x = gen({length});
+        graph->options().var_sanity_check_first_run = false;
+        auto x = opr::Host2DeviceCopy::make(*graph, host_x, {cn_cpu}),
+             y = opr::Sleep::make(x, 0.5) * 2,
+             z_gpu = opr::Copy::make(y, cn_gpu) + 1,
+             z = opr::Copy::make(z_gpu, cn_cpu) * 2;
+        HostTensorND host_z;
+        auto func = graph->compile({make_callback_copy(z, host_z)});
+        func->execute();
+        for (size_t i = 0; i < length; ++ i) {
+            MGB_ASSERT_FLOAT_EQ((host_x->ptr<float>()[i] * 2 + 1) * 2,
+                    host_z.ptr<float>()[i]);
+        }
     }
 }
 TEST(TestGraph, In2OutOpStreamPropagate) {
@@ -11,6 +11,7 @@
 #include "megbrain/test/helper.h"
+#include "megbrain/comp_node_env.h"
 #include "megbrain/tensor.h"
 #include "megbrain/opr/utility.h"
 #include "megbrain/utils/timer.h"
@@ -382,4 +383,39 @@
     run_negative_index_test<HostTensorND, DeviceTensorND>();
 }
+TEST(TestTensor, CpuCudaD2DCopy) {
+    REQUIRE_GPU(1);
+    auto cn_cpu = CompNode::load("cpu0"),
+         cn_gpu = CompNode::load("gpu0");
+    HostTensorGenerator<> gen;
+    constexpr size_t length = 233333;
+    auto a = gen({length});
+    for (auto config: {true, false}) {
+        DeviceTensorND dev_a{cn_cpu}, dev_b{cn_gpu, a->shape(), a->dtype()};
+        dev_a.copy_from(*a).sync();
+        if (!config) {
+            auto subspec = Slice(0, length, 3).apply(a->layout(), 0);
+            dev_a = dev_a.sub(subspec);
+            dev_b = dev_b.sub(subspec);
+        }
+        auto iadd = [ptr = dev_a.ptr<float>(), length = dev_a.shape()[0],
+                     stride = dev_a.layout().stride[0]]() {
+            for (size_t i = 0; i < length; ++ i) {
+                ptr[i * stride] += 1;
+            }
+        };
+        CompNodeEnv::from_comp_node(cn_cpu).cpu_env().dispatch(iadd);
+        auto event = cn_cpu.create_event();
+        event->record();
+        cn_gpu.device_wait_event(*event);
+        dev_b.copy_from_fixlayout(dev_a);
+        HostTensorND res;
+        res.copy_from(dev_b).sync();
+        MGB_ASSERT_TENSOR_EQ(HostTensorND::make_proxy(dev_a), res);
+    }
+}
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}