GitOrigin-RevId: d56f4ebf1f
tags/v0.4.0
@@ -836,9 +836,10 @@ void CpuCompNode::CpuDispatchableBase::EventImpl::do_device_wait_by(
 {
     auto type = cn_impl->env().property().type;
     mgb_throw_if(type != CompNode::DeviceType::CPU
+            && type != CompNode::DeviceType::CUDA
             ,
             MegBrainError,
-            "currently CPU can only wait for CPU"
+            "currently CPU can only wait for CPU, CUDA"
             );
 }
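Note: this check guards which comp nodes may device-wait on a CPU event; it now also accepts CUDA comp nodes. A rough usage sketch of the newly allowed combination (not part of the patch; it mirrors the CpuCudaD2DCopy test added below):

    auto cn_cpu = CompNode::load("cpu0");
    auto cn_gpu = CompNode::load("gpu0");
    auto event = cn_cpu.create_event();  // event owned by the CPU comp node
    event->record();
    cn_gpu.device_wait_event(*event);    // CUDA waiting on a CPU event; rejected before this change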
@@ -40,6 +40,16 @@ namespace {
         return std::max<size_t>(300 * 1024 * 1024, available / 20);
     }
 }
+using CudaHostFunc = megdnn::thin_function<void()>;
+void CUDART_CB cuda_host_func_caller(void* ud) {
+    mgb_assert(ud);
+    CudaHostFunc* func_ptr = reinterpret_cast<CudaHostFunc*>(ud);
+    MGB_TRY {
+        (*func_ptr)();
+    } MGB_FINALLY(
+        delete func_ptr;
+    );
+}
 } // anonymous namespace
 namespace mgb {
@@ -223,6 +233,18 @@ class CudaCompNode::CompNodeImpl final: public CompNode::Impl {
         Locator locator_logical() override {
             return m_locator_logical;
         }
+        void add_callback(CudaHostFunc&& cb) override {
+            activate();
+            CudaHostFunc* func_ptr = new CudaHostFunc(std::move(cb));
+            MGB_TRY {
+                MGB_CUDA_CHECK(cudaLaunchHostFunc(m_env.cuda_env().stream,
+                        cuda_host_func_caller, static_cast<void*>(func_ptr)));
+            } MGB_CATCH(..., {
+                delete func_ptr;
+                throw;
+            });
+        }
 };
 MGB_DYN_TYPE_OBJ_FINAL_IMPL(CudaCompNode::CompNodeImpl);
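add_callback enqueues a host function on the comp node's CUDA stream through cudaLaunchHostFunc; cuda_host_func_caller (added above) runs the heap-allocated closure and frees it afterwards, while the MGB_CATCH branch frees it if the launch itself fails. A minimal usage sketch (not part of the patch; it assumes add_callback is reachable through the public CompNode interface):

    auto cn_gpu = CompNode::load("gpu0");
    cn_gpu.add_callback([]() {
        // runs on a CUDA-managed host thread after all work previously
        // enqueued on this comp node's stream has finished
        mgb_log("stream reached the callback");
    });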
@@ -28,15 +28,32 @@ namespace {
 //! implement non-contiguous d2d copy
 void noncont_tensor_copy(
-        const DeviceTensorND &dest, const DeviceTensorND &src, bool, bool) {
-    auto &&src_env = CompNodeEnv::from_comp_node(src.comp_node());
+        const DeviceTensorND &dest, const DeviceTensorND &src,
+        bool contig_dest, bool contig_src) {
+    auto src_cn = src.comp_node();
     auto dst_cn = dest.comp_node();
-    auto relayout = opr::intl::get_megdnn_global_opr<megdnn::Relayout>(
-            dst_cn);
-    dst_cn.activate();
-    relayout->exec(
-            const_cast<DeviceTensorND&>(src).as_megdnn(),
-            dest.as_megdnn(), MegDNNHandle::get(src_env).handle());
+    if (src_cn.device_type() == dst_cn.device_type()) {
+        // perform relayout op for better performance when src and dst are
+        // placed on comp nodes with the same device type
+        auto &&src_env = CompNodeEnv::from_comp_node(src.comp_node());
+        auto relayout = opr::intl::get_megdnn_global_opr<megdnn::Relayout>(
+                dst_cn);
+        dst_cn.activate();
+        relayout->exec(
+                const_cast<DeviceTensorND&>(src).as_megdnn(),
+                dest.as_megdnn(), MegDNNHandle::get(src_env).handle());
+    } else {
+        if (contig_src) {
+            mgb_assert(!contig_dest);
+            DeviceTensorND tmp{dst_cn};
+            tmp.copy_from(src);
+            dest.copy_from_fixlayout(tmp);
+            return;
+        }
+        DeviceTensorND tmp;
+        tmp.copy_from(src);
+        dest.copy_from_fixlayout(tmp);
+    }
 }
 //! implement non-contiguous h2h copy
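When src and dest live on different device types the relayout kernel cannot be used, so the copy is staged through a contiguous temporary. A rough illustration of the case this fallback handles (not part of the patch; the shape, the stride and the dtype::Float32 choice are arbitrary, and the dispatch to this helper is assumed from the surrounding d2d copy path):

    auto cn_cpu = CompNode::load("cpu0"), cn_gpu = CompNode::load("gpu0");
    DeviceTensorND src{cn_cpu, TensorShape{64, 64}, dtype::Float32()};
    // strided view of every second row -> non-contiguous source
    auto strided = src.sub(Slice(0, 64, 2).apply(src.layout(), 0));
    DeviceTensorND dst{cn_gpu, strided.shape(), strided.dtype()};
    // cross-device and non-contiguous: goes through a contiguous temporary
    dst.copy_from_fixlayout(strided);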
@@ -346,7 +363,28 @@ template<> template<>
 void TensorStorage<DeviceTensorStorageTrait>::copy_from(
         const TensorStorage<DeviceTensorStorageTrait> &src, size_t size) const {
     mgb_assert(size <= this->size() && size <= src.size());
-    src.comp_node().peer_copy_to(m_comp_node, ptr(), src.ptr(), size);
+    if (src.comp_node().device_type() == CompNode::DeviceType::CPU &&
+            comp_node().device_type() == CompNode::DeviceType::CUDA) {
+        // the current thread (i.e. the cuda dispatcher thread) should wait for
+        // all operations on src's comp_node to finish; otherwise a race
+        // condition might occur between the worker thread of src's comp_node
+        // and the thread responsible for copying the pageable memory in \p src
+        // to a pinned buffer, see
+        // https://docs.nvidia.com/cuda/cuda-runtime-api/api-sync-behavior.html
+        //
+        // Note: it is highly recommended to copy tensors from cpu to cuda with
+        // asynchronous dispatching (see the graph option async_exec_level);
+        // otherwise the main thread might be blocked by the worker thread
+        // corresponding to src's comp_node, resulting in bad performance
+        //
+        // TODO: consider using cudaMallocHost or cudaHostRegister to pin the
+        // memory of the src tensor, so that no synchronization is required
+        // and the copy is more efficient
+        src.comp_node().sync();
+        comp_node().copy_to_device(ptr(), src.ptr(), size);
+    } else {
+        src.comp_node().peer_copy_to(m_comp_node, ptr(), src.ptr(), size);
+    }
 }
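The TODO above suggests pinning the host memory so the explicit sync becomes unnecessary. A minimal sketch of that idea using the plain CUDA runtime API (an assumption about how it could look, not code from this patch):

    #include <cuda_runtime.h>

    // copy a pageable host buffer to device memory by temporarily pinning it,
    // so cudaMemcpyAsync does not need an intermediate staging copy;
    // error handling omitted for brevity
    void copy_pinned_h2d(void* dev_dst, const void* host_src, size_t size,
                         cudaStream_t stream) {
        cudaHostRegister(const_cast<void*>(host_src), size, cudaHostRegisterDefault);
        cudaMemcpyAsync(dev_dst, host_src, size, cudaMemcpyHostToDevice, stream);
        cudaStreamSynchronize(stream);  // buffer must stay pinned until the copy completes
        cudaHostUnregister(const_cast<void*>(host_src));
    }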
@@ -1733,22 +1733,25 @@ TEST(TestGraph, UpdateStaticAllocPlan) {
 TEST(TestGraph, CPUGPUHybrid) {
     REQUIRE_GPU(1);
-    auto cn_cpu = CompNode::load("cpu:default"),
-         cn_gpu = CompNode::load("gpu0");
-    auto graph = ComputingGraph::make();
-    HostTensorGenerator<> gen;
-    auto host_x = gen({42});
-    auto x = opr::Host2DeviceCopy::make(*graph, host_x, {cn_cpu}),
-         y = x * 2,
-         z = opr::Copy::make(y, cn_gpu) + 1;
-    HostTensorND host_z;
-    auto func = graph->compile({make_callback_copy(z, host_z)});
-    func->execute();
-    for (size_t i = 0; i < 42; ++ i) {
-        MGB_ASSERT_FLOAT_EQ(host_x->ptr<float>()[i] * 2 + 1,
-                host_z.ptr<float>()[i]);
+    auto cn_gpu = CompNode::load("gpu0");
+    for (auto&& cn_cpu : {CompNode::load("cpu0"), CompNode::default_cpu()}) {
+        auto graph = ComputingGraph::make();
+        HostTensorGenerator<> gen;
+        constexpr size_t length = 23333;
+        auto host_x = gen({length});
+        graph->options().var_sanity_check_first_run = false;
+        auto x = opr::Host2DeviceCopy::make(*graph, host_x, {cn_cpu}),
+             y = opr::Sleep::make(x, 0.5) * 2,
+             z_gpu = opr::Copy::make(y, cn_gpu) + 1,
+             z = opr::Copy::make(z_gpu, cn_cpu) * 2;
+        HostTensorND host_z;
+        auto func = graph->compile({make_callback_copy(z, host_z)});
+        func->execute();
+        for (size_t i = 0; i < length; ++ i) {
+            MGB_ASSERT_FLOAT_EQ((host_x->ptr<float>()[i] * 2 + 1) * 2,
+                    host_z.ptr<float>()[i]);
+        }
     }
 }
 TEST(TestGraph, In2OutOpStreamPropagate) {
@@ -11,6 +11,7 @@
 #include "megbrain/test/helper.h"
 #include "megbrain/comp_node_env.h"
 #include "megbrain/tensor.h"
+#include "megbrain/opr/utility.h"
 #include "megbrain/utils/timer.h"
@@ -382,4 +383,39 @@ TEST(TestTensor, NegativeIndex) {
     run_negative_index_test<HostTensorND, DeviceTensorND>();
 }
+TEST(TestTensor, CpuCudaD2DCopy) {
+    REQUIRE_GPU(1);
+    auto cn_cpu = CompNode::load("cpu0"),
+         cn_gpu = CompNode::load("gpu0");
+    HostTensorGenerator<> gen;
+    constexpr size_t length = 233333;
+    auto a = gen({length});
+    for (auto config: {true, false}) {
+        DeviceTensorND dev_a{cn_cpu}, dev_b{cn_gpu, a->shape(), a->dtype()};
+        dev_a.copy_from(*a).sync();
+        if (!config) {
+            auto subspec = Slice(0, length, 3).apply(a->layout(), 0);
+            dev_a = dev_a.sub(subspec);
+            dev_b = dev_b.sub(subspec);
+        }
+        auto iadd = [ptr = dev_a.ptr<float>(), length = dev_a.shape()[0],
+                     stride = dev_a.layout().stride[0]]() {
+            for (size_t i = 0; i < length; ++ i) {
+                ptr[i * stride] += 1;
+            }
+        };
+        CompNodeEnv::from_comp_node(cn_cpu).cpu_env().dispatch(iadd);
+        auto event = cn_cpu.create_event();
+        event->record();
+        cn_gpu.device_wait_event(*event);
+        dev_b.copy_from_fixlayout(dev_a);
+        HostTensorND res;
+        res.copy_from(dev_b).sync();
+        MGB_ASSERT_TENSOR_EQ(HostTensorND::make_proxy(dev_a), res);
+    }
+}
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}