
fix(cambricon): fix cross cn copy for cambricon

GitOrigin-RevId: 21942a82a3
Branch: release-1.2
Author: Megvii Engine Team
Commit: a5609f3bbf

3 changed files with 118 additions and 8 deletions
  1. src/cambricon/test/cambricon_runtime_opr.cpp (+87 -0)
  2. src/core/impl/comp_node/cpu/comp_node.cpp (+24 -7)
  3. src/core/impl/tensor.cpp (+7 -1)
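
For context, a minimal sketch (not part of the commit) of the graph-level copy path being fixed, assembled from the CrossCNCopy test added below; `make_callback_copy` comes from megbrain/test/helper.h, `cross_cn_copy_sketch` is a hypothetical wrapper, and the input tensor is assumed to be prepared by the caller:

    // Hedged sketch: move a host tensor onto a Cambricon comp node via a
    // cross comp node copy, then read the result back through a callback.
    #include "megbrain/opr/io.h"
    #include "megbrain/test/helper.h"

    using namespace mgb;

    void cross_cn_copy_sketch(std::shared_ptr<HostTensorND> input,
                              HostTensorND& output) {
        auto cn_cpu = CompNode::load("cpu0");
        auto cn_mlu = CompNode::load("cambricon0");
        auto graph = ComputingGraph::make();
        auto host_x = opr::Host2DeviceCopy::make(*graph, input, {cn_cpu});
        // the cpu -> cambricon cross comp node copy this commit fixes
        auto x = opr::Copy::make(host_x, {cn_mlu});
        auto func = graph->compile({make_callback_copy(x, output)});
        func->execute();
    }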

src/cambricon/test/cambricon_runtime_opr.cpp (+87 -0)

@@ -11,6 +11,7 @@
 
 #include "megbrain/comp_node_env.h"
 #include "megbrain/opr/io.h"
+#include "megbrain/opr/basic_arith.h"
 #include "megbrain/plugin/profiler.h"
 #include "megbrain/serialization/serializer.h"
 #include "megbrain/test/helper.h"
@@ -557,6 +558,92 @@ TEST(TestCambriconRuntimeOpr, Profiling) {
     profiler.to_json_full(func.get())
             ->writeto_fpath(output_file("cambricon_runtime_opr_profile.json"));
 }
+
+TEST(TestCambriconRuntimeOpr, CrossCNCopy) {
+    REQUIRE_CAMBRICON_DEVICE(1);
+    auto cn = CompNode::load("cambricon0");
+    CnmlModelContext ctx{cn, true};
+
+    // prepare parameters for addpad and conv
+    size_t ni = 16, ci = 64, hi = 32, wi = 32;
+    size_t no = 16, co = 64, ho = 32, wo = 32;
+
+    // element counts of the input and output tensors
+    int conv_input_count = ni * hi * wi * ci;
+    int relu_output_count = no * ho * wo * co;
+
+    // prepare original cpu data
+    std::vector<float> conv_input_cpu_data(conv_input_count);
+    std::vector<float> relu_output_cpu_data(relu_output_count);
+
+    // prepare random input data for addpad
+    unsigned int seed = time(0);
+    for (int index = 0; index < conv_input_count; ++index) {
+        conv_input_cpu_data[index] = ((rand_r(&seed) % 100 / 100.0) - 0.5) / 2;
+    }
+
+    // convert cpu data to float16 before copying it to mlu memory
+    std::vector<int16_t> conv_input_cpu(conv_input_count);
+    std::vector<int16_t> relu_output_cpu(relu_output_count);
+    MGB_CNRT_CHECK(cnrtCastDataType(conv_input_cpu_data.data(), CNRT_FLOAT32,
+                                    conv_input_cpu.data(), CNRT_FLOAT16,
+                                    conv_input_count, nullptr));
+
+    auto mlu_deleter = [](void* p) { MGB_CNRT_CHECK(cnrtFree(p)); };
+    void* input_mlu_ptr;
+    void* output_mlu_ptr;
+
+    // malloc mlu memory for the fusion input and output
+    MGB_CNRT_CHECK(
+            cnrtMalloc(&input_mlu_ptr, conv_input_count * sizeof(int16_t)));
+    MGB_CNRT_CHECK(
+            cnrtMalloc(&output_mlu_ptr, relu_output_count * sizeof(int16_t)));
+    // memory copy cpu -> mlu
+    MGB_CNRT_CHECK(cnrtMemcpy(input_mlu_ptr, conv_input_cpu.data(),
+                              conv_input_count * sizeof(int16_t),
+                              CNRT_MEM_TRANS_DIR_HOST2DEV));
+    std::unique_ptr<void, decltype(mlu_deleter)> input_holder{input_mlu_ptr,
+                                                              mlu_deleter};
+    std::unique_ptr<void, decltype(mlu_deleter)> output_holder{output_mlu_ptr,
+                                                               mlu_deleter};
+
+    ctx.do_inference(&input_mlu_ptr, &output_mlu_ptr);
+
+    // copy the result back (mlu -> cpu), then cast float16 -> float32
+    MGB_CNRT_CHECK(cnrtMemcpy(relu_output_cpu.data(), output_mlu_ptr,
+                              relu_output_count * sizeof(int16_t),
+                              CNRT_MEM_TRANS_DIR_DEV2HOST));
+    MGB_CNRT_CHECK(cnrtCastDataType(relu_output_cpu.data(), CNRT_FLOAT16,
+                                    relu_output_cpu_data.data(), CNRT_FLOAT32,
+                                    relu_output_count, nullptr));
+    auto cn_cpu = CompNode::load("cpu0");
+    // cnml inference finished; now run the same model through a mgb graph
+    auto buf = ctx.get_serialized_model();
+    std::shared_ptr<HostTensorND> input = std::make_shared<HostTensorND>(
+            cn_cpu, TensorLayout{{ni, ci, hi, wi}, dtype::Float16()});
+    memcpy(reinterpret_cast<void*>(input->ptr<dt_float16>()),
+           conv_input_cpu.data(), conv_input_count * sizeof(int16_t));
+    auto graph = ComputingGraph::make();
+    auto host_x = opr::Host2DeviceCopy::make(*graph, input, {cn_cpu});
+    auto x = opr::Copy::make(host_x, {cn});
+    auto y = opr::CambriconRuntimeOpr::make(buf.data(), buf.size(), "subnet0",
+                                            {x}, true)[0];
+    HostTensorND output(CompNode::default_cpu(), {no, co, ho, wo},
+                        dtype::Float16());
+    auto func = graph->compile({make_callback_copy(y, output)});
+    func->execute();
+    HostTensorND out_cnml(cn_cpu, {no, co, ho, wo}, dtype::Float32()),
+            out_mgb(cn_cpu, {no, co, ho, wo}, dtype::Float32());
+    memcpy(out_cnml.ptr<float>(), relu_output_cpu_data.data(),
+           relu_output_count * sizeof(float));
+    MGB_CNRT_CHECK(
+            cnrtCastDataType(reinterpret_cast<void*>(output.ptr<dt_float16>()),
+                             CNRT_FLOAT16, out_mgb.ptr<float>(), CNRT_FLOAT32,
+                             relu_output_count, nullptr));
+    MGB_ASSERT_TENSOR_NEAR(out_cnml, out_mgb, 1e-4);
+}
+
 #endif
 
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

src/core/impl/comp_node/cpu/comp_node.cpp (+24 -7)

@@ -397,7 +397,16 @@ class CpuCompNode::CompNodeImpl final: public CpuDispatchableBase {
                       "Atlas comp_node used but "
                       "MGB_ATLAS not enabled");
 #endif
+        } else if (dest_impl->env().property().type == DeviceType::CAMBRICON) {
+#if MGB_CAMBRICON
+            dest_impl->copy_to_device(dest, src, size);
+            return;
+#else
+            mgb_throw(MegBrainError,
+                      "Cambricon comp_node used but "
+                      "MGB_CAMBRICON not enabled");
+#endif
         } else {
             mgb_assert(locator().device == Locator::DEVICE_CPU_DEFAULT,
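
Roughly what the new branch enables: a raw cpu -> cambricon buffer copy now forwards to the destination comp node's copy_to_device instead of falling through to the CPU-only assert. A hedged tensor-level sketch of triggering it (DeviceTensorND::copy_from and the cross comp node routing are assumptions drawn from MegBrain's public tensor API, not shown in this diff):

    // Hedged sketch: cpu -> cambricon device-to-device copy.
    auto cn_cpu = CompNode::load("cpu0");
    auto cn_mlu = CompNode::load("cambricon0");
    DeviceTensorND src{cn_cpu, TensorShape{16}, dtype::Float32()};
    DeviceTensorND dst{cn_mlu};
    dst.copy_from(src);  // cross comp node copy; the cpu side now
                         // dispatches to dest_impl->copy_to_device(...)
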
@@ -912,12 +921,13 @@ void CpuCompNode::CpuDispatchableBase::EventImpl::do_device_wait_by(
 
     {
         auto type = cn_impl->env().property().type;
-        mgb_throw_if(type != CompNode::DeviceType::CPU
-                             && type != CompNode::DeviceType::CUDA
-                             && type != CompNode::DeviceType::ATLAS
-                     ,
-                     MegBrainError,
-                     "currently CPU can only wait for CPU, CUDA, ATLAS"
+        mgb_throw_if(
+                type != CompNode::DeviceType::CPU &&
+                type != CompNode::DeviceType::CUDA &&
+                type != CompNode::DeviceType::ATLAS &&
+                type != CompNode::DeviceType::CAMBRICON,
+                MegBrainError,
+                "currently CPU can only wait for CPU, CUDA, ATLAS, CAMBRICON"
         );
     }

@@ -928,6 +938,13 @@ void CpuCompNode::CpuDispatchableBase::EventImpl::do_device_wait_by(
         mgb_throw(MegBrainError,
                   "Atlas comp_node used but MGB_ATLAS not enabled");
 #endif
+    } else if (cn_impl->env().property().type == CompNode::DeviceType::CAMBRICON) {
+#if MGB_CAMBRICON
+        return m_comp_node_impl->sync();
+#else
+        mgb_throw(MegBrainError,
+                  "Cambricon comp_node used but MGB_CAMBRICON not enabled");
+#endif
     }
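
Together, the relaxed check and the new branch let a Cambricon comp node device-wait on an event recorded on a cpu comp node (implemented above as a full sync of the cpu node). A hedged sketch of the call pattern, assuming the usual CompNode::create_event / device_wait_event API:

    // Hedged sketch: cambricon queue waits for work recorded on a cpu node.
    auto cn_cpu = CompNode::load("cpu0");
    auto cn_mlu = CompNode::load("cambricon0");
    auto event = cn_cpu.create_event();
    event->record();                   // mark pending cpu work
    cn_mlu.device_wait_event(*event);  // previously threw: "currently CPU
                                       // can only wait for CPU, CUDA, ATLAS"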

src/core/impl/tensor.cpp (+7 -1)

@@ -677,7 +677,13 @@ void mgb::dev_tensor_memset(const DeviceTensorND& tensor, int val) {
 #endif
             break;
 #endif
-        case CompNode::DeviceType::CPU: {
+#if MGB_CAMBRICON
+        case CompNode::DeviceType::CAMBRICON:
+            MGB_CNRT_CHECK(cnrtSyncQueue(env.cnrt_env().queue));
+            MGB_CNRT_CHECK(cnrtMemset(ptr, val, size));
+            break;
+#endif
+        case CompNode::DeviceType::CPU: {
             auto fill = [ptr, size, val]() { std::memset(ptr, val, size); };
             env.cpu_env().dispatch(fill);
         } break;

