@@ -50,17 +50,6 @@ void TensorRTProfiler::print_layer_times() { | |||||
printf("Total time: %4.3fms\n", total_time); | printf("Total time: %4.3fms\n", total_time); | ||||
} | } | ||||
std::shared_ptr<json::Value> TensorRTProfiler::to_json() { | |||||
using namespace json; | |||||
auto prof_arr = Array::make(); | |||||
for (auto&& rec : profile) { | |||||
auto&& item = Array::make(); | |||||
item->add(String::make(rec.first)); | |||||
item->add(Number::make(rec.second)); | |||||
prof_arr->add(item); | |||||
} | |||||
return prof_arr; | |||||
} | |||||
#endif // MGB_ENABLE_JSON | #endif // MGB_ENABLE_JSON | ||||
@@ -168,7 +157,7 @@ void TensorRTOpr::GpuAllocator::free(void* memory) { | |||||
void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr, | void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr, | ||||
CompNode comp_node_check, | CompNode comp_node_check, | ||||
nvinfer1::ICudaEngine* engine, | nvinfer1::ICudaEngine* engine, | ||||
size_t batch) { | |||||
size_t batch, bool use_trt_profiler) { | |||||
auto comp_node = opr->comp_node(); | auto comp_node = opr->comp_node(); | ||||
// ICudaEngine is bound to the currently active device | // ICudaEngine is bound to the currently active device | ||||
@@ -180,22 +169,11 @@ void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr, | |||||
comp_node_check.to_string().c_str(), | comp_node_check.to_string().c_str(), | ||||
comp_node.to_string().c_str()); | comp_node.to_string().c_str()); | ||||
} | } | ||||
#if MGB_ENABLE_JSON | |||||
auto pf_holder_pair = | |||||
opr->owner_graph() | |||||
->options() | |||||
.user_data.get_user_data<opr_profile::OprProfileHolder>(); | |||||
if (m_has_profiler && !pf_holder_pair.second) { | |||||
m_context.reset(); | |||||
m_has_profiler = false; | |||||
} | |||||
#endif | |||||
auto workspace_ptr = opr->output().back()->dev_tensor().raw_ptr(); | auto workspace_ptr = opr->output().back()->dev_tensor().raw_ptr(); | ||||
bool should_reinit_device_memory = | bool should_reinit_device_memory = | ||||
!m_context || m_device_workspace_memory_ptr != workspace_ptr; | !m_context || m_device_workspace_memory_ptr != workspace_ptr; | ||||
if (!m_context) { | if (!m_context) { | ||||
m_context = {engine->createExecutionContextWithoutDeviceMemory(), {}}; | m_context = {engine->createExecutionContextWithoutDeviceMemory(), {}}; | ||||
m_has_profiler = false; | |||||
} | } | ||||
m_trt_iobuf.resize(opr->input().size() + opr->output().size() - 1); | m_trt_iobuf.resize(opr->input().size() + opr->output().size() - 1); | ||||
bool is_trt_opr = false; | bool is_trt_opr = false; | ||||
@@ -235,11 +213,7 @@ void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr, | |||||
bool exec_success = false; | bool exec_success = false; | ||||
#if MGB_ENABLE_JSON | |||||
if (!pf_holder_pair.second) { | |||||
mgb_assert(!m_has_profiler, | |||||
"Invalid state of TensorRTRuntimeOpr: should not have " | |||||
"profiler."); | |||||
if (!use_trt_profiler) { | |||||
#if NV_TENSOR_RT_VERSION >= 6001 | #if NV_TENSOR_RT_VERSION >= 6001 | ||||
if (is_trt_opr) | if (is_trt_opr) | ||||
exec_success = m_context->enqueueV2(m_trt_iobuf.data(), | exec_success = m_context->enqueueV2(m_trt_iobuf.data(), | ||||
@@ -255,7 +229,6 @@ void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr, | |||||
} else { | } else { | ||||
TensorRTProfiler trt_profiler; | TensorRTProfiler trt_profiler; | ||||
m_context->setProfiler(&trt_profiler); | m_context->setProfiler(&trt_profiler); | ||||
m_has_profiler = true; | |||||
// TensorRT documentation stated that IExecutionContext->execute | // TensorRT documentation stated that IExecutionContext->execute | ||||
// "Synchronously execute inference on a batch", and it does not take a | // "Synchronously execute inference on a batch", and it does not take a | ||||
// cudaStream_t, we expect it do a device synchronize. But it seems like | // cudaStream_t, we expect it do a device synchronize. But it seems like | ||||
@@ -272,24 +245,9 @@ void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr, | |||||
exec_success = m_context->execute(batch, m_trt_iobuf.data()); | exec_success = m_context->execute(batch, m_trt_iobuf.data()); | ||||
#endif | #endif | ||||
mgb_assert(exec_success, "trt execution failed: opr=%s", opr->cname()); | mgb_assert(exec_success, "trt execution failed: opr=%s", opr->cname()); | ||||
pf_holder_pair.first[0]->id2object_map[opr] = trt_profiler.to_json(); | |||||
printf("TRT profile info of opr %s:\n", opr->name().c_str()); | printf("TRT profile info of opr %s:\n", opr->name().c_str()); | ||||
trt_profiler.print_layer_times(); | trt_profiler.print_layer_times(); | ||||
} | } | ||||
#else | |||||
#if NV_TENSOR_RT_VERSION >= 6001 | |||||
if (is_trt_opr) | |||||
exec_success = m_context->enqueueV2(m_trt_iobuf.data(), | |||||
env.cuda_env().stream, nullptr); | |||||
else | |||||
exec_success = m_context->enqueue(batch, m_trt_iobuf.data(), | |||||
env.cuda_env().stream, nullptr); | |||||
#else | |||||
exec_success = m_context->enqueue(batch, m_trt_iobuf.data(), | |||||
env.cuda_env().stream, nullptr); | |||||
#endif | |||||
mgb_assert(exec_success, "trt execution failed: opr=%s", opr->cname()); | |||||
#endif | |||||
} | } | ||||
/* ========================== TensorRTOpr ========================== */ | /* ========================== TensorRTOpr ========================== */ | ||||
@@ -50,11 +50,11 @@ class TensorRTManager { | |||||
std::vector<void*> m_trt_iobuf; | std::vector<void*> m_trt_iobuf; | ||||
TensorRTUniquePtr<nvinfer1::IExecutionContext> m_context; | TensorRTUniquePtr<nvinfer1::IExecutionContext> m_context; | ||||
void* m_device_workspace_memory_ptr; | void* m_device_workspace_memory_ptr; | ||||
bool m_has_profiler; | |||||
public: | public: | ||||
void exec(cg::SingleCNOperatorNodeBase* opr, CompNode comp_node_check, | void exec(cg::SingleCNOperatorNodeBase* opr, CompNode comp_node_check, | ||||
nvinfer1::ICudaEngine* engine, size_t batch = 1); | |||||
nvinfer1::ICudaEngine* engine, size_t batch = 1, | |||||
bool use_trt_profiler = false); | |||||
void clear_trt_context() { m_context.reset(); } | void clear_trt_context() { m_context.reset(); } | ||||
@@ -28,50 +28,6 @@ using namespace mgb; | |||||
using namespace nvinfer1; | using namespace nvinfer1; | ||||
using namespace opr; | using namespace opr; | ||||
//! Runs a TensorRTOpr under a GraphProfiler and checks that every per-layer
//! record stored under "opr_internal_pf" for that operator has a positive
//! time; then runs the graph once more after the profiler has been destroyed.
TEST(TestOprTensorRT, Profile) {
    REQUIRE_GPU(1);
    intl::ConcatConvTensorRTNetwork net;
    auto builder_and_network = net.create_trt_network(true);
    auto trt_out = TensorRTOpr::make(
            TensorRTOpr::to_shared_ptr_builder(builder_and_network.first),
            TensorRTOpr::to_shared_ptr_network(builder_and_network.second),
            intl::TensorRTGraphFeatureBits::NCHW_FLOAT, {},
            {net.x0, net.x1})[0];

    HostTensorND host_z1;
    HostTensorND host_z2;
    auto func = net.graph->compile({make_callback_copy(net.y, host_z1),
                                    make_callback_copy(trt_out, host_z2)});

    {
        // the profiler only lives for the duration of this scope
        mgb::GraphProfiler profiler(net.graph.get());
        func->execute();

        // dump the full profiling result to a file
        profiler.to_json()->writeto_fpath(
                output_file("TestOprTensorRT.Profile.FromProfiler.json"));

        // walk: root object -> "opr_internal_pf" -> array keyed by the
        // id_str() of the operator that owns trt_out
        json::Object root_obj =
                *static_cast<json::Object*>(profiler.to_json().get());
        json::Object internal_pf_obj =
                *static_cast<json::Object*>(root_obj["opr_internal_pf"].get());
        json::Array layer_records = *static_cast<json::Array*>(
                internal_pf_obj[trt_out.node()->owner_opr()->id_str()].get());

        for (auto record : layer_records.get_impl()) {
            // element 1 of each record is read as a Number and must be > 0
            json::Array record_fields =
                    *static_cast<json::Array*>(record.get());
            json::Number elapsed =
                    *static_cast<json::Number*>(record_fields[1].get());
            mgb_assert(elapsed.get_impl() > 0, "Error occured in json.");
        }

        MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4);
    }

    // Execute once more now that the profiler no longer exists.
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4);
}
TEST(TestOprTensorRT, Basic) { | TEST(TestOprTensorRT, Basic) { | ||||
REQUIRE_GPU(1); | REQUIRE_GPU(1); | ||||
intl::SimpleTensorRTNetwork net; | intl::SimpleTensorRTNetwork net; | ||||
@@ -10,7 +10,6 @@ | |||||
*/ | */ | ||||
#include "megbrain/comp_node_env.h" | #include "megbrain/comp_node_env.h" | ||||
#include "megbrain/plugin/profiler.h" | |||||
#include "megbrain/test/autocheck.h" | #include "megbrain/test/autocheck.h" | ||||
#include "megbrain/test/helper.h" | #include "megbrain/test/helper.h" | ||||
#include "megbrain/test/megdnn_helper.h" | #include "megbrain/test/megdnn_helper.h" | ||||
@@ -102,69 +101,6 @@ TEST(TestOprTensorRT, ConcatRuntimeBasic) { | |||||
MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4); | MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4); | ||||
} | } | ||||
//! Builds a TensorRT engine, writes its serialized form to an output file,
//! wraps the engine in a TensorRTRuntimeOpr and checks that, while a
//! GraphProfiler is alive, every per-layer record stored under
//! "opr_internal_pf" for that operator has a positive time. The graph is then
//! executed once more after the profiler has been destroyed.
TEST(TestOprTensorRT, RuntimeProfile) {
    REQUIRE_GPU(1);
    intl::ConcatConvTensorRTNetwork net;
    SymbolVar y2;
    {
        auto p = net.create_trt_network(false);
        TensorRTUniquePtr<INetworkDefinition> trt_net{p.second, {}};
        TensorRTUniquePtr<IBuilder> builder{p.first, {}};
        builder->setMaxBatchSize(5);
#if NV_TENSOR_RT_VERSION >= 6001
        TensorRTUniquePtr<IBuilderConfig> build_config{
                builder->createBuilderConfig()};
        auto cuda_engine =
                builder->buildEngineWithConfig(*trt_net, *build_config);
#else
        auto cuda_engine = builder->buildCudaEngine(*trt_net);
#endif
        TensorRTUniquePtr<IHostMemory> mem{cuda_engine->serialize(), {}};

        // Write the serialized engine to disk. fopen() may return null, so
        // check it before use instead of handing a null FILE* to fwrite().
        auto engine_path = output_file("trt_cuda_engine");
        FILE* fout = fopen(engine_path.c_str(), "wb");
        mgb_assert(fout != nullptr, "failed to open %s for writing",
                   engine_path.c_str());
        auto wr = fwrite(mem->data(), 1, mem->size(), fout);
        // Close before asserting on the byte count so that a short write
        // does not leak the file handle when the assertion fires.
        fclose(fout);
        mgb_assert(wr == mem->size());

        y2 = TensorRTRuntimeOpr::make(
                TensorRTRuntimeOpr::to_shared_ptr_engine(cuda_engine), {},
                {net.x0, net.x1})[0];
    }

    HostTensorND host_z1;
    HostTensorND host_z2;
    auto func = net.graph->compile({make_callback_copy(net.y, host_z1),
                                    make_callback_copy(y2, host_z2)});

    {
        // the profiler only lives for the duration of this scope
        mgb::GraphProfiler profiler(net.graph.get());
        func->execute();

        // dump the full profiling result to a file
        profiler.to_json()->writeto_fpath(output_file(
                "TestOprTensorRT.RuntimeProfile.FromProfiler.json"));

        // walk: root object -> "opr_internal_pf" -> array keyed by the
        // id_str() of the operator that owns y2
        auto prof_obj = *static_cast<json::Object*>(profiler.to_json().get());
        auto record_obj =
                *static_cast<json::Object*>(prof_obj["opr_internal_pf"].get());
        auto opr_prof_arr = *static_cast<json::Array*>(
                record_obj[y2.node()->owner_opr()->id_str()].get());

        for (auto item_arr : opr_prof_arr.get_impl()) {
            // element 1 of each record is read as a Number and must be > 0
            auto layer_info_arr = *static_cast<json::Array*>(item_arr.get());
            auto layer_time =
                    *static_cast<json::Number*>(layer_info_arr[1].get());
            mgb_assert(layer_time.get_impl() > 0, "Error occured in json.");
        }

        MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4);
    }

    // Execute once more now that the profiler no longer exists.
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4);
}
TEST(TestOprTensorRT, RuntimeChangeBatchSize) { | TEST(TestOprTensorRT, RuntimeChangeBatchSize) { | ||||
REQUIRE_GPU(1); | REQUIRE_GPU(1); | ||||
intl::SimpleTensorRTNetwork net; | intl::SimpleTensorRTNetwork net; | ||||