Browse Source

fix(mgb): fix TensorRT runtime opr profiling

GitOrigin-RevId: 3545aa53b2
tags/v1.3.1
Megvii Engine Team 4 years ago
parent
commit
bcbfbbd743
4 changed files with 4 additions and 154 deletions
  1. +2
    -44
      src/tensorrt/impl/tensorrt_opr.cpp
  2. +2
    -2
      src/tensorrt/include/megbrain/tensorrt/tensorrt_opr.h
  3. +0
    -44
      src/tensorrt/test/tensorrt.cpp
  4. +0
    -64
      src/tensorrt/test/tensorrt_runtime.cpp

+ 2
- 44
src/tensorrt/impl/tensorrt_opr.cpp View File

@@ -50,17 +50,6 @@ void TensorRTProfiler::print_layer_times() {
printf("Total time: %4.3fms\n", total_time);
}

// Serialize the recorded per-layer profiling data to a JSON array.
// Each record is emitted as a two-element array: [layer name, layer time].
// NOTE(review): `profile` is assumed to hold (name, time) pairs collected by
// the TensorRT IProfiler callback; the time unit looks like milliseconds
// (print_layer_times() formats with "ms") -- confirm against the recorder.
std::shared_ptr<json::Value> TensorRTProfiler::to_json() {
using namespace json;
auto prof_arr = Array::make();
for (auto&& rec : profile) {
auto&& item = Array::make();
item->add(String::make(rec.first));
item->add(Number::make(rec.second));
prof_arr->add(item);
}
return prof_arr;
}
#endif // MGB_ENABLE_JSON


@@ -168,7 +157,7 @@ void TensorRTOpr::GpuAllocator::free(void* memory) {
void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr,
CompNode comp_node_check,
nvinfer1::ICudaEngine* engine,
size_t batch) {
size_t batch, bool use_trt_profiler) {

auto comp_node = opr->comp_node();
// ICudaEngine is bound to the currently active device
@@ -180,22 +169,11 @@ void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr,
comp_node_check.to_string().c_str(),
comp_node.to_string().c_str());
}
#if MGB_ENABLE_JSON
auto pf_holder_pair =
opr->owner_graph()
->options()
.user_data.get_user_data<opr_profile::OprProfileHolder>();
if (m_has_profiler && !pf_holder_pair.second) {
m_context.reset();
m_has_profiler = false;
}
#endif
auto workspace_ptr = opr->output().back()->dev_tensor().raw_ptr();
bool should_reinit_device_memory =
!m_context || m_device_workspace_memory_ptr != workspace_ptr;
if (!m_context) {
m_context = {engine->createExecutionContextWithoutDeviceMemory(), {}};
m_has_profiler = false;
}
m_trt_iobuf.resize(opr->input().size() + opr->output().size() - 1);
bool is_trt_opr = false;
@@ -235,11 +213,7 @@ void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr,

bool exec_success = false;

#if MGB_ENABLE_JSON
if (!pf_holder_pair.second) {
mgb_assert(!m_has_profiler,
"Invalid state of TensorRTRuntimeOpr: should not have "
"profiler.");
if (!use_trt_profiler) {
#if NV_TENSOR_RT_VERSION >= 6001
if (is_trt_opr)
exec_success = m_context->enqueueV2(m_trt_iobuf.data(),
@@ -255,7 +229,6 @@ void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr,
} else {
TensorRTProfiler trt_profiler;
m_context->setProfiler(&trt_profiler);
m_has_profiler = true;
// TensorRT documentation stated that IExecutionContext->execute
// "Synchronously execute inference on a batch", and it does not take a
// cudaStream_t, we expect it do a device synchronize. But it seems like
@@ -272,24 +245,9 @@ void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr,
exec_success = m_context->execute(batch, m_trt_iobuf.data());
#endif
mgb_assert(exec_success, "trt execution failed: opr=%s", opr->cname());
pf_holder_pair.first[0]->id2object_map[opr] = trt_profiler.to_json();
printf("TRT profile info of opr %s:\n", opr->name().c_str());
trt_profiler.print_layer_times();
}
#else
#if NV_TENSOR_RT_VERSION >= 6001
if (is_trt_opr)
exec_success = m_context->enqueueV2(m_trt_iobuf.data(),
env.cuda_env().stream, nullptr);
else
exec_success = m_context->enqueue(batch, m_trt_iobuf.data(),
env.cuda_env().stream, nullptr);
#else
exec_success = m_context->enqueue(batch, m_trt_iobuf.data(),
env.cuda_env().stream, nullptr);
#endif
mgb_assert(exec_success, "trt execution failed: opr=%s", opr->cname());
#endif
}

/* ========================== TensorRTOpr ========================== */


+ 2
- 2
src/tensorrt/include/megbrain/tensorrt/tensorrt_opr.h View File

@@ -50,11 +50,11 @@ class TensorRTManager {
std::vector<void*> m_trt_iobuf;
TensorRTUniquePtr<nvinfer1::IExecutionContext> m_context;
void* m_device_workspace_memory_ptr;
bool m_has_profiler;

public:
void exec(cg::SingleCNOperatorNodeBase* opr, CompNode comp_node_check,
nvinfer1::ICudaEngine* engine, size_t batch = 1);
nvinfer1::ICudaEngine* engine, size_t batch = 1,
bool use_trt_profiler = false);

void clear_trt_context() { m_context.reset(); }



+ 0
- 44
src/tensorrt/test/tensorrt.cpp View File

@@ -28,50 +28,6 @@ using namespace mgb;
using namespace nvinfer1;
using namespace opr;

// Verifies that per-layer TensorRT profiling records for a TensorRTOpr are
// exported into GraphProfiler's JSON output ("opr_internal_pf" section), and
// that the graph still executes correctly after the profiler is destroyed.
TEST(TestOprTensorRT, Profile) {
REQUIRE_GPU(1);
intl::ConcatConvTensorRTNetwork net;

auto p = net.create_trt_network(true);

// Wrap the builder/network pair in a TensorRTOpr producing the same
// result as the reference graph output net.y.
auto y2 = TensorRTOpr::make(TensorRTOpr::to_shared_ptr_builder(p.first),
TensorRTOpr::to_shared_ptr_network(p.second),
intl::TensorRTGraphFeatureBits::NCHW_FLOAT, {},
{net.x0, net.x1})[0];

HostTensorND host_z1;
HostTensorND host_z2;
auto func = net.graph->compile({make_callback_copy(net.y, host_z1),
make_callback_copy(y2, host_z2)});
{
// Profiler scoped to this block: profiling must be active only while
// it exists.
mgb::GraphProfiler profiler(net.graph.get());

func->execute();

profiler.to_json()->writeto_fpath(
output_file("TestOprTensorRT.Profile.FromProfiler.json"));
auto prof_obj = *static_cast<json::Object*>(profiler.to_json().get());

// Drill into the profiler JSON: "opr_internal_pf" is expected to map
// opr id -> array of [layer name, layer time] entries.
auto record_obj =
*static_cast<json::Object*>(prof_obj["opr_internal_pf"].get());
auto opr_prof_arr = *static_cast<json::Array*>(
record_obj[y2.node()->owner_opr()->id_str()].get());
for (auto item_arr : opr_prof_arr.get_impl()) {
auto layer_info_arr = *static_cast<json::Array*>(item_arr.get());
auto layer_time =
*static_cast<json::Number*>(layer_info_arr[1].get());

// Every profiled layer must report a positive time.
mgb_assert(layer_time.get_impl() > 0, "Error occured in json.");
}

MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4);
}
// Run it again after profiler is not in existence.
func->execute();

MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4);
}

TEST(TestOprTensorRT, Basic) {
REQUIRE_GPU(1);
intl::SimpleTensorRTNetwork net;


+ 0
- 64
src/tensorrt/test/tensorrt_runtime.cpp View File

@@ -10,7 +10,6 @@
*/

#include "megbrain/comp_node_env.h"
#include "megbrain/plugin/profiler.h"
#include "megbrain/test/autocheck.h"
#include "megbrain/test/helper.h"
#include "megbrain/test/megdnn_helper.h"
@@ -102,69 +101,6 @@ TEST(TestOprTensorRT, ConcatRuntimeBasic) {
MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4);
}

// Same check as TestOprTensorRT.Profile, but for TensorRTRuntimeOpr: build a
// TensorRT engine, serialize it to disk, wrap it in a runtime opr, and verify
// per-layer profiling records appear in GraphProfiler's JSON output while a
// profiler exists -- and that execution still succeeds after it is gone.
TEST(TestOprTensorRT, RuntimeProfile) {
REQUIRE_GPU(1);
intl::ConcatConvTensorRTNetwork net;
SymbolVar y2;
{
auto p = net.create_trt_network(false);
TensorRTUniquePtr<INetworkDefinition> trt_net{p.second, {}};
TensorRTUniquePtr<IBuilder> builder{p.first, {}};
builder->setMaxBatchSize(5);
// TensorRT >= 6 replaced buildCudaEngine with the IBuilderConfig path.
#if NV_TENSOR_RT_VERSION >= 6001
TensorRTUniquePtr<IBuilderConfig> build_config{
builder->createBuilderConfig()};
auto cuda_engine =
builder->buildEngineWithConfig(*trt_net, *build_config);
#else
auto cuda_engine = builder->buildCudaEngine(*trt_net);
#endif
// Persist the serialized engine; checks the full write path too.
TensorRTUniquePtr<IHostMemory> mem{cuda_engine->serialize(), {}};

FILE* fout = fopen(output_file("trt_cuda_engine").c_str(), "wb");
auto wr = fwrite(mem->data(), 1, mem->size(), fout);
mgb_assert(wr == mem->size());
fclose(fout);

y2 = TensorRTRuntimeOpr::make(
TensorRTRuntimeOpr::to_shared_ptr_engine(cuda_engine), {},
{net.x0, net.x1})[0];
}

HostTensorND host_z1;
HostTensorND host_z2;
auto func = net.graph->compile({make_callback_copy(net.y, host_z1),
make_callback_copy(y2, host_z2)});

{
// Profiler scoped to this block: profiling must be active only while
// it exists.
mgb::GraphProfiler profiler(net.graph.get());

func->execute();

profiler.to_json()->writeto_fpath(output_file(
"TestOprTensorRT.RuntimeProfile.FromProfiler.json"));

auto prof_obj = *static_cast<json::Object*>(profiler.to_json().get());
// Drill into the profiler JSON: "opr_internal_pf" is expected to map
// opr id -> array of [layer name, layer time] entries.
auto record_obj =
*static_cast<json::Object*>(prof_obj["opr_internal_pf"].get());
auto opr_prof_arr = *static_cast<json::Array*>(
record_obj[y2.node()->owner_opr()->id_str()].get());
for (auto item_arr : opr_prof_arr.get_impl()) {
auto layer_info_arr = *static_cast<json::Array*>(item_arr.get());
auto layer_time =
*static_cast<json::Number*>(layer_info_arr[1].get());

// Every profiled layer must report a positive time.
mgb_assert(layer_time.get_impl() > 0, "Error occured in json.");
}

MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4);
}
// Run it again after profiler is not in existence.
func->execute();

MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4);
}

TEST(TestOprTensorRT, RuntimeChangeBatchSize) {
REQUIRE_GPU(1);
intl::SimpleTensorRTNetwork net;


Loading…
Cancel
Save