GitOrigin-RevId: 861e349eb4
tags/v1.0.0-rc1
@@ -38,7 +38,9 @@ option(MGE_CUDA_USE_STATIC "Enable MegEngine CUDA static linking." ON)
option(MGE_WITH_TRT "Build MegEngine with TensorRT." ON)
option(MGE_USE_SYSTEM_LIB "Build MegEngine with system libraries." OFF)
option(MGB_WITH_FLATBUFFERS "Build MegBrain with FlatBuffers serialization support." ON)
option(MGE_WITH_CAMBRICON "Build MegEngine with Cambricon support" OFF)
option(BUILD_SHARED_LIBS "Build shared libraries" ON)
option(MGE_WITH_ATLAS "Build MegEngine with Atlas support" OFF)
option(MGE_ENABLE_RTTI "Build with RTTI" ON)
option(MGE_ENABLE_LOGGING "Build with logging" ON)
option(MGE_DEBUG_UTIL "Enable debug utility" ON)
@@ -406,6 +408,51 @@ if(MGE_WITH_CUDA)
    set(MGE_CUDA_LIBS "${MGE_CUDA_LIBS}")
endif()
if(MGE_WITH_CAMBRICON)
    include_directories("$ENV{NEUWARE_HOME}/include")
    link_directories("$ENV{NEUWARE_HOME}/lib64")
    include(cmake/FindBANG/FindBANG.cmake)
    if(${MGE_MLU_ARCH} STREQUAL "MLU100")
        set(BANG_ARCH "100")
    elseif(${MGE_MLU_ARCH} STREQUAL "MLU1h8")
        set(BANG_ARCH "110")
    elseif(${MGE_MLU_ARCH} STREQUAL "MLU220")
        set(BANG_ARCH "220")
    elseif(${MGE_MLU_ARCH} STREQUAL "MLU270")
        set(BANG_ARCH "270")
    elseif(${MGE_MLU_ARCH} STREQUAL "MLU290")
        set(BANG_ARCH "290")
    elseif(${MGE_MLU_ARCH} STREQUAL "MLU200")
        set(BANG_ARCH "200")
    else()
        message(FATAL_ERROR "Unsupported MLU arch.")
    endif()
    set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} --bang-mlu-arch=${MGE_MLU_ARCH}")
    set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -std=c++11 -Werror")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__BANG_ARCH__=${BANG_ARCH}")
    if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
        set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -g -O0")
    elseif(${CMAKE_BUILD_TYPE} STREQUAL "Release")
        set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -O3")
    elseif(${CMAKE_BUILD_TYPE} STREQUAL "RelWithDebInfo")
        set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -g -O3")
    elseif(${CMAKE_BUILD_TYPE} STREQUAL "MinSizeRel")
        set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -Os")
    endif()
    include(cmake/cnrt.cmake)
    include(cmake/cndev.cmake)
    include(cmake/cnml.cmake)
    list(APPEND MGE_CAMBRICON_LIBS libcnrt libcndev libcnml)
    set(MGE_CAMBRICON_LIBS "${MGE_CAMBRICON_LIBS}")
endif()
if(MGE_WITH_ATLAS)
    include(cmake/aclrt.cmake)
    list(APPEND MGE_ATLAS_LIBS libascendcl)
    set(MGE_ATLAS_LIBS "${MGE_ATLAS_LIBS}")
    set(MGB_ATLAS ${MGE_WITH_ATLAS})
endif()
find_program(CCACHE_BIN ccache)
if(CCACHE_BIN)
@@ -494,6 +541,11 @@ set(MGB_CUDA ${MGE_WITH_CUDA})
set(MEGDNN_WITH_CUDA ${MGE_WITH_CUDA})
# CAMBRICON
set(MGB_CAMBRICON ${MGE_WITH_CAMBRICON})
set(MEGDNN_WITH_CAMBRICON ${MGE_WITH_CAMBRICON})
# Debug info
if(${CMAKE_BUILD_TYPE} STREQUAL "Debug" OR ${CMAKE_BUILD_TYPE} STREQUAL "RelWithDebInfo")
    set(MGB_ASSERT_LOC 1)
@@ -0,0 +1,35 @@
if(DEFINED ENV{LIBRARY_PATH})
    string(REPLACE ":" ";" SYSTEM_LIBRARY_PATHS $ENV{LIBRARY_PATH})
endif()

find_library(ACLRT_LIBRARY
    NAMES libascendcl.so
    PATHS $ENV{LD_LIBRARY_PATH} "$ENV{ACLRT_HOME}/lib64/stub" ${CMAKE_INSTALL_PREFIX}
    HINTS ${SYSTEM_LIBRARY_PATHS}
    PATH_SUFFIXES stub
    DOC "ACL library.")
if(ACLRT_LIBRARY STREQUAL "ACLRT_LIBRARY-NOTFOUND")
    message(FATAL_ERROR "Cannot find ACLRT library")
endif()

get_filename_component(__found_aclrt_root "${ACLRT_LIBRARY}/../../../" REALPATH)

find_path(ACLRT_INCLUDE_DIR
    NAMES acl/acl.h
    HINTS "$ENV{ACLRT_HOME}/include" ${__found_aclrt_root}
    PATH_SUFFIXES include
    DOC "Path to ACLRT include directory.")
if(ACLRT_INCLUDE_DIR STREQUAL "ACLRT_INCLUDE_DIR-NOTFOUND")
    message(FATAL_ERROR "Cannot find ACLRT include directory")
endif()

add_library(libascendcl SHARED IMPORTED)
set_target_properties(libascendcl PROPERTIES
    IMPORTED_LOCATION ${ACLRT_LIBRARY}
    INTERFACE_INCLUDE_DIRECTORIES ${ACLRT_INCLUDE_DIR})

message("-- Found ACLRT: ${__found_aclrt_root}")
@@ -0,0 +1,48 @@
if(DEFINED ENV{LIBRARY_PATH})
    string(REPLACE ":" ";" SYSTEM_LIBRARY_PATHS $ENV{LIBRARY_PATH})
endif()

find_library(CNDEV_LIBRARY
    NAMES libcndev.so
    PATHS $ENV{LD_LIBRARY_PATH} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX}
    HINTS ${SYSTEM_LIBRARY_PATHS}
    PATH_SUFFIXES lib lib64
    DOC "CNDEV library.")
if(CNDEV_LIBRARY STREQUAL "CNDEV_LIBRARY-NOTFOUND")
    message(FATAL_ERROR "Cannot find CNDEV library")
endif()

get_filename_component(__found_cndev_root "${CNDEV_LIBRARY}/../include" REALPATH)

find_path(CNDEV_INCLUDE_DIR
    NAMES cndev.h
    HINTS "$ENV{NEUWARE_HOME}/include" ${__found_cndev_root}
    PATH_SUFFIXES include
    DOC "Path to CNDEV include directory.")
if(CNDEV_INCLUDE_DIR STREQUAL "CNDEV_INCLUDE_DIR-NOTFOUND")
    message(FATAL_ERROR "Cannot find CNDEV include directory")
endif()

file(STRINGS "${CNDEV_INCLUDE_DIR}/cndev.h" CNDEV_1 REGEX "^#define CNDEV_VERSION_1 [0-9]+.*$")
file(STRINGS "${CNDEV_INCLUDE_DIR}/cndev.h" CNDEV_2 REGEX "^#define CNDEV_VERSION_2 [0-9]+.*$")
file(STRINGS "${CNDEV_INCLUDE_DIR}/cndev.h" CNDEV_3 REGEX "^#define CNDEV_VERSION_3 [0-9]+.*$")
file(STRINGS "${CNDEV_INCLUDE_DIR}/cndev.h" CNDEV_4 REGEX "^#define CNDEV_VERSION_4 [0-9]+.*$")
file(STRINGS "${CNDEV_INCLUDE_DIR}/cndev.h" CNDEV_5 REGEX "^#define CNDEV_VERSION_5 [0-9]+.*$")
string(REGEX REPLACE "^#define CNDEV_VERSION_1 ([0-9]+).*$" "\\1" CNDEV_VERSION_1 "${CNDEV_1}")
string(REGEX REPLACE "^#define CNDEV_VERSION_2 ([0-9]+).*$" "\\1" CNDEV_VERSION_2 "${CNDEV_2}")
string(REGEX REPLACE "^#define CNDEV_VERSION_3 ([0-9]+).*$" "\\1" CNDEV_VERSION_3 "${CNDEV_3}")
string(REGEX REPLACE "^#define CNDEV_VERSION_4 ([0-9]+).*$" "\\1" CNDEV_VERSION_4 "${CNDEV_4}")
string(REGEX REPLACE "^#define CNDEV_VERSION_5 ([0-9]+).*$" "\\1" CNDEV_VERSION_5 "${CNDEV_5}")
set(CNDEV_VERSION_STRING "${CNDEV_VERSION_1}.${CNDEV_VERSION_2}.${CNDEV_VERSION_3}.${CNDEV_VERSION_4}.${CNDEV_VERSION_5}")

add_library(libcndev SHARED IMPORTED)
set_target_properties(libcndev PROPERTIES
    IMPORTED_LOCATION ${CNDEV_LIBRARY}
    INTERFACE_INCLUDE_DIRECTORIES ${CNDEV_INCLUDE_DIR})

message("-- Found CNDEV: ${__found_cndev_root} (found version: ${CNDEV_VERSION_STRING})")
@@ -0,0 +1,44 @@
if(DEFINED ENV{LIBRARY_PATH})
    string(REPLACE ":" ";" SYSTEM_LIBRARY_PATHS $ENV{LIBRARY_PATH})
endif()

find_library(CNML_LIBRARY
    NAMES libcnml.so
    PATHS $ENV{LD_LIBRARY_PATH} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX}
    HINTS ${SYSTEM_LIBRARY_PATHS}
    PATH_SUFFIXES lib lib64
    DOC "CNML library.")
if(CNML_LIBRARY STREQUAL "CNML_LIBRARY-NOTFOUND")
    message(FATAL_ERROR "Cannot find CNML library")
endif()

get_filename_component(__found_cnml_root "${CNML_LIBRARY}/../include" REALPATH)

find_path(CNML_INCLUDE_DIR
    NAMES cnml.h
    HINTS "$ENV{NEUWARE_HOME}/include" ${__found_cnml_root}
    PATH_SUFFIXES include
    DOC "Path to CNML include directory.")
if(CNML_INCLUDE_DIR STREQUAL "CNML_INCLUDE_DIR-NOTFOUND")
    message(FATAL_ERROR "Cannot find CNML include directory")
endif()

file(STRINGS "${CNML_INCLUDE_DIR}/cnml.h" CNML_MAJOR REGEX "^#define CNML_MAJOR_VERSION [0-9]+.*$")
file(STRINGS "${CNML_INCLUDE_DIR}/cnml.h" CNML_MINOR REGEX "^#define CNML_MINOR_VERSION [0-9]+.*$")
file(STRINGS "${CNML_INCLUDE_DIR}/cnml.h" CNML_PATCH REGEX "^#define CNML_PATCH_VERSION [0-9]+.*$")
string(REGEX REPLACE "^#define CNML_MAJOR_VERSION ([0-9]+).*$" "\\1" CNML_VERSION_MAJOR "${CNML_MAJOR}")
string(REGEX REPLACE "^#define CNML_MINOR_VERSION ([0-9]+).*$" "\\1" CNML_VERSION_MINOR "${CNML_MINOR}")
string(REGEX REPLACE "^#define CNML_PATCH_VERSION ([0-9]+).*$" "\\1" CNML_VERSION_PATCH "${CNML_PATCH}")
set(CNML_VERSION_STRING "${CNML_VERSION_MAJOR}.${CNML_VERSION_MINOR}.${CNML_VERSION_PATCH}")

add_library(libcnml SHARED IMPORTED)
set_target_properties(libcnml PROPERTIES
    IMPORTED_LOCATION ${CNML_LIBRARY}
    INTERFACE_INCLUDE_DIRECTORIES ${CNML_INCLUDE_DIR})

message("-- Found CNML: ${__found_cnml_root} (found version: ${CNML_VERSION_STRING})")
@@ -0,0 +1,44 @@
if(DEFINED ENV{LIBRARY_PATH})
    string(REPLACE ":" ";" SYSTEM_LIBRARY_PATHS $ENV{LIBRARY_PATH})
endif()

find_library(CNRT_LIBRARY
    NAMES libcnrt.so
    PATHS $ENV{LD_LIBRARY_PATH} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX}
    HINTS ${SYSTEM_LIBRARY_PATHS}
    PATH_SUFFIXES lib lib64
    DOC "CNRT library.")
if(CNRT_LIBRARY STREQUAL "CNRT_LIBRARY-NOTFOUND")
    message(FATAL_ERROR "Cannot find CNRT library")
endif()

get_filename_component(__found_cnrt_root "${CNRT_LIBRARY}/../include" REALPATH)

find_path(CNRT_INCLUDE_DIR
    NAMES cnrt.h
    HINTS "$ENV{NEUWARE_HOME}/include" ${__found_cnrt_root}
    PATH_SUFFIXES include
    DOC "Path to CNRT include directory.")
if(CNRT_INCLUDE_DIR STREQUAL "CNRT_INCLUDE_DIR-NOTFOUND")
    message(FATAL_ERROR "Cannot find CNRT include directory")
endif()

file(STRINGS "${CNRT_INCLUDE_DIR}/cnrt.h" CNRT_MAJOR REGEX "^#define CNRT_MAJOR_VERSION [0-9]+.*$")
file(STRINGS "${CNRT_INCLUDE_DIR}/cnrt.h" CNRT_MINOR REGEX "^#define CNRT_MINOR_VERSION [0-9]+.*$")
file(STRINGS "${CNRT_INCLUDE_DIR}/cnrt.h" CNRT_PATCH REGEX "^#define CNRT_PATCH_VERSION [0-9]+.*$")
string(REGEX REPLACE "^#define CNRT_MAJOR_VERSION ([0-9]+).*$" "\\1" CNRT_VERSION_MAJOR "${CNRT_MAJOR}")
string(REGEX REPLACE "^#define CNRT_MINOR_VERSION ([0-9]+).*$" "\\1" CNRT_VERSION_MINOR "${CNRT_MINOR}")
string(REGEX REPLACE "^#define CNRT_PATCH_VERSION ([0-9]+).*$" "\\1" CNRT_VERSION_PATCH "${CNRT_PATCH}")
set(CNRT_VERSION_STRING "${CNRT_VERSION_MAJOR}.${CNRT_VERSION_MINOR}.${CNRT_VERSION_PATCH}")

add_library(libcnrt SHARED IMPORTED)
set_target_properties(libcnrt PROPERTIES
    IMPORTED_LOCATION ${CNRT_LIBRARY}
    INTERFACE_INCLUDE_DIRECTORIES ${CNRT_INCLUDE_DIR})

message("-- Found CNRT: ${__found_cnrt_root} (found version: ${CNRT_VERSION_STRING})")
@@ -0,0 +1,64 @@
/**
 * \file dnn/include/megcore_atlas.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#pragma once

#include "megcore.h"

#include <acl/acl.h>

#include "megdnn/internal/visibility_prologue.h"

namespace megcore {
megcoreStatus_t createAtlasDeviceHandleWithGlobalInitStatus(
        megcoreDeviceHandle_t* devHandle, int deviceID, unsigned int flags,
        bool global_initialized);

struct AtlasContext {
    aclrtStream stream = nullptr;

    AtlasContext() = default;
    AtlasContext(aclrtStream s) : stream{s} {}
};

megcoreStatus_t createComputingHandleWithAtlasContext(
        megcoreComputingHandle_t* compHandle, megcoreDeviceHandle_t devHandle,
        unsigned int flags, const AtlasContext& ctx);

megcoreStatus_t getAtlasContext(megcoreComputingHandle_t handle,
                                AtlasContext* ctx);

namespace atlas {
//! convert acl error code to error string
const char* get_error_str(aclError error);
}  // namespace atlas
}  // namespace megcore

inline megcoreStatus_t megcoreCreateComputingHandleWithACLStream(
        megcoreComputingHandle_t* compHandle, megcoreDeviceHandle_t devHandle,
        unsigned int flags, aclrtStream stream) {
    megcore::AtlasContext ctx{stream};
    return megcore::createComputingHandleWithAtlasContext(compHandle, devHandle,
                                                          flags, ctx);
}

inline megcoreStatus_t megcoreGetACLStream(megcoreComputingHandle_t handle,
                                           aclrtStream* stream) {
    megcore::AtlasContext ctx;
    auto ret = megcore::getAtlasContext(handle, &ctx);
    *stream = ctx.stream;
    return ret;
}

#include "megdnn/internal/visibility_epilogue.h"

// vim: syntax=cpp.doxygen
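A minimal usage sketch of this header (illustrative only: `dev_handle` is assumed to be an Atlas device handle created elsewhere, and return codes are unchecked):

    // Wrap an existing ACL stream in a megcore computing handle, then read
    // the stream back through the C-style accessor.
    aclrtStream stream;
    aclrtCreateStream(&stream);
    megcoreComputingHandle_t comp;
    megcoreCreateComputingHandleWithACLStream(&comp, dev_handle, /*flags=*/0,
                                              stream);
    aclrtStream got = nullptr;
    megcoreGetACLStream(comp, &got);  // got == stream

Note that `createComputingHandleWithAtlasContext` asserts `flags == 0`, so passing any other flag value aborts in debug builds.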
@@ -0,0 +1,61 @@
/**
 * \file include/megcore_cambricon.h
 *
 * This file is part of MegDNN, a deep neural network run-time library
 * developed by Megvii.
 *
 * \copyright Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 */
#pragma once

#include "megcore.h"

#include <cndev.h>
#include <cnml.h>
#include <cnrt.h>

#include "megdnn/internal/visibility_prologue.h"

namespace megcore {
megcoreStatus_t createDeviceHandleWithGlobalInitStatus(
        megcoreDeviceHandle_t* devHandle, int deviceID, unsigned int flags,
        bool global_initialized);

struct CambriconContext {
    cnrtQueue_t queue = nullptr;

    CambriconContext() = default;
    CambriconContext(cnrtQueue_t q) : queue{q} {}
};

megcoreStatus_t createComputingHandleWithCambriconContext(
        megcoreComputingHandle_t* compHandle, megcoreDeviceHandle_t devHandle,
        unsigned int flags, const CambriconContext& ctx);

megcoreStatus_t getCambriconContext(megcoreComputingHandle_t handle,
                                    CambriconContext* ctx);

}  // namespace megcore

static inline megcoreStatus_t megcoreCreateComputingHandleWithCNRTQueue(
        megcoreComputingHandle_t* compHandle, megcoreDeviceHandle_t devHandle,
        unsigned int flags, cnrtQueue_t queue) {
    megcore::CambriconContext ctx{queue};
    return megcore::createComputingHandleWithCambriconContext(
            compHandle, devHandle, flags, ctx);
}

static inline megcoreStatus_t megcoreGetCNRTQueue(
        megcoreComputingHandle_t handle, cnrtQueue_t* queue) {
    megcore::CambriconContext ctx;
    auto ret = megcore::getCambriconContext(handle, &ctx);
    *queue = ctx.queue;
    return ret;
}

#include "megdnn/internal/visibility_epilogue.h"

// vim: syntax=cpp.doxygen
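The Cambricon variant follows the same pattern; a sketch under the same assumptions (`dev_handle` is a Cambricon device handle, error checking omitted):

    // Bind an existing CNRT queue to a megcore computing handle.
    cnrtQueue_t queue;
    cnrtCreateQueue(&queue);
    megcoreComputingHandle_t comp;
    megcoreCreateComputingHandleWithCNRTQueue(&comp, dev_handle, /*flags=*/0,
                                              queue);
    cnrtQueue_t got = nullptr;
    megcoreGetCNRTQueue(comp, &got);  // got == queue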
@@ -19,6 +19,8 @@
typedef enum {
    megcorePlatformCPU = 1,
    megcorePlatformCUDA = 4,
    megcorePlatformCambricon = 7,
    megcorePlatformAtlas = 8,
} megcorePlatform_t;

/**
@@ -33,6 +33,8 @@ class Handle {
        ARMV7 = 4,
        AARCH64 = 5,
        CUDA = 6,
        ATLAS = 13,
        CAMBRICON = 12,
    };

protected:
@@ -45,6 +45,24 @@ if(MGE_WITH_CUDA)
    list(APPEND SOURCES ${CUSOURCES})
endif()
if(MGE_WITH_CAMBRICON)
    file(GLOB_RECURSE SOURCES_ cambricon/*.cpp)
    list(APPEND SOURCES ${SOURCES_})
    file(GLOB_RECURSE BANG_SOURCES cambricon/*.mlu)
    list(APPEND MEGDNN_INCLUDES "${PROJECT_SOURCE_DIR}/dnn/include")
    list(APPEND MEGDNN_INCLUDES "${PROJECT_SOURCE_DIR}/dnn")
    list(APPEND MEGDNN_INCLUDES "${PROJECT_BINARY_DIR}/genfiles")
    bang_compile(BANG_OBJS "${BANG_SOURCES}" "${MEGDNN_INCLUDES}")
    list(APPEND SOURCES ${BANG_OBJS})
endif()
if(MGE_WITH_ATLAS)
    file(GLOB_RECURSE SOURCES_ atlas/*.cpp)
    list(APPEND SOURCES ${SOURCES_})
    list(APPEND LIBMEGDNN_DEF -DMEGDNN_WITH_ATLAS=1)
endif()
add_definitions(${LIBMEGDNN_DEF})
@@ -97,8 +115,21 @@ else()
    target_link_libraries(megdnn PRIVATE ${MGE_BLAS_LIBS})
endif()
if(MGE_WITH_ATLAS)
    if(BUILD_SHARED_LIBS)
        target_link_libraries(megdnn PRIVATE $<BUILD_INTERFACE:${MGE_ATLAS_LIBS}>)
    else()
        target_link_libraries(megdnn PRIVATE ${MGE_ATLAS_LIBS})
    endif()
endif()
if(CMAKE_THREAD_LIBS_INIT)
    target_link_libraries(megdnn PRIVATE Threads::Threads)
endif()
if(MGE_WITH_CAMBRICON)
    target_link_libraries(megdnn PRIVATE ${BANG_OBJS} ${MGE_CAMBRICON_LIBS})
endif()
install(TARGETS megdnn EXPORT ${MGE_EXPORT_TARGETS})
@@ -0,0 +1,53 @@
/**
 * \file dnn/src/atlas/checksum/opr_impl.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "src/atlas/checksum/opr_impl.h"
#include "src/atlas/utils.h"
#include "src/naive/handle.h"
#include "src/common/utils.h"
#include "src/common/opr_delegate.h"

#include <cstring>
#include <vector>

using namespace megdnn;
using namespace atlas;

size_t ChecksumForwardImpl::get_workspace_in_bytes(const TensorLayout&) {
    return 0;
}

ChecksumForward::Result ChecksumForwardImpl::exec(_megdnn_tensor_in data,
                                                  _megdnn_workspace workspace) {
    check_exec(data.layout, workspace.size);
    //! FIXME: the cce programming interface is not stable yet, so copy the
    //! data to host memory and compute the checksum on the CPU instead
    std::vector<uint8_t> cpu_data(data.layout.span().dist_byte(), 0);
    megcoreDeviceHandle_t dev_handle;
    megcoreComputingHandle_t comp_handle = handle()->megcore_computing_handle();
    megcoreGetDeviceHandle(comp_handle, &dev_handle);
    megcoreMemcpy(comp_handle, cpu_data.data(), data.raw_ptr, cpu_data.size(),
                  megcoreMemcpyDeviceToHost);
    megcoreSynchronize(comp_handle);

    auto opr = inplace_cpu_handle()->create_operator<ChecksumForward>();
    size_t workspace_size = opr->get_workspace_in_bytes(data.layout);
    std::vector<uint8_t> cpu_workspace_data(workspace_size, 0);
    Workspace cpu_workspace(
            reinterpret_cast<dt_byte*>(cpu_workspace_data.data()),
            cpu_workspace_data.size());
    return opr->exec(TensorND{cpu_data.data(), data.layout}, cpu_workspace);
}

// vim: syntax=cpp.doxygen
@@ -0,0 +1,34 @@
/**
 * \file dnn/src/atlas/checksum/opr_impl.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#pragma once

#include "megdnn/oprs.h"

namespace megdnn {
namespace atlas {

class ChecksumForwardImpl final : public ChecksumForward {
public:
    using ChecksumForward::ChecksumForward;

    bool is_thread_safe() const override { return true; }

    size_t get_workspace_in_bytes(const TensorLayout& data) override;

    Result exec(_megdnn_tensor_in data, _megdnn_workspace workspace) override;
};

}  // namespace atlas
}  // namespace megdnn

// vim: syntax=cpp.doxygen
@@ -0,0 +1,59 @@
/**
 * \file dnn/src/atlas/handle.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "megcore_atlas.h"
#include "src/common/handle_impl.h"
#include "src/atlas/handle.h"
#include "src/atlas/checksum/opr_impl.h"

#include <acl/acl.h>

namespace megdnn {
namespace atlas {

HandleImpl::HandleImpl(megcoreComputingHandle_t comp_handle)
        : HandleImplHelper(comp_handle, HandleType::ATLAS) {
    // get the megcore device handle
    megcoreDeviceHandle_t dev_handle;
    megcoreGetDeviceHandle(comp_handle, &dev_handle);
    int dev_id;
    megcoreGetDeviceID(dev_handle, &dev_id);
    m_device_id = dev_id;
    megcore::getAtlasContext(comp_handle, &m_megcore_context);
}

HandleImpl::~HandleImpl() noexcept = default;

template <typename Opr>
std::unique_ptr<Opr> HandleImpl::create_operator() {
    megdnn_throw("unsupported atlas opr");
    return nullptr;
}

size_t HandleImpl::alignment_requirement() const {
    //! the async memcpy API requires that memory is 128-byte aligned
    return 64;
}

MEGDNN_SPECIALIZE_CREATE_OPERATOR(ChecksumForward);

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpragmas"
#pragma GCC diagnostic ignored "-Winstantiation-after-specialization"
MEGDNN_FOREACH_OPR_CLASS(MEGDNN_INST_CREATE_OPERATOR)
#pragma GCC diagnostic pop

}  // namespace atlas
}  // namespace megdnn

// vim: syntax=cpp.doxygen
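Only ChecksumForward gets a specialized implementation here; every other operator class falls through to the throwing template. A hypothetical end-to-end sketch, assuming `comp_handle` is an Atlas computing handle, `tensor` is a device tensor, and the usual `megdnn::Handle::make` entry point:

    auto handle = megdnn::Handle::make(comp_handle);  // yields HandleType::ATLAS
    auto opr = handle->create_operator<megdnn::ChecksumForward>();
    auto result = opr->exec(tensor, {});  // delegates to the CPU, see opr_impl.cpp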
@@ -0,0 +1,63 @@
/**
 * \file dnn/src/atlas/handle.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#pragma once

#include "megcore_atlas.h"
#include "megdnn/basic_types.h"
#include "megdnn/handle.h"
#include "megdnn/oprs/general.h"
#include "src/common/handle_impl.h"
#include "src/common/megcore/common/device_context.hpp"
#include "src/common/utils.h"
#include "src/atlas/megcore/device_context.hpp"

#include <atomic>
#include <mutex>

#include "acl/acl_rt.h"

namespace megdnn {
namespace atlas {

class HandleImpl : public HandleImplHelper {
public:
    HandleImpl(megcoreComputingHandle_t computing_handle);
    ~HandleImpl() noexcept;

    size_t alignment_requirement() const override;

    template <typename Opr>
    std::unique_ptr<Opr> create_operator();

    const megcore::AtlasContext& megcore_context() const {
        return m_megcore_context;
    }

    int device_id() const { return m_device_id; }

    aclrtStream stream() const { return megcore_context().stream; }

    //! global checksum opr
    Checksum* checksum_opr() override final {
        return get_helper_opr<Checksum, 0>(this);
    }

private:
    int m_device_id;
    //! MegDNN handle does not manage the lifetime of the ACL stream.
    megcore::AtlasContext m_megcore_context;
};

}  // namespace atlas
}  // namespace megdnn

// vim: syntax=cpp.doxygen
@@ -0,0 +1,71 @@
/**
 * \file dnn/src/atlas/megcore/atlas_computing_context.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "megcore.h"

#include "src/atlas/megcore/computing_context.hpp"
#include "src/atlas/utils.h"
#include "src/common/utils.h"

using namespace megcore;
using namespace megcore::atlas;

AtlasComputingContext::AtlasComputingContext(megcoreDeviceHandle_t dev_handle,
                                             unsigned int flags,
                                             const AtlasContext& ctx)
        : ComputingContext(dev_handle, flags),
          m_own_stream{ctx.stream == nullptr},
          m_ctx{ctx} {
    megcorePlatform_t platform;
    megcoreGetPlatform(dev_handle, &platform);
    megdnn_assert(platform == megcorePlatformAtlas);
    if (m_own_stream) {
        acl_check(aclrtCreateStream(&m_ctx.stream));
    }
}

AtlasComputingContext::~AtlasComputingContext() {
    if (m_own_stream) {
        acl_check(aclrtDestroyStream(m_ctx.stream));
    }
}

void AtlasComputingContext::memcpy(void* dst, const void* src,
                                   size_t size_in_bytes,
                                   megcoreMemcpyKind_t kind) {
    aclrtMemcpyKind atlas_kind;
    switch (kind) {
        case megcoreMemcpyDeviceToHost:
            atlas_kind = ACL_MEMCPY_DEVICE_TO_HOST;
            break;
        case megcoreMemcpyHostToDevice:
            atlas_kind = ACL_MEMCPY_HOST_TO_DEVICE;
            break;
        case megcoreMemcpyDeviceToDevice:
            atlas_kind = ACL_MEMCPY_DEVICE_TO_DEVICE;
            break;
        default:
            megdnn_throw("bad atlas memcpy kind");
    }
    acl_check(aclrtMemcpyAsync(dst, size_in_bytes, src, size_in_bytes,
                               atlas_kind, m_ctx.stream));
}

void AtlasComputingContext::memset(void* dst, int value, size_t size_in_bytes) {
    acl_check(aclrtSynchronizeStream(m_ctx.stream));
    acl_check(aclrtMemset(dst, size_in_bytes, value, size_in_bytes));
}

void AtlasComputingContext::synchronize() {
    acl_check(aclrtSynchronizeStream(m_ctx.stream));
}

// vim: syntax=cpp.doxygen
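One behavioral detail worth spelling out: memcpy() only enqueues an aclrtMemcpyAsync on the context's stream. A hypothetical caller-side sketch (`ctx`, `host_buf`, `dev_ptr`, and `nbytes` are assumed names):

    ctx.memcpy(host_buf, dev_ptr, nbytes, megcoreMemcpyDeviceToHost);
    ctx.synchronize();  // host_buf is only valid after the stream drains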
@@ -0,0 +1,45 @@
/**
 * \file dnn/src/atlas/megcore/computing_context.hpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#pragma once

#include "megcore_atlas.h"
#include "src/common/megcore/common/computing_context.hpp"

#include <acl/acl_rt.h>

namespace megcore {
namespace atlas {

class AtlasComputingContext final : public ComputingContext {
public:
    AtlasComputingContext(megcoreDeviceHandle_t dev_handle, unsigned int flags,
                          const AtlasContext& ctx = {});
    ~AtlasComputingContext();

    void memcpy(void* dst, const void* src, size_t size_in_bytes,
                megcoreMemcpyKind_t kind) override;
    void memset(void* dst, int value, size_t size_in_bytes) override;
    void synchronize() override;

    const AtlasContext& context() const { return m_ctx; }

    aclrtStream stream() const { return context().stream; }

private:
    bool m_own_stream;
    AtlasContext m_ctx;
};

}  // namespace atlas
}  // namespace megcore

// vim: syntax=cpp.doxygen
@@ -0,0 +1,67 @@
/**
 * \file dnn/src/atlas/megcore/device_context.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "src/atlas/megcore/device_context.hpp"

#include "megcore.h"
#include "src/atlas/utils.h"
#include "src/common/utils.h"

#include "acl/acl.h"

using namespace megcore;
using namespace atlas;

AtlasDeviceContext::AtlasDeviceContext(int device_id, unsigned int flags,
                                       bool global_initialized)
        : DeviceContext(megcorePlatformAtlas, device_id, flags) {
    if (!global_initialized)
        init_status.init();

    int id = device_id;
    if (id < 0) {
        acl_check(aclrtGetDevice(&id));
    }
}

AtlasDeviceContext::~AtlasDeviceContext() noexcept = default;

size_t AtlasDeviceContext::mem_alignment_in_bytes() const noexcept {
    return 64;
}

void AtlasDeviceContext::activate() {
    int id = device_id();
    if (id >= 0) {
        acl_check(aclrtSetDevice(id));
    }
}

void AtlasDeviceContext::deactivate() {
    int id = device_id();
    megdnn_assert(id >= 0);
    acl_check(aclrtResetDevice(id));
}

void* AtlasDeviceContext::malloc(size_t size_in_bytes) {
    void* ptr;
    acl_check(aclrtMalloc(&ptr, size_in_bytes, ACL_MEM_MALLOC_HUGE_FIRST));
    return ptr;
}

void AtlasDeviceContext::free(void* ptr) {
    acl_check(aclrtFree(ptr));
}

AtlasDeviceContext::InitStatus AtlasDeviceContext::init_status;

// vim: syntax=cpp.doxygen
@@ -0,0 +1,64 @@
/**
 * \file dnn/src/atlas/megcore/device_context.hpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#pragma once

#include "src/common/megcore/common/device_context.hpp"
#include "src/common/utils.h"
#include "megcore_atlas.h"

#include <mutex>

#include "acl/acl.h"

namespace megcore {
namespace atlas {

class AtlasDeviceContext : public DeviceContext {
public:
    AtlasDeviceContext(int device_id, unsigned int flags,
                       bool global_initialized = false);
    ~AtlasDeviceContext() noexcept;

    size_t mem_alignment_in_bytes() const noexcept override;

    void activate() override;
    void deactivate() override;
    void* malloc(size_t size_in_bytes) override;
    void free(void* ptr) override;

    struct InitStatus {
        bool initialized;
        std::mutex mtx;
        InitStatus() : initialized{false} {}
        void init() {
            std::lock_guard<std::mutex> guard{mtx};
            if (!initialized) {
                auto err = aclInit(nullptr);
                initialized = err == ACL_ERROR_NONE;
                megdnn_assert(initialized,
                              "aclrt initialize failed: (acl:%d): %s",
                              static_cast<int>(err),
                              megcore::atlas::get_error_str(err));
            }
        }
        ~InitStatus() {
            if (initialized) {
                initialized = false;
            }
        }
    };
    static InitStatus init_status;
};

}  // namespace atlas
}  // namespace megcore

// vim: syntax=cpp.doxygen
@@ -0,0 +1,131 @@
/**
 * \file dnn/src/atlas/megcore/public_api/computing.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "megcore_atlas.h"

#include "src/atlas/megcore/computing_context.hpp"
#include "src/atlas/megcore/device_context.hpp"
#include "src/common/megcore/public_api/computing.hpp"
#include "src/common/megcore/public_api/device.hpp"
#include "src/common/utils.h"

using namespace megcore;

megcoreStatus_t megcore::createAtlasDeviceHandleWithGlobalInitStatus(
        megcoreDeviceHandle_t* devHandle, int deviceID, unsigned int flags,
        bool global_initialized) {
    auto content = megdnn::make_unique<atlas::AtlasDeviceContext>(
            deviceID, flags, global_initialized);
    auto& ctx = *devHandle;
    ctx = new megcoreDeviceContext;
    ctx->content = std::move(content);
    return megcoreSuccess;
}

megcoreStatus_t megcore::createComputingHandleWithAtlasContext(
        megcoreComputingHandle_t* compHandle, megcoreDeviceHandle_t devHandle,
        unsigned int flags, const AtlasContext& ctx) {
    MEGDNN_MARK_USED_VAR(flags);
    megdnn_assert(flags == 0);
    auto content = megdnn::make_unique<atlas::AtlasComputingContext>(
            devHandle, flags, ctx);
    auto& H = *compHandle;
    H = new megcoreComputingContext;
    H->content = std::move(content);
    return megcoreSuccess;
}

megcoreStatus_t megcore::getAtlasContext(megcoreComputingHandle_t handle,
                                         AtlasContext* ctx) {
    auto&& H = handle;
    megdnn_assert(H);
    megcoreDeviceHandle_t dev_handle = H->content->dev_handle();
    megcorePlatform_t platform;
    megcoreGetPlatform(dev_handle, &platform);
    megdnn_assert(platform == megcorePlatformAtlas);
    auto context = static_cast<megcore::atlas::AtlasComputingContext*>(
            H->content.get());
    *ctx = context->context();
    return megcoreSuccess;
}

const char* megcore::atlas::get_error_str(aclError error) {
#define ERROR(_err) \
    case _err:      \
        return #_err;
    switch (error) {
        ERROR(ACL_ERROR_NONE);
        ERROR(ACL_ERROR_INVALID_PARAM);
        ERROR(ACL_ERROR_UNINITIALIZE);
        ERROR(ACL_ERROR_REPEAT_INITIALIZE);
        ERROR(ACL_ERROR_INVALID_FILE);
        ERROR(ACL_ERROR_WRITE_FILE);
        ERROR(ACL_ERROR_INVALID_FILE_SIZE);
        ERROR(ACL_ERROR_PARSE_FILE);
        ERROR(ACL_ERROR_FILE_MISSING_ATTR);
        ERROR(ACL_ERROR_FILE_ATTR_INVALID);
        ERROR(ACL_ERROR_INVALID_DUMP_CONFIG);
        ERROR(ACL_ERROR_INVALID_PROFILING_CONFIG);
        ERROR(ACL_ERROR_INVALID_MODEL_ID);
        ERROR(ACL_ERROR_DESERIALIZE_MODEL);
        ERROR(ACL_ERROR_PARSE_MODEL);
        ERROR(ACL_ERROR_READ_MODEL_FAILURE);
        ERROR(ACL_ERROR_MODEL_SIZE_INVALID);
        ERROR(ACL_ERROR_MODEL_MISSING_ATTR);
        ERROR(ACL_ERROR_MODEL_INPUT_NOT_MATCH);
        ERROR(ACL_ERROR_MODEL_OUTPUT_NOT_MATCH);
        ERROR(ACL_ERROR_MODEL_NOT_DYNAMIC);
        ERROR(ACL_ERROR_OP_TYPE_NOT_MATCH);
        ERROR(ACL_ERROR_OP_INPUT_NOT_MATCH);
        ERROR(ACL_ERROR_OP_OUTPUT_NOT_MATCH);
        ERROR(ACL_ERROR_OP_ATTR_NOT_MATCH);
        ERROR(ACL_ERROR_OP_NOT_FOUND);
        ERROR(ACL_ERROR_OP_LOAD_FAILED);
        ERROR(ACL_ERROR_UNSUPPORTED_DATA_TYPE);
        ERROR(ACL_ERROR_FORMAT_NOT_MATCH);
        ERROR(ACL_ERROR_BIN_SELECTOR_NOT_REGISTERED);
        ERROR(ACL_ERROR_KERNEL_NOT_FOUND);
        ERROR(ACL_ERROR_BIN_SELECTOR_ALREADY_REGISTERED);
        ERROR(ACL_ERROR_KERNEL_ALREADY_REGISTERED);
        ERROR(ACL_ERROR_INVALID_QUEUE_ID);
        ERROR(ACL_ERROR_REPEAT_SUBSCRIBE);
        ERROR(ACL_ERROR_STREAM_NOT_SUBSCRIBE);
        ERROR(ACL_ERROR_THREAD_NOT_SUBSCRIBE);
        ERROR(ACL_ERROR_WAIT_CALLBACK_TIMEOUT);
        ERROR(ACL_ERROR_REPEAT_FINALIZE);
        ERROR(ACL_ERROR_NOT_STATIC_AIPP);
        ERROR(ACL_ERROR_BAD_ALLOC);
        ERROR(ACL_ERROR_API_NOT_SUPPORT);
        ERROR(ACL_ERROR_INVALID_DEVICE);
        ERROR(ACL_ERROR_MEMORY_ADDRESS_UNALIGNED);
        ERROR(ACL_ERROR_RESOURCE_NOT_MATCH);
        ERROR(ACL_ERROR_INVALID_RESOURCE_HANDLE);
        ERROR(ACL_ERROR_FEATURE_UNSUPPORTED);
        ERROR(ACL_ERROR_STORAGE_OVER_LIMIT);
        ERROR(ACL_ERROR_INTERNAL_ERROR);
        ERROR(ACL_ERROR_FAILURE);
        ERROR(ACL_ERROR_GE_FAILURE);
        ERROR(ACL_ERROR_RT_FAILURE);
        ERROR(ACL_ERROR_DRV_FAILURE);
        ERROR(ACL_ERROR_PROFILING_FAILURE);
        default:
            return "unknown error";
    }
#undef ERROR
}

// vim: syntax=cpp.doxygen
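get_error_str() maps each listed ACL status to its macro name, which keeps failure logs self-describing. A small illustrative sketch (the deliberately failing call and the printf logging are assumptions, not part of this diff):

    aclError err = aclrtSetDevice(-1);  // expected to fail
    if (err != ACL_ERROR_NONE) {
        printf("acl call failed: %s (%d)\n",
               megcore::atlas::get_error_str(err), static_cast<int>(err));
    }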
@@ -0,0 +1,26 @@
/**
 * \file dnn/src/atlas/utils.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "src/atlas/utils.h"

#include "megcore_atlas.h"
#include "src/common/utils.h"

using namespace megdnn;
using namespace atlas;

void atlas::__throw_acl_error__(aclError err, const char* msg) {
    auto s = ssprintf("acl error %s(%d) occurred; expr: %s",
                      megcore::atlas::get_error_str(err), int(err), msg);
    megdnn_throw(s.c_str());
}

// vim: syntax=cpp.doxygen
@@ -0,0 +1,40 @@
/**
 * \file dnn/src/atlas/utils.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#pragma once

#include "megdnn/handle.h"
#include "src/atlas/handle.h"

#include <acl/acl_base.h>

#define acl_check(_x)                                        \
    do {                                                     \
        aclError _ret = (_x);                                \
        if (_ret != ACL_ERROR_NONE) {                        \
            ::megdnn::atlas::__throw_acl_error__(_ret, #_x); \
        }                                                    \
    } while (0)

namespace megdnn {
namespace atlas {

inline HandleImpl* concrete_handle(Handle* handle) {
    return static_cast<atlas::HandleImpl*>(handle);
}

//! error handling functions
MEGDNN_NORETURN void __throw_acl_error__(aclError err, const char* msg);

}  // namespace atlas
}  // namespace megdnn

// vim: syntax=cpp.doxygen
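acl_check wraps any ACL call and, on failure, throws through __throw_acl_error__ with the stringified expression, so call sites stay one-liners, e.g. (illustrative):

    acl_check(aclrtSetDevice(0));  // on failure throws "acl error ...; expr: aclrtSetDevice(0)"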
@@ -0,0 +1,31 @@
load("//brain/megbrain/dnn:flags.bzl", "megdnn_opts")
load("@megvii3//tools/build_rules:bangc.bzl", "bangc_library")

package(default_visibility = ["//brain/megbrain/dnn:__subpackages__"])

bangc_library(
    name = "bangc_kernels",
    srcs = glob([
        "**/*.mlu",
    ]) + [
        "//brain/megbrain/dnn:src/common/utils.cuh",
    ],
    hdrs = glob([
        "**/*.mlu.h",
    ]),
    deps = [
        "//brain/megbrain/dnn:public_headers",
    ],
    copts = megdnn_opts + [
        "-Ibrain/megbrain/dnn",
    ],
)

filegroup(
    name = "cambricon_backend_files",
    srcs = glob([
        "**/*.cpp",
        "**/*.h",
        "**/*.hpp",
    ]),
)
@@ -0,0 +1,27 @@
/**
 * \file dnn/src/cambricon/checksum/checksum.mlu.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#pragma once

#include "src/cambricon/utils.mlu.h"

#ifdef __cplusplus
extern "C" {
#endif

// note: parameter types must match the kernel definitions in the .mlu files
void checksum_kernel_union1(uint32_t* dst, uint32_t* src, int num_elems);
void checksum_kernel_union4(uint32_t* dst, uint32_t* src, int num_elems);

#ifdef __cplusplus
}
#endif

// vim: ft=cpp syntax=cpp.doxygen
@@ -0,0 +1,61 @@
/**
 * \file dnn/src/cambricon/checksum/checksum_kernel_union1.mlu
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "checksum.mlu.h"

#include "cnsccl.h"
#include "mlu.h"

#define CLUSTER_DIM 1
#define CORE_DIM 4
#define STRIDE 1024

__mlu_entry__ void checksum_kernel_union1(uint32_t* dst, uint32_t* src,
                                          int nr_elems) {
    __nram__ uint32_t sum = 0;
    __nram__ uint32_t val[STRIDE];
    const uint32_t TASK_DIM = CLUSTER_DIM * CORE_DIM;
    __mlu_shared__ uint32_t partial_sum[TASK_DIM];
    int task_stride = STRIDE;
    int start_offset = taskId * task_stride;
    int global_stride = taskDim * task_stride;

    for (int task_offset = start_offset; task_offset < nr_elems;
         task_offset += global_stride) {
        int end_offset = task_offset + task_stride;
        end_offset = end_offset > nr_elems ? nr_elems : end_offset;
        int copy_elems = end_offset - task_offset;
        __memcpy(val, src + task_offset, copy_elems * sizeof(uint32_t),
                 GDRAM2NRAM);
        for (int i = 0; i < copy_elems; i++) {
            sum = sum + val[i] * (task_offset + i + 1);
        }
    }

    partial_sum[taskId] = sum;
    __sync_cluster();

    if (taskId == 0) {
        uint32_t res = 0;
        for (int i = 0; i < taskDim; i++) {
            res += partial_sum[i];
        }
        dst[0] = res;
    }
}

#undef CLUSTER_DIM
#undef CORE_DIM
#undef STRIDE

// vim: ft=cpp syntax=cpp.doxygen
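Both checksum kernels compute the same quantity and differ only in how partial sums are reduced across cores and clusters. A host-side reference in plain C++ (a sketch; the function name is illustrative):

    // Each 32-bit word is weighted by its 1-based index; the sum wraps
    // modulo 2^32, matching `sum += val[i] * (task_offset + i + 1)` above.
    uint32_t checksum_reference(const uint32_t* src, int nr_elems) {
        uint32_t sum = 0;
        for (int i = 0; i < nr_elems; ++i)
            sum += src[i] * static_cast<uint32_t>(i + 1);
        return sum;
    }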
@@ -0,0 +1,71 @@
/**
 * \file dnn/src/cambricon/checksum/checksum_kernel_union4.mlu
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "checksum.mlu.h"

#include "cnsccl.h"
#include "mlu.h"

#define CLUSTER_DIM 4
#define CORE_DIM 4
#define STRIDE 1024

__mlu_entry__ void checksum_kernel_union4(uint32_t* dst, uint32_t* src,
                                          int nr_elems) {
    __nram__ uint32_t sum = 0;
    __nram__ uint32_t val[STRIDE];
    __mlu_shared__ uint32_t partial_sum_send[CORE_DIM];
    __mlu_shared__ uint32_t partial_sum_recv[CLUSTER_DIM];
    int task_stride = STRIDE;
    int start_offset = taskId * task_stride;
    int global_stride = taskDim * task_stride;

    for (int task_offset = start_offset; task_offset < nr_elems;
         task_offset += global_stride) {
        int end_offset = task_offset + task_stride;
        end_offset = end_offset > nr_elems ? nr_elems : end_offset;
        int copy_elems = end_offset - task_offset;
        __memcpy(val, src + task_offset, copy_elems * sizeof(uint32_t),
                 GDRAM2NRAM);
        for (int i = 0; i < copy_elems; i++) {
            sum = sum + val[i] * (task_offset + i + 1);
        }
    }

    partial_sum_send[coreId] = sum;
    __sync_cluster();

    if (coreId == 0) {
        for (int i = 1; i < CORE_DIM; ++i) {
            partial_sum_send[0] += partial_sum_send[i];
        }
    }
    __sync_all();
    cnscclGather((void*)&partial_sum_send, (void*)&partial_sum_recv, 1,
                 cnscclInt, 0);
    __sync_all();

    if (clusterId == 0 && coreId == 0) {
        uint32_t res = 0;
        for (int i = 0; i < CLUSTER_DIM; ++i) {
            res += partial_sum_recv[i];
        }
        dst[0] = res;
    }
}

#undef CLUSTER_DIM
#undef CORE_DIM
#undef STRIDE

// vim: ft=cpp syntax=cpp.doxygen
@@ -0,0 +1,85 @@
/**
 * \file dnn/src/cambricon/checksum/opr_impl.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "src/cambricon/checksum/checksum.mlu.h"
#include "src/cambricon/checksum/opr_impl.h"
#include "src/cambricon/utils.h"

#include <algorithm>
#include <cstring>

using namespace megdnn;
using namespace cambricon;

namespace {

void bang_c_wrapper(uint32_t* dst, const uint32_t* src, int nr_elems,
                    cnrtQueue_t queue, cnrtCoreVersion_t core_version) {
    cnrtKernelParamsBuffer_t params;
    cnrt_check(cnrtGetKernelParamsBuffer(&params));
    cnrt_check(cnrtKernelParamsBufferAddParam(params, &dst, sizeof(uint32_t*)));
    cnrt_check(cnrtKernelParamsBufferAddParam(params, &src, sizeof(uint32_t*)));
    cnrt_check(cnrtKernelParamsBufferAddParam(params, &nr_elems, sizeof(int)));
    if (core_version == CNRT_MLU270) {
        cnrtDim3_t dim;
        dim.x = 16;
        dim.y = 1;
        dim.z = 1;
        cnrtFunctionType_t c = CNRT_FUNC_TYPE_UNION4;
        cnrt_check(cnrtInvokeKernel_V2((void*)&checksum_kernel_union4, dim,
                                       params, c, queue));
    } else if (core_version == CNRT_MLU220) {
        cnrtDim3_t dim;
        dim.x = 4;
        dim.y = 1;
        dim.z = 1;
        cnrtFunctionType_t c = CNRT_FUNC_TYPE_UNION1;
        cnrt_check(cnrtInvokeKernel_V2((void*)&checksum_kernel_union1, dim,
                                       params, c, queue));
    }
    after_kernel_launch();
    cnrt_check(cnrtDestroyKernelParamsBuffer(params));
}

}  // namespace

size_t ChecksumForwardImpl::get_workspace_in_bytes(const TensorLayout& /* data */) {
    size_t ws_size = sizeof(ChecksumForward::Result::checksum);
    return ws_size;
}

ChecksumForward::Result ChecksumForwardImpl::exec(_megdnn_tensor_in data,
                                                  _megdnn_workspace workspace) {
    Result result;
    memset(&result, 0, sizeof(result));
    check_exec(data.layout, workspace.size);
    auto queue = cnrt_queue(handle());

    auto ptr = static_cast<uint8_t*>(data.raw_ptr);
    size_t size_all = data.layout.shape[0],
           size_ints = size_all / sizeof(uint32_t);
    auto last_val_size = std::min<size_t>(size_all, 4);
    cnrt_check(cnrtMemcpyAsync(&result.last_val, ptr + size_all - last_val_size,
                               last_val_size, queue,
                               CNRT_MEM_TRANS_DIR_DEV2HOST));
    if (size_ints) {
        auto&& device_info = current_device_info();
        bang_c_wrapper(reinterpret_cast<uint32_t*>(workspace.raw_ptr),
                       static_cast<uint32_t*>(data.raw_ptr), size_ints, queue,
                       device_info.core_version);
        cnrt_check(cnrtMemcpyAsync(&result.checksum, workspace.raw_ptr,
                                   sizeof(result.checksum), queue,
                                   CNRT_MEM_TRANS_DIR_DEV2HOST));
    }
    cnrt_check(cnrtSyncQueue(queue));
    return result;
}

// vim: syntax=cpp.doxygen
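The result carries two fields: the weighted checksum over the word-aligned prefix and the trailing bytes of the buffer. A hypothetical caller sketch (`handle`, `tensor`, and the workspace pointer `ws_ptr` are assumed names):

    auto opr = handle->create_operator<megdnn::ChecksumForward>();
    megdnn::Workspace ws(ws_ptr, opr->get_workspace_in_bytes(tensor.layout));
    auto result = opr->exec(tensor, ws);
    // result.checksum: weighted sum over the uint32 words
    // result.last_val: value copied from the last bytes of the input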
@@ -0,0 +1,36 @@
/**
 * \file dnn/src/cambricon/checksum/opr_impl.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#pragma once

#include "megdnn/oprs.h"
#include "src/cambricon/utils.h"

namespace megdnn {
namespace cambricon {

class ChecksumForwardImpl final : public ChecksumForward {
public:
    using ChecksumForward::ChecksumForward;

    size_t get_workspace_in_bytes(const TensorLayout&) override;

    bool is_thread_safe() const override { return true; }

    Result exec(_megdnn_tensor_in data, _megdnn_workspace workspace) override;
};

}  // namespace cambricon
}  // namespace megdnn

// vim: syntax=cpp.doxygen
@@ -0,0 +1,68 @@
/**
 * \file dnn/src/cambricon/handle.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "src/common/handle_impl.h"
#include "src/common/version_symbol.h"

#include "src/cambricon/handle.h"
#include "src/cambricon/utils.h"

#include "src/cambricon/checksum/opr_impl.h"

#include <cnrt.h>

namespace megdnn {
namespace cambricon {

HandleImpl::HandleImpl(megcoreComputingHandle_t comp_handle)
        : HandleImplHelper(comp_handle, HandleType::CAMBRICON) {
    // get the megcore device handle
    megcoreDeviceHandle_t dev_handle;
    megcoreGetDeviceHandle(comp_handle, &dev_handle);
    int dev_id;
    megcoreGetDeviceID(dev_handle, &dev_id);
    unsigned int dev_num;
    cnrt_check(cnrtGetDeviceCount(&dev_num));
    MEGDNN_MARK_USED_VAR(dev_num);
    // check validity of device_id
    megdnn_assert(dev_id >= 0 && static_cast<unsigned int>(dev_id) < dev_num);
    m_device_id = dev_id;
    cnrt_check(cnrtGetDeviceInfo(&m_device_info, dev_id));
    megcore::getCambriconContext(comp_handle, &m_megcore_context);
}

HandleImpl::~HandleImpl() noexcept = default;

template <typename Opr>
std::unique_ptr<Opr> HandleImpl::create_operator() {
    megdnn_throw("unsupported cambricon opr");
    return nullptr;
}

size_t HandleImpl::alignment_requirement() const {
    return 1;
}

MEGDNN_SPECIALIZE_CREATE_OPERATOR(ChecksumForward);

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpragmas"
#pragma GCC diagnostic ignored "-Winstantiation-after-specialization"
MEGDNN_FOREACH_OPR_CLASS(MEGDNN_INST_CREATE_OPERATOR)
#pragma GCC diagnostic pop

}  // namespace cambricon
}  // namespace megdnn

MEGDNN_VERSION_SYMBOL3(CNRT, CNRT_MAJOR_VERSION, CNRT_MINOR_VERSION,
                       CNRT_PATCH_VERSION);

// vim: syntax=cpp.doxygen
@@ -0,0 +1,65 @@
/**
 * \file dnn/src/cambricon/handle.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#pragma once

#include "megcore_cambricon.h"
#include "megdnn/basic_types.h"
#include "megdnn/handle.h"
#include "megdnn/oprs/general.h"
#include "src/common/handle_impl.h"
#include "src/common/utils.h"

#include <atomic>
#include <mutex>

#include <cnrt.h>

namespace megdnn {
namespace cambricon {

class HandleImpl : public HandleImplHelper {
public:
    HandleImpl(megcoreComputingHandle_t computing_handle);
    ~HandleImpl() noexcept;

    size_t alignment_requirement() const override;

    const cnrtDeviceInfo_t& device_info() const { return m_device_info; }

    template <typename Opr>
    std::unique_ptr<Opr> create_operator();

    const megcore::CambriconContext& megcore_context() const {
        return m_megcore_context;
    }

    int device_id() const { return m_device_id; }

    cnrtQueue_t queue() const { return megcore_context().queue; }

    //! global checksum opr
    Checksum* checksum_opr() override final {
        return get_helper_opr<Checksum, 0>(this);
    }

private:
    int m_device_id;
    //! MegDNN handle does not manage the lifetime of the cnrt queue.
    megcore::CambriconContext m_megcore_context;
    cnrtDeviceInfo_t m_device_info;
};

}  // namespace cambricon
}  // namespace megdnn

// vim: syntax=cpp.doxygen
@@ -0,0 +1,78 @@
/**
 * \file dnn/src/cambricon/megcore/cambricon_computing_context.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "megcore.h"

#include "src/cambricon/utils.h"
#include "src/common/utils.h"

#include "src/cambricon/megcore/cambricon_computing_context.hpp"

using namespace megcore;
using namespace megcore::cambricon;

CambriconComputingContext::CambriconComputingContext(
        megcoreDeviceHandle_t dev_handle, unsigned int flags,
        const CambriconContext& ctx)
        : ComputingContext(dev_handle, flags),
          own_queue{ctx.queue == nullptr},
          context_{ctx} {
    megcorePlatform_t platform;
    megcoreGetPlatform(dev_handle, &platform);
    megdnn_assert(platform == megcorePlatformCambricon);
    if (own_queue) {
        cnrt_check(cnrtCreateQueue(&context_.queue));
    }
}

CambriconComputingContext::~CambriconComputingContext() {
    if (own_queue) {
        cnrt_check(cnrtDestroyQueue(context_.queue));
    }
}

void CambriconComputingContext::memcpy(void* dst, const void* src,
                                       size_t size_in_bytes,
                                       megcoreMemcpyKind_t kind) {
    cnrtMemTransDir_t dir;
    switch (kind) {
        case megcoreMemcpyDeviceToHost:
            dir = CNRT_MEM_TRANS_DIR_DEV2HOST;
            break;
        case megcoreMemcpyHostToDevice:
            dir = CNRT_MEM_TRANS_DIR_HOST2DEV;
            break;
        case megcoreMemcpyDeviceToDevice:
            dir = CNRT_MEM_TRANS_DIR_DEV2DEV;
            break;
        default:
            megdnn_throw(megdnn_mangle("bad cnrt mem trans dir"));
    }
    if (kind == megcoreMemcpyDeviceToDevice) {
        cnrt_check(cnrtSyncQueue(context_.queue));
        cnrt_check(cnrtMemcpy(dst, const_cast<void*>(src), size_in_bytes, dir));
        return;
    }
    cnrt_check(cnrtMemcpyAsync(dst, const_cast<void*>(src), size_in_bytes,
                               context_.queue, dir));
}

void CambriconComputingContext::memset(void* dst, int value,
                                       size_t size_in_bytes) {
    cnrt_check(cnrtSyncQueue(context_.queue));
    cnrt_check(cnrtMemset(dst, value, size_in_bytes));
}

void CambriconComputingContext::synchronize() {
    cnrt_check(cnrtSyncQueue(context_.queue));
}

// vim: syntax=cpp.doxygen
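Unlike the Atlas context, device-to-device copies here are synchronous: the queue is drained first and the blocking cnrtMemcpy is used, while H2D/D2H copies are enqueued asynchronously. A caller-side sketch (the buffer names are assumptions):

    ctx.memcpy(dev_dst, dev_src, nbytes, megcoreMemcpyDeviceToDevice);  // blocking
    ctx.memcpy(host_dst, dev_src, nbytes, megcoreMemcpyDeviceToHost);   // async
    ctx.synchronize();  // needed before host_dst can be read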
@@ -0,0 +1,44 @@
/**
 * \file dnn/src/cambricon/megcore/cambricon_computing_context.hpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#pragma once

#include "megcore_cambricon.h"
#include "src/common/megcore/common/computing_context.hpp"

namespace megcore {
namespace cambricon {

class CambriconComputingContext final : public ComputingContext {
public:
    CambriconComputingContext(megcoreDeviceHandle_t dev_handle,
                              unsigned int flags,
                              const CambriconContext& ctx = {});
    ~CambriconComputingContext();

    void memcpy(void* dst, const void* src, size_t size_in_bytes,
                megcoreMemcpyKind_t kind) override;
    void memset(void* dst, int value, size_t size_in_bytes) override;
    void synchronize() override;

    const CambriconContext& context() const { return context_; }

    cnrtQueue_t queue() const { return context().queue; }

private:
    bool own_queue;
    CambriconContext context_;
};

}  // namespace cambricon
}  // namespace megcore

// vim: syntax=cpp.doxygen
@@ -0,0 +1,80 @@ | |||
/** | |||
* \file dnn/src/cambricon/megcore/cambricon_device_context.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
*/ | |||
#include "megcore.h" | |||
#include "src/cambricon/utils.h" | |||
#include "src/common/utils.h" | |||
#include "src/cambricon/megcore/cambricon_device_context.hpp" | |||
#define STR_HELPER(x) #x | |||
#define STR(x) STR_HELPER(x) | |||
#define CNRT_VERSION_STR \ | |||
STR(CNRT_MAJOR_VERSION) \ | |||
"." STR(CNRT_MINOR_VERSION) "." STR(CNRT_PATCH_VERSION) | |||
#pragma message "compile with cnrt " CNRT_VERSION_STR " " | |||
#undef STR_HELPER | |||
#undef STR | |||
using namespace megcore; | |||
using namespace cambricon; | |||
CambriconDeviceContext::CambriconDeviceContext(int device_id, | |||
unsigned int flags, | |||
bool global_initialized) | |||
: DeviceContext(megcorePlatformCambricon, device_id, flags) { | |||
if (!global_initialized) | |||
init_status.init(); | |||
unsigned int version; | |||
cnrt_check(cnrtGetVersion(&version)); | |||
megdnn_assert(version == CNRT_VERSION, | |||
"megcore compiled with cnrt %d, get %d at runtime", | |||
CNRT_VERSION, version); | |||
unsigned int dev_num; | |||
cnrt_check(cnrtGetDeviceCount(&dev_num)); | |||
MEGDNN_MARK_USED_VAR(dev_num); | |||
// check validity of device_id | |||
megdnn_assert(device_id >= 0 && | |||
static_cast<unsigned int>(device_id) < dev_num); | |||
cnrt_check(cnrtGetDeviceInfo(&device_info, device_id)); | |||
} | |||
CambriconDeviceContext::~CambriconDeviceContext() noexcept = default; | |||
size_t CambriconDeviceContext::mem_alignment_in_bytes() const noexcept { | |||
return 1; | |||
} | |||
void CambriconDeviceContext::activate() { | |||
int id = device_id(); | |||
cnrtDev_t dev; | |||
cnrt_check(cnrtGetDeviceHandle(&dev, id)); | |||
cnrt_check(cnrtSetCurrentDevice(dev)); | |||
} | |||
void* CambriconDeviceContext::malloc(size_t size_in_bytes) { | |||
void* ptr; | |||
cnrt_check(cnrtMalloc(&ptr, size_in_bytes)); | |||
return ptr; | |||
} | |||
void CambriconDeviceContext::free(void* ptr) { | |||
cnrt_check(cnrtFree(ptr)); | |||
} | |||
CambriconDeviceContext::InitStatus CambriconDeviceContext::init_status; | |||
// vim: syntax=cpp.doxygen | |||
@@ -0,0 +1,63 @@ | |||
/** | |||
* \file dnn/src/cambricon/megcore/cambricon_device_context.hpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
*/ | |||
#pragma once | |||
#include <mutex> | |||
#include "megcore_cambricon.h" | |||
#include "src/common/megcore/common/device_context.hpp" | |||
#include "src/common/utils.h" | |||
namespace megcore { | |||
namespace cambricon { | |||
class CambriconDeviceContext : public DeviceContext { | |||
public: | |||
CambriconDeviceContext(int device_id, unsigned int flags, | |||
bool global_initialized = false); | |||
~CambriconDeviceContext() noexcept; | |||
size_t mem_alignment_in_bytes() const noexcept override; | |||
void activate() override; | |||
void* malloc(size_t size_in_bytes) override; | |||
void free(void* ptr) override; | |||
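//! guards one-time global cnrtInit()/cnrtDestroy(); init() is | |||
//! thread-safe and idempotent | |||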
struct InitStatus { | |||
bool initialized; | |||
std::mutex mtx; | |||
InitStatus() : initialized{false} {} | |||
void init() { | |||
std::lock_guard<std::mutex> guard{mtx}; | |||
if (!initialized) { | |||
auto cnrt_err = cnrtInit(0); | |||
initialized = cnrt_err == CNRT_RET_SUCCESS; | |||
megdnn_assert(initialized, "cnrt initialize failed: (cnrt:%d)", | |||
static_cast<int>(cnrt_err)); | |||
} | |||
} | |||
~InitStatus() { | |||
if (initialized) { | |||
cnrtDestroy(); | |||
initialized = false; | |||
} | |||
} | |||
}; | |||
static InitStatus init_status; | |||
private: | |||
cnrtDeviceInfo_t device_info; | |||
}; | |||
} // namespace cambricon | |||
} // namespace megcore | |||
// vim: syntax=cpp.doxygen | |||
@@ -0,0 +1,59 @@ | |||
/** | |||
* \file dnn/src/cambricon/megcore/public_api/computing.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
*/ | |||
#include "megcore_cambricon.h" | |||
#include "src/cambricon/megcore/cambricon_computing_context.hpp" | |||
#include "src/cambricon/megcore/cambricon_device_context.hpp" | |||
#include "src/common/megcore/public_api/computing.hpp" | |||
#include "src/common/megcore/public_api/device.hpp" | |||
#include "src/common/utils.h" | |||
using namespace megcore; | |||
megcoreStatus_t megcore::createDeviceHandleWithGlobalInitStatus( | |||
megcoreDeviceHandle_t* devHandle, int deviceID, unsigned int flags, | |||
bool global_initialized) { | |||
auto content = megdnn::make_unique<cambricon::CambriconDeviceContext>( | |||
deviceID, flags, global_initialized); | |||
auto& ctx = *devHandle; | |||
ctx = new megcoreDeviceContext; | |||
ctx->content = std::move(content); | |||
return megcoreSuccess; | |||
} | |||
megcoreStatus_t megcore::createComputingHandleWithCambriconContext( | |||
megcoreComputingHandle_t* compHandle, megcoreDeviceHandle_t devHandle, | |||
unsigned int flags, const CambriconContext& ctx) { | |||
auto content = megdnn::make_unique<cambricon::CambriconComputingContext>( | |||
devHandle, flags, ctx); | |||
auto& H = *compHandle; | |||
H = new megcoreComputingContext; | |||
H->content = std::move(content); | |||
return megcoreSuccess; | |||
} | |||
megcoreStatus_t megcore::getCambriconContext(megcoreComputingHandle_t handle, | |||
CambriconContext* ctx) { | |||
auto&& H = handle; | |||
megdnn_assert(H); | |||
megcoreDeviceHandle_t dev_handle = H->content->dev_handle(); | |||
megcorePlatform_t platform; | |||
megcoreGetPlatform(dev_handle, &platform); | |||
megdnn_assert(platform == megcorePlatformCambricon); | |||
auto context = static_cast<megcore::cambricon::CambriconComputingContext*>( | |||
H->content.get()); | |||
*ctx = context->context(); | |||
return megcoreSuccess; | |||
} | |||
// vim: syntax=cpp.doxygen | |||
@@ -0,0 +1,75 @@ | |||
/** | |||
* \file dnn/src/cambricon/utils.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
*/ | |||
#include "src/cambricon/utils.h" | |||
#include "src/cambricon/utils.mlu.h" | |||
#include "src/cambricon/handle.h" | |||
#include "src/common/utils.h" | |||
#include <mutex> | |||
#include <unordered_map> | |||
using namespace megdnn; | |||
using namespace cambricon; | |||
namespace { | |||
struct DeviceInfoRecord { | |||
bool init = false; | |||
cnrtDeviceInfo_t device_info; | |||
std::mutex mtx; | |||
}; | |||
std::unordered_map<cnrtDev_t, int> dev2device_id; | |||
std::mutex dev2device_id_mtx; | |||
constexpr int MAX_NR_DEVICE = 64; | |||
DeviceInfoRecord device_info_rec[MAX_NR_DEVICE]; | |||
} // namespace | |||
void cambricon::__throw_cnrt_error__(cnrtRet_t err, const char* msg) { | |||
auto s = ssprintf("cnrt return %s(%d) occurred; expr: %s", | |||
cnrtGetErrorStr(err), int(err), msg); | |||
megdnn_throw(s.c_str()); | |||
} | |||
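// build the cnrtDev_t -> device id map lazily on first call, then cache | |||
// per-device info in device_info_rec so repeated queries avoid cnrt calls | |||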
cnrtDeviceInfo_t cambricon::current_device_info() { | |||
static bool dev2device_id_init = false; | |||
{ | |||
std::lock_guard<std::mutex> lock(dev2device_id_mtx); | |||
if (!dev2device_id_init) { | |||
unsigned int dev_num = 0; | |||
cnrt_check(cnrtGetDeviceCount(&dev_num)); | |||
for (unsigned int dev_id = 0; dev_id < dev_num; ++dev_id) { | |||
cnrtDev_t dev; | |||
cnrt_check(cnrtGetDeviceHandle(&dev, dev_id)); | |||
dev2device_id[dev] = dev_id; | |||
} | |||
dev2device_id_init = true; | |||
} | |||
} | |||
cnrtDev_t dev; | |||
cnrt_check(cnrtGetCurrentDevice(&dev)); | |||
{ | |||
std::lock_guard<std::mutex> lock(dev2device_id_mtx); | |||
int dev_id = dev2device_id.at(dev); | |||
auto& rec = device_info_rec[dev_id]; | |||
{ | |||
std::lock_guard<std::mutex> lock(rec.mtx); | |||
if (!rec.init) { | |||
cnrt_check(cnrtGetDeviceInfo(&rec.device_info, dev_id)); | |||
rec.init = true; | |||
} | |||
} | |||
return rec.device_info; | |||
} | |||
} | |||
// vim: syntax=cpp.doxygen | |||
@@ -0,0 +1,40 @@ | |||
/** | |||
* \file dnn/src/cambricon/utils.h | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
*/ | |||
#pragma once | |||
#include "megcore_cdefs.h" | |||
#include "megdnn/handle.h" | |||
#include "src/cambricon/utils.mlu.h" | |||
#include "src/common/utils.h" | |||
#include "src/cambricon/handle.h" | |||
#include <cnrt.h> | |||
namespace megdnn { | |||
namespace cambricon { | |||
static inline HandleImpl* concrete_handle(Handle* handle) { | |||
return static_cast<cambricon::HandleImpl*>(handle); | |||
} | |||
static inline cnrtQueue_t cnrt_queue(Handle* handle) { | |||
return concrete_handle(handle)->queue(); | |||
} | |||
//! get device info of current active device | |||
cnrtDeviceInfo_t current_device_info(); | |||
} // namespace cambricon | |||
} // namespace megdnn | |||
// vim: syntax=cpp.doxygen | |||
@@ -0,0 +1,42 @@ | |||
/** | |||
* \file dnn/src/cambricon/utils.mlu.h | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
*/ | |||
#pragma once | |||
#include "src/common/utils.cuh" | |||
#include <stdint.h> | |||
#include <cnrt.h> | |||
#define cnrt_check(_x) \ | |||
do { \ | |||
cnrtRet_t _ret = (_x); \ | |||
if (_ret != CNRT_RET_SUCCESS) { \ | |||
::megdnn::cambricon::__throw_cnrt_error__(_ret, #_x); \ | |||
} \ | |||
} while (0) | |||
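// usage sketch: cnrt_check(cnrtMalloc(&ptr, size)); throws with the | |||
// failing expression text if the call does not return CNRT_RET_SUCCESS | |||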
#define after_kernel_launch() \ | |||
do { \ | |||
cnrt_check(cnrtGetLastErr()); \ | |||
} while (0) | |||
namespace megdnn { | |||
namespace cambricon { | |||
//! Error handling functions | |||
MEGDNN_NORETURN void __throw_cnrt_error__(cnrtRet_t err, const char* msg); | |||
} // namespace cambricon | |||
} // namespace megdnn | |||
// vim: syntax=cpp.doxygen | |||
@@ -36,6 +36,14 @@ | |||
#endif | |||
#if MEGDNN_WITH_CAMBRICON | |||
#include "src/cambricon/handle.h" | |||
#endif | |||
#ifdef MEGDNN_WITH_ATLAS | |||
#include "src/atlas/handle.h" | |||
#endif | |||
using namespace megdnn; | |||
MIDOUT_DECL(HandlePlatform); | |||
@@ -85,6 +93,20 @@ std::unique_ptr<Handle> Handle::make(megcoreComputingHandle_t computing_handle, | |||
MIDOUT_END(); | |||
#endif | |||
} | |||
else if (platform == megcorePlatformCambricon) { | |||
#if MEGDNN_WITH_CAMBRICON | |||
return make_unique<cambricon::HandleImpl>(computing_handle); | |||
#else | |||
return nullptr; | |||
#endif | |||
} | |||
else if (platform == megcorePlatformAtlas) { | |||
#if MEGDNN_WITH_ATLAS | |||
return make_unique<atlas::HandleImpl>(computing_handle); | |||
#else | |||
return nullptr; | |||
#endif | |||
} | |||
else { | |||
// CUDA | |||
megdnn_assert_internal(platform == megcorePlatformCUDA); | |||
@@ -94,6 +116,7 @@ std::unique_ptr<Handle> Handle::make(megcoreComputingHandle_t computing_handle, | |||
return nullptr; | |||
#endif | |||
} | |||
return nullptr; | |||
} | |||
@@ -167,6 +190,12 @@ std::unique_ptr<Handle> Handle::make(megcoreComputingHandle_t computing_handle, | |||
#if MEGDNN_WITH_CUDA | |||
CASE(CUDA,cuda); | |||
#endif | |||
#if MEGDNN_WITH_ATLAS | |||
CASE(ATLAS, atlas); | |||
#endif | |||
#if MEGDNN_WITH_CAMBRICON | |||
CASE(CAMBRICON, cambricon); | |||
#endif | |||
default: | |||
megdnn_throw(megdnn_mangle("bad handle type")); | |||
} | |||
@@ -18,6 +18,14 @@ | |||
#endif | |||
#if MEGDNN_WITH_CAMBRICON | |||
#include "src/cambricon/megcore/cambricon_computing_context.hpp" | |||
#endif | |||
#if MEGDNN_WITH_ATLAS | |||
#include "src/atlas/megcore/computing_context.hpp" | |||
#endif | |||
using namespace megcore; | |||
using namespace megdnn; | |||
@@ -33,6 +41,15 @@ std::unique_ptr<ComputingContext> ComputingContext::make( | |||
case megcorePlatformCUDA: | |||
return make_unique<cuda::CUDAComputingContext>(dev_handle, flags); | |||
#endif | |||
#if MEGDNN_WITH_CAMBRICON | |||
case megcorePlatformCambricon: | |||
return make_unique<cambricon::CambriconComputingContext>(dev_handle, | |||
flags); | |||
#endif | |||
#if MEGDNN_WITH_ATLAS | |||
case megcorePlatformAtlas: | |||
return make_unique<atlas::AtlasComputingContext>(dev_handle, flags); | |||
#endif | |||
default: | |||
megdnn_throw("bad platform"); | |||
} | |||
@@ -15,6 +15,13 @@ | |||
#if MEGDNN_WITH_CUDA | |||
#include "src/cuda/megcore/cuda_device_context.hpp" | |||
#endif | |||
#if MEGDNN_WITH_CAMBRICON | |||
#include "src/cambricon/megcore/cambricon_device_context.hpp" | |||
#endif | |||
#if MEGDNN_WITH_ATLAS | |||
#include "src/atlas/megcore/device_context.hpp" | |||
#endif | |||
using namespace megcore; | |||
using namespace megdnn; | |||
@@ -29,6 +36,15 @@ std::unique_ptr<DeviceContext> DeviceContext::make(megcorePlatform_t platform, | |||
case megcorePlatformCUDA: | |||
return make_unique<cuda::CUDADeviceContext>(deviceID, flags); | |||
#endif | |||
#if MEGDNN_WITH_CAMBRICON | |||
case megcorePlatformCambricon: | |||
return make_unique<cambricon::CambriconDeviceContext>(deviceID, | |||
flags); | |||
#endif | |||
#if MEGDNN_WITH_ATLAS | |||
case megcorePlatformAtlas: | |||
return make_unique<atlas::AtlasDeviceContext>(deviceID, flags); | |||
#endif | |||
default: | |||
megdnn_throw("bad platform"); | |||
} | |||
@@ -26,6 +26,16 @@ if(MGE_WITH_CUDA) | |||
endif() | |||
if(MGE_WITH_CAMBRICON) | |||
file(GLOB_RECURSE SOURCES_ cambricon/*.cpp) | |||
list(APPEND SOURCES ${SOURCES_}) | |||
endif() | |||
if(MGE_WITH_ATLAS) | |||
file(GLOB_RECURSE SOURCES_ atlas/*.cpp) | |||
list(APPEND SOURCES ${SOURCES_}) | |||
endif() | |||
add_executable(megdnn_test ${SOURCES}) | |||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing") | |||
@@ -0,0 +1,74 @@ | |||
/** | |||
* \file dnn/test/atlas/checksum.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#include "megdnn/oprs.h" | |||
#include "test/atlas/fixture.h" | |||
#include "test/common/checker.h" | |||
using namespace megdnn; | |||
using namespace test; | |||
TEST_F(ATLAS, CHECKSUM_FORWARD) { | |||
auto atlas_opr = handle_atlas()->create_operator<megdnn::Checksum>(), | |||
naive_opr = handle_naive()->create_operator<megdnn::Checksum>(); | |||
std::mt19937 rng(std::random_device{}()); | |||
for (size_t size : | |||
{3, 8, 4 * 4 * 1024, 12345, 1024 * 1024, 1024 * 1024 * 10}) { | |||
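// round the buffer size up to the next multiple of 512 bytes | |||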
auto aligned_size = size + ((512 - size % 512) % 512); | |||
auto run = [&](megdnn::Checksum* opr, void* ptr, bool log_size) { | |||
TensorND tensor; | |||
tensor.raw_ptr = ptr; | |||
tensor.layout.init_contiguous_stride({size}); | |||
tensor.layout.dtype = dtype::Byte(); | |||
WorkspaceWrapper workspace( | |||
handle_atlas(), opr->get_workspace_in_bytes(tensor.layout)); | |||
if (log_size) { | |||
printf("checksum(%zu): workspace=%zu\n", size, | |||
workspace.workspace().size); | |||
} | |||
return opr->exec(tensor, workspace.workspace()); | |||
}; | |||
std::vector<uint8_t> buf(aligned_size); | |||
for (size_t i = 0; i < size; ++i) | |||
buf[i] = 1; | |||
auto run_offsset = [&](size_t offset) { | |||
void* dev_ptr = megdnn_malloc(handle_atlas(), buf.size() + offset); | |||
void* dev_buf = static_cast<char*>(dev_ptr) + offset; | |||
Checksum::Result res_cambricon[2], res_naive[2]; | |||
for (int change_last = 0; change_last < 2; ++change_last) { | |||
if (change_last) | |||
++buf[size - 1]; | |||
megdnn_memcpy_H2D(handle_atlas(), dev_buf, buf.data(), size); | |||
res_cambricon[change_last] = | |||
run(atlas_opr.get(), dev_buf, !change_last); | |||
res_naive[change_last] = | |||
run(naive_opr.get(), buf.data(), false); | |||
} | |||
megdnn_free(handle_atlas(), dev_ptr); | |||
ASSERT_EQ(res_naive[0], res_cambricon[0]) | |||
<< "failed for size " << size; | |||
ASSERT_EQ(res_naive[1], res_cambricon[1]); | |||
ASSERT_NE(res_cambricon[0], res_cambricon[1]); | |||
}; | |||
for (size_t i = 0; i < 8; ++i) { | |||
run_offsset(i); | |||
} | |||
} | |||
} | |||
// vim: syntax=cpp.doxygen |
@@ -0,0 +1,55 @@ | |||
/** | |||
* \file dnn/test/atlas/fixture.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#include "test/atlas/fixture.h" | |||
#include "src/atlas/handle.h" | |||
#include "src/atlas/megcore/device_context.hpp" | |||
#include "src/atlas/utils.h" | |||
#include "test/common/memory_manager.h" | |||
#include "test/common/random_state.h" | |||
#include "test/common/utils.h" | |||
#include "acl/acl.h" | |||
#include <cstdlib> | |||
using namespace megdnn; | |||
using namespace test; | |||
void ATLAS::SetUp() { | |||
RandomState::reset(); | |||
// use card 0 | |||
megcore_check( | |||
megcoreCreateDeviceHandle(&m_dev_handle, megcorePlatformAtlas, 0)); | |||
megcoreActivate(m_dev_handle); | |||
megcoreComputingHandle_t comp_handle; | |||
megcore_check(megcoreCreateComputingHandle(&comp_handle, m_dev_handle)); | |||
m_handle_atlas = Handle::make(comp_handle); | |||
megdnn_assert(m_handle_atlas); | |||
} | |||
Handle* ATLAS::handle_naive() { | |||
if (!m_handle_naive) | |||
m_handle_naive = create_cpu_handle(2); | |||
return m_handle_naive.get(); | |||
} | |||
void ATLAS::TearDown() { | |||
m_handle_naive.reset(); | |||
m_handle_atlas.reset(); | |||
MemoryManagerHolder::instance()->clear(); | |||
megcoreDeactivate(m_dev_handle); | |||
} | |||
// vim: syntax=cpp.doxygen |
@@ -0,0 +1,40 @@ | |||
/** | |||
* \file dnn/test/atlas/fixture.h | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#pragma once | |||
#include <gtest/gtest.h> | |||
#include "megcore_cdefs.h" | |||
#include "megdnn/handle.h" | |||
#include <memory> | |||
namespace megdnn { | |||
namespace test { | |||
class ATLAS : public ::testing::Test { | |||
public: | |||
void SetUp() override; | |||
void TearDown() override; | |||
Handle* handle_atlas() { return m_handle_atlas.get(); } | |||
Handle* handle_naive(); | |||
private: | |||
std::unique_ptr<Handle> m_handle_naive; | |||
std::unique_ptr<Handle> m_handle_atlas; | |||
megcoreDeviceHandle_t m_dev_handle; | |||
}; | |||
} // namespace test | |||
} // namespace megdnn | |||
// vim: syntax=cpp.doxygen |
@@ -0,0 +1,78 @@ | |||
/** | |||
* \file dnn/test/cambricon/checksum.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
*/ | |||
#include "megdnn/oprs.h" | |||
#include "test/cambricon/fixture.h" | |||
#include "test/common/checker.h" | |||
using namespace megdnn; | |||
using namespace test; | |||
TEST_F(CAMBRICON, CHECKSUM_FORWARD) { | |||
auto cambricon_opr = | |||
handle_cambricon()->create_operator<megdnn::Checksum>(), | |||
naive_opr = handle_naive()->create_operator<megdnn::Checksum>(); | |||
std::mt19937 rng(std::random_device{}()); | |||
for (size_t size : | |||
{3, 8, 4 * 4 * 1024, 12345, 1024 * 1024, 1024 * 1024 * 10}) { | |||
auto aligned_size = size + ((512 - size % 512) % 512); | |||
auto run = [&](megdnn::Checksum* opr, void* ptr, bool log_size) { | |||
TensorND tensor; | |||
tensor.raw_ptr = ptr; | |||
tensor.layout.init_contiguous_stride({size}); | |||
tensor.layout.dtype = dtype::Byte(); | |||
WorkspaceWrapper workspace( | |||
handle_cambricon(), | |||
opr->get_workspace_in_bytes(tensor.layout)); | |||
if (log_size) { | |||
printf("checksum(%zu): workspace=%zu\n", size, | |||
workspace.workspace().size); | |||
} | |||
return opr->exec(tensor, workspace.workspace()); | |||
}; | |||
std::vector<uint8_t> buf(aligned_size); | |||
for (size_t i = 0; i < size; ++i) | |||
buf[i] = 1; | |||
auto run_offset = [&](size_t offset) { | |||
void* dev_ptr = | |||
megdnn_malloc(handle_cambricon(), buf.size() + offset); | |||
void* dev_buf = static_cast<char*>(dev_ptr) + offset; | |||
Checksum::Result res_cambricon[2], res_naive[2]; | |||
for (int change_last = 0; change_last < 2; ++change_last) { | |||
if (change_last) | |||
++buf[size - 1]; | |||
megdnn_memcpy_H2D(handle_cambricon(), dev_buf, buf.data(), | |||
size); | |||
res_cambricon[change_last] = | |||
run(cambricon_opr.get(), dev_buf, !change_last); | |||
res_naive[change_last] = | |||
run(naive_opr.get(), buf.data(), false); | |||
} | |||
megdnn_free(handle_cambricon(), dev_ptr); | |||
ASSERT_EQ(res_naive[0], res_cambricon[0]) | |||
<< "failed for size " << size; | |||
ASSERT_EQ(res_naive[1], res_cambricon[1]); | |||
ASSERT_NE(res_cambricon[0], res_cambricon[1]); | |||
}; | |||
for (size_t i = 0; i < 8; ++i) { | |||
run_offset(i); | |||
} | |||
} | |||
} | |||
// vim: syntax=cpp.doxygen | |||
@@ -0,0 +1,51 @@ | |||
/** | |||
* \file dnn/test/cambricon/fixture.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
*/ | |||
#include "test/cambricon/fixture.h" | |||
#include "src/cambricon/handle.h" | |||
#include "src/cambricon/utils.h" | |||
#include "test/common/memory_manager.h" | |||
#include "test/common/random_state.h" | |||
#include "test/common/utils.h" | |||
#include <cnrt.h> | |||
#include <cstdlib> | |||
using namespace megdnn; | |||
using namespace test; | |||
void CAMBRICON::SetUp() { | |||
RandomState::reset(); | |||
megcoreDeviceHandle_t dev_handle; | |||
// use card 0 | |||
megcore_check(megcoreCreateDeviceHandle(&dev_handle, | |||
megcorePlatformCambricon, 0)); | |||
megcoreComputingHandle_t comp_handle; | |||
megcore_check(megcoreCreateComputingHandle(&comp_handle, dev_handle)); | |||
m_handle_cambricon = Handle::make(comp_handle); | |||
megdnn_assert(m_handle_cambricon); | |||
} | |||
Handle* CAMBRICON::handle_naive() { | |||
if (!m_handle_naive) | |||
m_handle_naive = create_cpu_handle(2); | |||
return m_handle_naive.get(); | |||
} | |||
void CAMBRICON::TearDown() { | |||
m_handle_naive.reset(); | |||
m_handle_cambricon.reset(); | |||
MemoryManagerHolder::instance()->clear(); | |||
} | |||
// vim: syntax=cpp.doxygen | |||
@@ -0,0 +1,40 @@ | |||
/** | |||
* \file dnn/test/cambricon/fixture.h | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
*/ | |||
#pragma once | |||
#include <gtest/gtest.h> | |||
#include "test/common/fix_gtest_on_platforms_without_exception.inl" | |||
#include "megcore_cdefs.h" | |||
#include "megdnn/handle.h" | |||
#include <memory> | |||
namespace megdnn { | |||
namespace test { | |||
class CAMBRICON : public ::testing::Test { | |||
public: | |||
void SetUp() override; | |||
void TearDown() override; | |||
Handle* handle_cambricon() { return m_handle_cambricon.get(); } | |||
Handle* handle_naive(); | |||
private: | |||
std::unique_ptr<Handle> m_handle_naive; | |||
std::unique_ptr<Handle> m_handle_cambricon; | |||
}; | |||
} // namespace test | |||
} // namespace megdnn | |||
// vim: syntax=cpp.doxygen | |||
@@ -0,0 +1,54 @@ | |||
# -*- coding: utf-8 -*- | |||
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
# | |||
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
# | |||
# Unless required by applicable law or agreed to in writing, | |||
# software distributed under the License is distributed on an | |||
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
# pylint: disable=too-many-lines | |||
from typing import List | |||
import megengine._internal as mgb | |||
from ..core import Tensor, wrap_io_tensor | |||
@wrap_io_tensor | |||
def cambricon_subgraph( | |||
inputs: List[Tensor], data: bytes, symbol: str, tensor_dim_mutable: bool, | |||
) -> List[Tensor]: | |||
"""Load a serialized Cambricon subgraph (i.e. cnrtModel_t) and | |||
execute the operations defined in the subgraph. | |||
:param inputs: List of input tensors of the subgraph. | |||
:param data: The serialized subgraph. | |||
:param symbol: The name of the function in the subgraph. | |||
The function corresponds to a cnmlFusionOp | |||
which is added to the cnmlModel_t/cnrtModel_t. | |||
:param tensor_dim_mutable: Whether the input tensors' shapes are mutable | |||
in the cnrtModel_t. | |||
""" | |||
return mgb.opr.cambricon_runtime( | |||
data, symbol, tuple(map(lambda x: x._symvar, inputs)), tensor_dim_mutable | |||
) | |||
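# minimal usage sketch (assumes ``data`` holds a serialized cnrt model and | |||
# ``inp`` is a Tensor on a cambricon device): | |||
#   outs = cambricon_subgraph([inp], data, "subnet0", True) | |||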
@wrap_io_tensor | |||
def extern_opr_subgraph( | |||
inputs, output_shapes: List[tuple], dump_name: str, dump_data: bytes, | |||
) -> List[Tensor]: | |||
"""Load a serialized extern opr subgraph and fake execute the operator | |||
:param inputs: Tensor or list of input tensors. | |||
:param output_shapes: The output shapes. | |||
:param dump_name: The serialized subgraph name. | |||
:param dump_data: The serialized subgraph. | |||
:return: List of tensors | |||
""" | |||
if not isinstance(inputs, list): | |||
inputs = [inputs] | |||
return mgb.opr.extern_c_opr_placeholder( | |||
inputs, output_shapes, dump_name=dump_name, dump_data=dump_data, | |||
) |
@@ -0,0 +1,56 @@ | |||
# -*- coding: utf-8 -*- | |||
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
# | |||
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
# | |||
# Unless required by applicable law or agreed to in writing, | |||
# software distributed under the License is distributed on an | |||
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
import numpy as np | |||
from ..functional.external import cambricon_subgraph, extern_opr_subgraph | |||
from .module import Module | |||
class CambriconSubgraph(Module): | |||
r"""Load a serialized Cambricon subgraph. | |||
See :func:`~.cambricon_subgraph` for more details. | |||
""" | |||
def __init__( | |||
self, data, symbol, tensor_dim_mutable, | |||
): | |||
super(CambriconSubgraph, self).__init__() | |||
self._data = data | |||
self.symbol = symbol | |||
self.tensor_dim_mutable = tensor_dim_mutable | |||
@property | |||
def data(self): | |||
return self._data.tobytes() | |||
@data.setter | |||
def data(self, val): | |||
self._data = np.frombuffer(val, dtype=np.uint8) | |||
def forward(self, inputs): | |||
outputs = cambricon_subgraph( | |||
inputs, self._data, self.symbol, self.tensor_dim_mutable, | |||
) | |||
return outputs | |||
class ExternOprSubgraph(Module): | |||
r"""Load a serialized extern opr subgraph. | |||
""" | |||
def __init__(self, data, name, output_shapes): | |||
super(ExternOprSubgraph, self).__init__() | |||
self.data = data | |||
self.name = name | |||
self.output_shapes = output_shapes | |||
def forward(self, inputs): | |||
outputs = extern_opr_subgraph(inputs, self.output_shapes, self.name, self.data,) | |||
return outputs |
@@ -246,6 +246,27 @@ SymbolVarArray _Opr::tensor_rt_runtime(const SymbolVarArray& inputs, | |||
} | |||
#endif | |||
#if MGB_ATLAS | |||
#include "megbrain/opr/atlas_runtime_op.h" | |||
SymbolVarArray _Opr::atlas_runtime(const SymbolVarArray& inputs, | |||
PyObject* data_bytes, | |||
const OperatorNodeConfig& config) { | |||
mgb_assert(PyBytes_Check(data_bytes)); | |||
auto size = PyBytes_Size(data_bytes); | |||
mgb_assert(size, "atlas data bytes should not be empty"); | |||
return opr::AtlasRuntimeOpr::make(PyBytes_AsString(data_bytes), size, | |||
inputs, config); | |||
} | |||
#else | |||
SymbolVarArray _Opr::atlas_runtime(const SymbolVarArray& inputs, | |||
PyObject* data_bytes, | |||
const OperatorNodeConfig& config) { | |||
mgb_throw(MegBrainError, "Atlas disabled at compile time"); | |||
} | |||
#endif | |||
SymbolVar _Opr::timestamp(SymbolVar input, PyObject* dest, size_t dest_off, | |||
const OperatorNodeConfig& config) { | |||
@@ -266,4 +287,27 @@ SymbolVar _Opr::virtual_dep(const SymbolVarArray& symvars, | |||
} | |||
#if MGB_CAMBRICON | |||
#include "megbrain/cambricon/cambricon_runtime_opr.h" | |||
SymbolVarArray _Opr::cambricon_runtime(PyObject* data_bytes, const char* symbol, | |||
const SymbolVarArray& inputs, | |||
bool tensor_dim_mutable, | |||
const OperatorNodeConfig& config) { | |||
mgb_assert(PyBytes_Check(data_bytes)); | |||
auto size = PyBytes_Size(data_bytes); | |||
mgb_assert(size, "cambricon data bytes should not be empty"); | |||
return opr::CambriconRuntimeOpr::make(PyBytes_AsString(data_bytes), size, | |||
symbol, inputs, tensor_dim_mutable, | |||
config); | |||
} | |||
#else | |||
SymbolVarArray _Opr::cambricon_runtime(PyObject* data_bytes, const char* symbol, | |||
const SymbolVarArray& inputs, | |||
bool tensor_dim_mutable, | |||
const OperatorNodeConfig& config) { | |||
mgb_throw(MegBrainError, "Cambricon disabled at compile time"); | |||
} | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -130,6 +130,16 @@ static SymbolVar virtual_loss(const SymbolVarArray& ys, | |||
static SymbolVar virtual_dep(const SymbolVarArray& symvars, | |||
const OperatorNodeConfig& config); | |||
static SymbolVarArray atlas_runtime(const SymbolVarArray& inputs, | |||
PyObject* data_bytes, | |||
const OperatorNodeConfig& config); | |||
static SymbolVarArray cambricon_runtime(PyObject* data_bytes, | |||
const char* symbol, | |||
const SymbolVarArray& inputs, | |||
bool tensor_dim_mutable, | |||
const OperatorNodeConfig& config); | |||
#ifdef SWIG | |||
%pythoncode { | |||
@@ -0,0 +1,43 @@ | |||
# -*- coding: utf-8 -*- | |||
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
# | |||
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
# | |||
# Unless required by applicable law or agreed to in writing, | |||
# software distributed under the License is distributed on an | |||
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
import os | |||
import numpy as np | |||
import megengine as mge | |||
from megengine import tensor | |||
from megengine.module import Module | |||
from megengine.module.external import CambriconSubgraph | |||
class MyModule(Module): | |||
def __init__(self, data): | |||
super().__init__() | |||
self.cambricon = CambriconSubgraph(data, "subnet0", True) | |||
def forward(self, inputs): | |||
out = self.cambricon(inputs) | |||
return out | |||
def test_cambricon_module(): | |||
model = "CambriconRuntimeOprTest.MutableBatchSize.mlu" | |||
model = os.path.join(os.path.dirname(__file__), model) | |||
with open(model, "rb") as f: | |||
data = f.read() | |||
m = MyModule(data) | |||
inputs = [] | |||
inputs.append(tensor(dtype=np.float16, device="cambricon0")) | |||
inputs[0].set_value(np.random.normal(size=(1, 64, 32, 32)).astype(np.float16)) | |||
def inference(inps): | |||
pred = m(inps) | |||
return pred | |||
pred = inference(inputs) |
@@ -33,6 +33,14 @@ if(MGE_WITH_CUDA AND MGE_WITH_TRT) | |||
list(APPEND SOURCES ${SOURCES_}) | |||
endif() | |||
if(MGE_WITH_CAMBRICON) | |||
list(APPEND MGB_INC ${CMAKE_CURRENT_LIST_DIR}/cambricon/include) | |||
file(GLOB_RECURSE SOURCES_ cambricon/impl/*.cpp cambricon/impl/*.inl) | |||
list(APPEND SOURCES ${SOURCES_}) | |||
endif() | |||
set(MGB_CAMBRICON ${MGE_WITH_CAMBRICON}) | |||
set(MGB_ATLAS ${MGE_WITH_ATLAS}) | |||
if(MGE_WITH_CUDA) | |||
file(GLOB_RECURSE SOURCES_ opr/impl/standalone/*.cu) | |||
@@ -77,6 +85,8 @@ if(MGE_WITH_DISTRIBUTED) | |||
target_link_libraries (megbrain PRIVATE megray) | |||
endif() | |||
target_link_libraries(megbrain PRIVATE ${MGE_CUDA_LIBS}) | |||
target_link_libraries(megbrain PUBLIC ${MGE_CAMBRICON_LIBS}) | |||
target_link_libraries(megbrain PUBLIC ${MGE_ATLAS_LIBS}) | |||
if(MGE_WITH_JIT AND MGE_WITH_HALIDE) | |||
target_link_libraries(megbrain PRIVATE libhalide) | |||
target_link_libraries(megbrain PRIVATE ${HALIDE_LLVM_LIBS}) | |||
@@ -0,0 +1,320 @@ | |||
/** | |||
* \file src/cambricon/impl/cambricon_runtime_opr.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
*/ | |||
#include "megbrain/cambricon/cambricon_runtime_opr.h" | |||
#include "megbrain/common.h" | |||
#if MGB_CAMBRICON | |||
using namespace mgb; | |||
using namespace opr; | |||
namespace { | |||
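// helpers converting shapes and data types between MegBrain and cnrt | |||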
SmallVector<int> mgb_shape_to_cnrt_shape(TensorShape mgb_shp) { | |||
int ndim = mgb_shp.ndim; | |||
SmallVector<int> cnrt_shp(ndim); | |||
for (int i = 0; i < ndim; ++i) { | |||
cnrt_shp[i] = mgb_shp[i]; | |||
} | |||
return cnrt_shp; | |||
} | |||
TensorShape cnrt_shape_to_mgb_shape(int* dim_values, int dim_num) { | |||
TensorShape ret; | |||
ret.ndim = dim_num; | |||
for (int i = 0; i < dim_num; ++i) { | |||
ret[i] = dim_values[i]; | |||
} | |||
return ret; | |||
} | |||
DType cnrt_dtype_to_mgb_dtype(cnrtDataType_t data_type) { | |||
switch (data_type) { | |||
case CNRT_FLOAT16: | |||
#if !MEGDNN_DISABLE_FLOAT16 | |||
return dtype::Float16(); | |||
#else | |||
mgb_throw(MegBrainError, | |||
"Float16 support is disabled at compile time."); | |||
#endif | |||
case CNRT_FLOAT32: | |||
return dtype::Float32(); | |||
case CNRT_INT8: | |||
return dtype::QuantizedS8(1.f); | |||
case CNRT_INT16: | |||
return dtype::Int16(); | |||
case CNRT_INT32: | |||
return dtype::Int32(); | |||
case CNRT_UINT8: | |||
return dtype::Uint8(); | |||
//! TODO: check scale | |||
case CNRT_QUANT8: | |||
return dtype::QuantizedS8(1.f); | |||
default: | |||
mgb_throw(MegBrainError, | |||
"cnrtDataType %x is not supported by MegBrain.", | |||
data_type); | |||
} | |||
} | |||
cnrtDataType_t mgb_dtype_to_cnrt_dtype(DType data_type) { | |||
switch (data_type.enumv()) { | |||
#if !MEGDNN_DISABLE_FLOAT16 | |||
case DTypeEnum::Float16: | |||
return CNRT_FLOAT16; | |||
#endif | |||
case DTypeEnum::Float32: | |||
return CNRT_FLOAT32; | |||
case DTypeEnum::QuantizedS8: | |||
return CNRT_QUANT8; | |||
case DTypeEnum::Int32: | |||
return CNRT_INT32; | |||
default: | |||
mgb_throw(MegBrainError, | |||
"megbrain data type %s is not supported by cnrt.", | |||
data_type.name()); | |||
} | |||
} | |||
} // namespace | |||
/* ====================== CambriconRuntimeOpr ==================== */ | |||
MGB_DYN_TYPE_OBJ_FINAL_IMPL(CambriconRuntimeOpr); | |||
CambriconRuntimeOpr::CambriconRuntimeOpr(SharedBuffer buf, std::string symbol, | |||
const VarNodeArray& inputs, | |||
bool tensor_dim_mutable, | |||
const OperatorNodeConfig& config) | |||
: Super(inputs[0]->owner_graph(), config, "cambricon_runtime", inputs), | |||
m_buffer{std::move(buf)}, | |||
m_symbol{std::move(symbol)}, | |||
m_model{nullptr}, | |||
m_function{nullptr}, | |||
m_context{nullptr}, | |||
m_tensor_dim_mutable{tensor_dim_mutable} { | |||
mgb_assert(inputs[0]->comp_node().device_type() == | |||
CompNode::DeviceType::CAMBRICON, | |||
"CambriconRuntimeOpr can only be used on cambricon comp node; " | |||
"got %s", | |||
inputs[0]->comp_node().to_string().c_str()); | |||
for (auto i : inputs) { | |||
add_input({i}); | |||
} | |||
if (m_model == nullptr) { | |||
m_model = {new cnrtModel_t(), cnrt_intl::ModelUnloader()}; | |||
MGB_CNRT_CHECK(cnrtLoadModelFromMem( | |||
m_model.get(), | |||
reinterpret_cast<char*>(const_cast<void*>(m_buffer.data())))); | |||
} | |||
if (m_function == nullptr) { | |||
m_function = {new cnrtFunction_t(), cnrt_intl::FunctionDeleter()}; | |||
MGB_CNRT_CHECK(cnrtCreateFunction(m_function.get())); | |||
MGB_CNRT_CHECK(cnrtExtractFunction(m_function.get(), *m_model, | |||
m_symbol.c_str())); | |||
} | |||
int nr_inputs = 0; | |||
int nr_outputs = 0; | |||
int64_t* inputs_size = nullptr; | |||
int64_t* outputs_size = nullptr; | |||
MGB_CNRT_CHECK(cnrtGetInputDataSize(&inputs_size, &nr_inputs, *m_function)); | |||
mgb_assert(static_cast<size_t>(nr_inputs) == inputs.size(), | |||
"inputs size mismatch: expect=%d, got=%zu", nr_inputs, | |||
inputs.size()); | |||
MGB_CNRT_CHECK( | |||
cnrtGetOutputDataSize(&outputs_size, &nr_outputs, *m_function)); | |||
if (nr_outputs == 1) { | |||
add_output(None); | |||
} else { | |||
for (int i = 0; i < nr_outputs; ++i) { | |||
add_output(ssprintf("o%d", i)); | |||
} | |||
} | |||
add_equivalence_component<mgb::ScalarHash<const void*>>(m_buffer.data()); | |||
} | |||
void CambriconRuntimeOpr::scn_do_execute() { | |||
mgb_assert(m_function != nullptr); | |||
auto&& cnrt_env = | |||
CompNodeEnv::from_comp_node(input(0)->comp_node()).cnrt_env(); | |||
cnrt_env.activate(); | |||
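// lazily create the runtime context and bind it to this device on the | |||
// first execution | |||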
if (m_context == nullptr) { | |||
m_context = {new cnrtRuntimeContext_t(), | |||
cnrt_intl::RuntimeContextDeleter()}; | |||
MGB_CNRT_CHECK(cnrtCreateRuntimeContext(m_context.get(), *m_function, | |||
nullptr)); | |||
int dev_id = cnrt_env.device; | |||
MGB_CNRT_CHECK(cnrtSetRuntimeContextDeviceId(*m_context, dev_id)); | |||
MGB_CNRT_CHECK(cnrtInitRuntimeContext(*m_context, nullptr)); | |||
} | |||
size_t nr_inputs = input().size(), nr_outputs = output().size(); | |||
SmallVector<void*> params(nr_inputs + nr_outputs); | |||
SmallVector<cnrtParamDesc_t> param_descs(nr_inputs + nr_outputs); | |||
for (size_t i = 0; i < nr_inputs; ++i) { | |||
params[i] = input(i)->dev_tensor().raw_ptr(); | |||
MGB_CNRT_CHECK(cnrtCreateParamDesc(¶m_descs[i])); | |||
MGB_CNRT_CHECK(cnrtSetDataTypeToParamDesc( | |||
param_descs[i], mgb_dtype_to_cnrt_dtype(input(i)->dtype()))); | |||
auto dims = mgb_shape_to_cnrt_shape(input(i)->shape()); | |||
MGB_CNRT_CHECK(cnrtSetShapeToParamDesc(param_descs[i], dims.data(), | |||
static_cast<int>(dims.size()))); | |||
} | |||
for (size_t i = 0; i < nr_outputs; ++i) { | |||
params[nr_inputs + i] = output(i)->dev_tensor().raw_ptr(); | |||
MGB_CNRT_CHECK(cnrtCreateParamDesc(¶m_descs[nr_inputs + i])); | |||
MGB_CNRT_CHECK(cnrtSetDataTypeToParamDesc( | |||
param_descs[nr_inputs + i], | |||
mgb_dtype_to_cnrt_dtype(output(i)->dtype()))); | |||
auto dims = mgb_shape_to_cnrt_shape(output(i)->shape()); | |||
MGB_CNRT_CHECK(cnrtSetShapeToParamDesc(param_descs[nr_inputs + i], | |||
dims.data(), | |||
static_cast<int>(dims.size()))); | |||
} | |||
MGB_CNRT_CHECK(cnrtInvokeRuntimeContext_V2(*m_context, param_descs.data(), | |||
params.data(), cnrt_env.queue, | |||
nullptr)); | |||
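// the param descriptors only describe this invocation and can be | |||
// released once it has been enqueued on the queue | |||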
for (auto& param : param_descs) { | |||
MGB_CNRT_CHECK(cnrtDestroyParamDesc(param)); | |||
} | |||
} | |||
void CambriconRuntimeOpr::get_output_var_shape( | |||
const TensorShapeArray& inp_shape, TensorShapeArray& out_shape) const { | |||
mgb_assert(m_function != nullptr); | |||
mgb_assert(input().size() == inp_shape.size()); | |||
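// with mutable tensor dims, ask cnrt to infer output shapes from the | |||
// actual input shapes; otherwise use the shapes recorded in the | |||
// cnrtFunction_t and require the inputs to match them | |||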
if (m_tensor_dim_mutable) { | |||
cnrtParamDescArray_t input_descs, output_descs; | |||
int inp_param_num = input().size(); | |||
int out_param_num = output().size(); | |||
MGB_CNRT_CHECK(cnrtCreateParamDescArray(&input_descs, inp_param_num)); | |||
MGB_CNRT_CHECK(cnrtCreateParamDescArray(&output_descs, out_param_num)); | |||
for (int i = 0; i < inp_param_num; ++i) { | |||
MGB_CNRT_CHECK(cnrtSetDataTypeToParamDesc( | |||
input_descs[i], | |||
mgb_dtype_to_cnrt_dtype(input(i)->dtype()))); | |||
auto dims = mgb_shape_to_cnrt_shape(inp_shape[i]); | |||
MGB_CNRT_CHECK( | |||
cnrtSetShapeToParamDesc(input_descs[i], dims.data(), | |||
static_cast<int>(dims.size()))); | |||
} | |||
MGB_CNRT_CHECK(cnrtInferFunctionOutputShape(*m_function, inp_param_num, | |||
input_descs, out_param_num, | |||
output_descs)); | |||
for (int i = 0; i < out_param_num; ++i) { | |||
int* dims = nullptr; | |||
int dim_num = 0; | |||
MGB_CNRT_CHECK(cnrtGetShapeFromParamDesc(output_descs[i], &dims, | |||
&dim_num)); | |||
out_shape[i] = cnrt_shape_to_mgb_shape(dims, dim_num); | |||
} | |||
MGB_CNRT_CHECK(cnrtDestroyParamDescArray(input_descs, inp_param_num)); | |||
MGB_CNRT_CHECK(cnrtDestroyParamDescArray(output_descs, out_param_num)); | |||
} else { | |||
//! check input shape match | |||
for (size_t i = 0; i < inp_shape.size(); ++i) { | |||
int* dim_values = nullptr; | |||
int dim_num = 0; | |||
MGB_CNRT_CHECK(cnrtGetInputDataShape( | |||
&dim_values, &dim_num, static_cast<int>(i), *m_function)); | |||
auto shp_in_func = cnrt_shape_to_mgb_shape(dim_values, dim_num); | |||
auto inpshp = inp_shape[i]; | |||
MGB_MARK_USED_VAR(shp_in_func); | |||
mgb_assert( | |||
inpshp.eq_shape(shp_in_func), | |||
"input shape(%s) mismatch with that(%s) in cnrtFunction_t.", | |||
inpshp.to_string().c_str(), | |||
shp_in_func.to_string().c_str()); | |||
} | |||
//! remarks: cnrt does not provide an interface to let users manage | |||
//! workspace | |||
MGB_MARK_USED_VAR(mgb_dtype_to_cnrt_dtype); | |||
for (size_t i = 0; i < out_shape.size(); ++i) { | |||
int* dim_values = nullptr; | |||
int dim_num = 0; | |||
MGB_CNRT_CHECK(cnrtGetOutputDataShape( | |||
&dim_values, &dim_num, static_cast<int>(i), *m_function)); | |||
out_shape[i] = cnrt_shape_to_mgb_shape(dim_values, dim_num); | |||
} | |||
} | |||
} | |||
void CambriconRuntimeOpr::add_input_layout_constraint() { | |||
//! default contiguous | |||
for (auto i : input()) { | |||
i->add_layout_constraint_contiguous(); | |||
} | |||
} | |||
void CambriconRuntimeOpr::init_output_dtype() { | |||
cnrtDataType_t* inp_dtype_array = nullptr; | |||
int inp_num; | |||
MGB_CNRT_CHECK( | |||
cnrtGetInputDataType(&inp_dtype_array, &inp_num, *m_function)); | |||
for (size_t i = 0; i < input().size(); ++i) { | |||
auto dt_cnrt = cnrt_dtype_to_mgb_dtype(inp_dtype_array[i]); | |||
auto dt_inp = input(i)->dtype(); | |||
MGB_MARK_USED_VAR(dt_cnrt); | |||
MGB_MARK_USED_VAR(dt_inp); | |||
mgb_assert(dt_cnrt.valid() && dt_inp.valid() && | |||
dt_cnrt.enumv() == dt_inp.enumv(), | |||
"Input %zu's data type mismatch with that in " | |||
"cnrtFunction_t: expected %s, got %s", | |||
i, dt_cnrt.name(), dt_inp.name()); | |||
} | |||
cnrtDataType_t* out_dtype_array = nullptr; | |||
int out_num; | |||
MGB_CNRT_CHECK( | |||
cnrtGetOutputDataType(&out_dtype_array, &out_num, *m_function)); | |||
for (size_t i = 0; i < output().size(); ++i) { | |||
auto dt_cnrt = cnrt_dtype_to_mgb_dtype(out_dtype_array[i]); | |||
mgb_assert(dt_cnrt.valid(), | |||
"output dtype checking failed: invalid dtype returned."); | |||
if (dt_cnrt.enumv() == DTypeEnum::QuantizedS8) { | |||
mgb_assert(output(i)->dtype().valid(), | |||
"user should specify scale of output tensor of " | |||
"CambriconRuntimeOpr."); | |||
} | |||
if (!output(i)->dtype().valid()) | |||
output(i)->dtype(dt_cnrt); | |||
} | |||
} | |||
SymbolVarArray CambriconRuntimeOpr::make(SharedBuffer buf, std::string symbol, | |||
const SymbolVarArray& src, | |||
bool tensor_dim_mutable, | |||
const OperatorNodeConfig& config) { | |||
VarNodeArray var_node_array = cg::to_var_node_array(src); | |||
auto cambricon_runtime_opr = std::make_unique<CambriconRuntimeOpr>( | |||
std::move(buf), std::move(symbol), var_node_array, | |||
tensor_dim_mutable, config); | |||
auto ret = cg::to_symbol_var_array( | |||
src[0].node() | |||
->owner_graph() | |||
->insert_opr(std::move(cambricon_runtime_opr)) | |||
->output()); | |||
return ret; | |||
} | |||
SymbolVarArray CambriconRuntimeOpr::make(const void* buf, size_t size, | |||
std::string symbol, | |||
const SymbolVarArray& src, | |||
bool tensor_dim_mutable, | |||
const OperatorNodeConfig& config) { | |||
mgb_throw_if(!CompNode::get_device_count(CompNode::DeviceType::CAMBRICON), | |||
SystemError, | |||
"can not create CambriconRuntimeOpr when Cambricon is not " | |||
"available"); | |||
std::shared_ptr<uint8_t> shptr{new uint8_t[size], | |||
[](uint8_t* p) { delete[] p; }}; | |||
memcpy(shptr.get(), buf, size); | |||
SharedBuffer buffer{std::move(shptr), size}; | |||
return make(std::move(buffer), std::move(symbol), src, tensor_dim_mutable, | |||
config); | |||
} | |||
#endif // MGB_CAMBRICON | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,20 @@ | |||
decl_raw_opr( | |||
'cambricon_runtime', | |||
desc='create an operator that can load and run cnrt offline models', | |||
inputs=[ | |||
Doc('data_bytes', 'serialized cnrt/cnml model'), | |||
Doc('symbol', 'name of cnrt/cnml function', 'str'), | |||
Doc('inputs', 'input vars', 'list of :class:`.SymbolVar`'), | |||
Doc('tensor_dim_mutable', 'whether tensor shape is mutable in cnrt/cnml model', 'bool'), | |||
], | |||
body=[ | |||
'assert isinstance(data_bytes, bytes), ' | |||
'"data must be bytes; got {}".format(type(data_bytes))', | |||
'output = _mgb._Opr.cambricon_runtime(data_bytes, symbol, inputs, tensor_dim_mutable, config)', | |||
'cvt_result_kwargs["explode_single"] = False', | |||
], | |||
) | |||
# vim: ft=python | |||
@@ -0,0 +1,71 @@ | |||
/** | |||
* \file src/cambricon/impl/cambricon_runtime_opr.sereg.h | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
*/ | |||
#include "megbrain/cambricon/cambricon_runtime_opr.h" | |||
#include "megbrain/serialization/sereg.h" | |||
namespace mgb { | |||
namespace serialization { | |||
template <> | |||
struct OprLoadDumpImpl<opr::CambriconRuntimeOpr, 0> { | |||
static void dump(OprDumpContext& ctx, const cg::OperatorNodeBase& opr_) { | |||
auto&& opr = opr_.cast_final_safe<opr::CambriconRuntimeOpr>(); | |||
auto&& buf = opr.buffer(); | |||
ctx.dump_buf_with_len(buf.data(), buf.size()); | |||
auto&& symbol = opr.symbol(); | |||
ctx.dump_buf_with_len(symbol.data(), symbol.size()); | |||
bool tensor_dim_mutable = opr.is_tensor_dim_mutable(); | |||
ctx.dump_buf_with_len(&tensor_dim_mutable, sizeof(bool)); | |||
} | |||
static cg::OperatorNodeBase* load(OprLoadContext& ctx, | |||
const cg::VarNodeArray& inputs, | |||
const OperatorNodeConfig& config) { | |||
inputs.at(0)->comp_node().activate(); | |||
auto buf = ctx.load_shared_buf_with_len(); | |||
auto symbol = ctx.load_buf_with_len(); | |||
auto tensor_dim_mutable_storage = ctx.load_buf_with_len(); | |||
bool tensor_dim_mutable; | |||
memcpy(&tensor_dim_mutable, tensor_dim_mutable_storage.data(), | |||
sizeof(bool)); | |||
return opr::CambriconRuntimeOpr::make(std::move(buf), std::move(symbol), | |||
cg::to_symbol_var_array(inputs), | |||
tensor_dim_mutable, config) | |||
.at(0) | |||
.node() | |||
->owner_opr(); | |||
} | |||
}; | |||
} // namespace serialization | |||
namespace opr { | |||
cg::OperatorNodeBase* opr_shallow_copy_cambricon_runtime_opr( | |||
const serialization::OprShallowCopyContext& ctx, | |||
const cg::OperatorNodeBase& opr_, const VarNodeArray& inputs, | |||
const OperatorNodeConfig& config) { | |||
auto&& opr = opr_.cast_final_safe<CambriconRuntimeOpr>(); | |||
return CambriconRuntimeOpr::make(opr.buffer(), opr.symbol(), | |||
cg::to_symbol_var_array(inputs), | |||
opr.is_tensor_dim_mutable(), config) | |||
.at(0) | |||
.node() | |||
->owner_opr(); | |||
} | |||
MGB_SEREG_OPR(CambriconRuntimeOpr, 0); | |||
MGB_REG_OPR_SHALLOW_COPY(CambriconRuntimeOpr, | |||
opr_shallow_copy_cambricon_runtime_opr); | |||
} // namespace opr | |||
} // namespace mgb | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
@@ -0,0 +1,100 @@ | |||
/** | |||
* \file src/cambricon/include/megbrain/cambricon/cambricon_runtime_opr.h | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
*/ | |||
#pragma once | |||
#include "megbrain/comp_node_env.h" | |||
#include "megbrain/graph.h" | |||
#include "megbrain/serialization/file.h" | |||
#if MGB_CAMBRICON | |||
namespace mgb { | |||
namespace opr { | |||
namespace cnrt_intl { | |||
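// deleters for std::unique_ptr: release the cnrt handle and free the | |||
// heap-allocated handle holder | |||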
struct ModelUnloader { | |||
void operator()(cnrtModel_t* model) { | |||
if (model != nullptr) { | |||
MGB_CNRT_CHECK(cnrtUnloadModel(*model)); | |||
delete model; | |||
} | |||
} | |||
}; | |||
struct FunctionDeleter { | |||
void operator()(cnrtFunction_t* function) { | |||
if (function != nullptr) { | |||
MGB_CNRT_CHECK(cnrtDestroyFunction(*function)); | |||
delete function; | |||
} | |||
} | |||
}; | |||
struct RuntimeContextDeleter { | |||
void operator()(cnrtRuntimeContext_t* context) { | |||
if (context != nullptr) { | |||
MGB_CNRT_CHECK(cnrtDestroyRuntimeContext(*context)); | |||
delete context; | |||
} | |||
} | |||
}; | |||
using CnrtModelUniquePtr = std::unique_ptr<cnrtModel_t, ModelUnloader>; | |||
using CnrtFunctionUniquePtr = std::unique_ptr<cnrtFunction_t, FunctionDeleter>; | |||
using CnrtRuntimeContextUniquePtr = | |||
std::unique_ptr<cnrtRuntimeContext_t, RuntimeContextDeleter>; | |||
} // namespace cnrt_intl | |||
MGB_DEFINE_OPR_CLASS(CambriconRuntimeOpr, cg::SingleCNOutshapePureByInshapeOprBase) // { | |||
public: | |||
using CnrtModelUniquePtr = cnrt_intl::CnrtModelUniquePtr; | |||
using CnrtFunctionUniquePtr = cnrt_intl::CnrtFunctionUniquePtr; | |||
using CnrtRuntimeContextUniquePtr = cnrt_intl::CnrtRuntimeContextUniquePtr; | |||
using SharedBuffer = mgb::serialization::SharedBuffer; | |||
void scn_do_execute() override; | |||
void get_output_var_shape(const TensorShapeArray& inp_shape, | |||
TensorShapeArray& out_shape) const override; | |||
void add_input_layout_constraint() override; | |||
void init_output_dtype() override; | |||
CambriconRuntimeOpr(SharedBuffer buf, std::string symbol, | |||
const VarNodeArray& inputs, bool tensor_dim_mutable, | |||
const OperatorNodeConfig& config); | |||
const SharedBuffer& buffer() const { | |||
return m_buffer; | |||
} | |||
const std::string& symbol() const { | |||
return m_symbol; | |||
} | |||
bool is_tensor_dim_mutable() const { | |||
return m_tensor_dim_mutable; | |||
} | |||
static SymbolVarArray make(SharedBuffer buf, std::string symbol, | |||
const SymbolVarArray& src, | |||
bool tensor_dim_mutable = false, | |||
const OperatorNodeConfig& config = {}); | |||
static SymbolVarArray make(const void* buf, size_t size, std::string symbol, | |||
const SymbolVarArray& src, | |||
bool tensor_dim_mutable = false, | |||
const OperatorNodeConfig& config = {}); | |||
private: | |||
SharedBuffer m_buffer; | |||
std::string m_symbol; | |||
CnrtModelUniquePtr m_model; | |||
CnrtFunctionUniquePtr m_function; | |||
CnrtRuntimeContextUniquePtr m_context; | |||
bool m_tensor_dim_mutable; | |||
}; | |||
} // namespace opr | |||
} // namespace mgb | |||
#endif // MGB_CAMBRICON | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
@@ -0,0 +1,562 @@ | |||
/** | |||
* \file src/cambricon/test/cambricon_runtime_opr.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
*/ | |||
#include "megbrain/comp_node_env.h" | |||
#include "megbrain/opr/io.h" | |||
#include "megbrain/plugin/profiler.h" | |||
#include "megbrain/serialization/serializer.h" | |||
#include "megbrain/test/helper.h" | |||
#if MGB_CAMBRICON | |||
#include "megbrain/cambricon/cambricon_runtime_opr.h" | |||
using namespace mgb; | |||
namespace { | |||
class CnmlModelContext { | |||
public: | |||
const CompNode& cn; | |||
bool batch_size_changable; | |||
cnmlModel_t model; | |||
cnmlTensor_t conv_input_tensor, relu_output_tensor; | |||
cnmlFusionOp_t fusion_op; | |||
bool built; | |||
CnmlModelContext(const CompNode& cn, bool batch_size_changable = false) | |||
: cn{cn}, | |||
batch_size_changable{batch_size_changable}, | |||
built{false} {} | |||
~CnmlModelContext() { | |||
MGB_CNML_CHECK(cnmlDestroyTensor(&conv_input_tensor)); | |||
MGB_CNML_CHECK(cnmlDestroyTensor(&relu_output_tensor)); | |||
MGB_CNML_CHECK(cnmlDestroyFusionOp(&fusion_op)); | |||
MGB_CNML_CHECK(cnmlDestroyModel(model)); | |||
} | |||
void build() { | |||
auto&& cnrt_env = CompNodeEnv::from_comp_node(cn).cnrt_env(); | |||
cnrt_env.activate(); | |||
constexpr int core_num = 4; | |||
cnrtCoreVersion_t core_version = cnrt_env.device_info.core_version; | |||
// prepare parameters for conv (shapes, stride, dilation, padding) | |||
constexpr int dim_num = 4; | |||
const int ni = 16, ci = 64, hi = 32, wi = 32; | |||
const int no = 16, co = 64, ho = 32, wo = 32; | |||
const int kh = 3, kw = 3; | |||
const int stride_h = 1, stride_w = 1, dilation = 1; | |||
const int pad_h = 2, pad_w = 2; | |||
// element counts of filter and bias tensors | |||
int conv_filter_count = co * kh * kw * ci; | |||
int conv_bias_count = 1 * 1 * 1 * co; | |||
// prepare cpu origin data | |||
std::vector<float> conv_filter_cpu_data(conv_filter_count); | |||
std::vector<float> conv_bias_cpu_data(conv_bias_count); | |||
// seed for generating random test data | |||
unsigned int seed = time(0); | |||
// prepare filter data for conv | |||
for (int index = 0; index < conv_filter_count; ++index) { | |||
conv_filter_cpu_data[index] = | |||
((rand_r(&seed) % 200 / 200.0) - 0.5) / 2; | |||
} | |||
// prepare bias data for conv | |||
for (int index = 0; index < conv_bias_count; ++index) { | |||
conv_bias_cpu_data[index] = rand_r(&seed) % 100 / 100.0; | |||
} | |||
// prepare cpu buffer whose contents will be converted for mlu memory | |||
std::vector<int16_t> conv_bias_cpu(conv_bias_count); | |||
// convert data format for mlu computing: | |||
// convert conv bias data to float16 | |||
MGB_CNRT_CHECK(cnrtCastDataType(conv_bias_cpu_data.data(), CNRT_FLOAT32, | |||
conv_bias_cpu.data(), CNRT_FLOAT16, | |||
conv_bias_count, nullptr)); | |||
// you should set these values depending on the data or your own needs | |||
int filter_position = -6; | |||
float filter_scale = 1, filter_offset = 0; | |||
// tensor shapes (NCHW) | |||
int conv_input_shape[] = {ni, ci, hi, wi}; | |||
int conv_filter_shape[] = {co, ci, kh, kw}; | |||
int conv_bias_shape[] = {1, co, 1, 1}; | |||
int conv_output_shape[] = {no, co, ho, wo}; | |||
int relu_output_shape[] = {no, co, ho, wo}; | |||
// setup tensors | |||
// setup conv input tensor | |||
conv_input_tensor = nullptr; | |||
MGB_CNML_CHECK(cnmlCreateTensor_V2(&conv_input_tensor, CNML_TENSOR)); | |||
MGB_CNML_CHECK(cnmlSetTensorShape_V2(conv_input_tensor, dim_num, | |||
conv_input_shape, nullptr)); | |||
MGB_CNML_CHECK( | |||
cnmlSetTensorDataType(conv_input_tensor, CNML_DATA_FLOAT16)); | |||
// setup conv filter tensor | |||
cnmlTensor_t conv_filter_tensor = nullptr; | |||
MGB_CNML_CHECK(cnmlCreateTensor_V2(&conv_filter_tensor, CNML_FILTER)); | |||
MGB_CNML_CHECK(cnmlSetTensorShape_V2(conv_filter_tensor, dim_num, | |||
conv_filter_shape, nullptr)); | |||
MGB_CNML_CHECK( | |||
cnmlSetTensorDataType(conv_filter_tensor, CNML_DATA_FLOAT32)); | |||
// setup conv bias tensor | |||
cnmlTensor_t conv_bias_tensor = nullptr; | |||
MGB_CNML_CHECK(cnmlCreateTensor_V2(&conv_bias_tensor, CNML_CONST)); | |||
MGB_CNML_CHECK(cnmlSetTensorShape_V2(conv_bias_tensor, dim_num, | |||
conv_bias_shape, nullptr)); | |||
MGB_CNML_CHECK( | |||
cnmlSetTensorDataType(conv_bias_tensor, CNML_DATA_FLOAT16)); | |||
// setup conv output tensor | |||
cnmlTensor_t conv_output_tensor = nullptr; | |||
MGB_CNML_CHECK(cnmlCreateTensor_V2(&conv_output_tensor, CNML_TENSOR)); | |||
MGB_CNML_CHECK(cnmlSetTensorShape_V2(conv_output_tensor, dim_num, | |||
conv_output_shape, nullptr)); | |||
MGB_CNML_CHECK( | |||
cnmlSetTensorDataType(conv_output_tensor, CNML_DATA_FLOAT16)); | |||
// setup relu output tensor | |||
relu_output_tensor = nullptr; | |||
MGB_CNML_CHECK(cnmlCreateTensor_V2(&relu_output_tensor, CNML_TENSOR)); | |||
MGB_CNML_CHECK(cnmlSetTensorShape_V2(relu_output_tensor, dim_num, | |||
relu_output_shape, nullptr)); | |||
MGB_CNML_CHECK( | |||
cnmlSetTensorDataType(relu_output_tensor, CNML_DATA_FLOAT16)); | |||
// bind filters and bias to cnml const tensor | |||
MGB_CNML_CHECK(cnmlBindConstData_V2( | |||
conv_filter_tensor, conv_filter_cpu_data.data(), false)); | |||
MGB_CNML_CHECK(cnmlBindConstData_V2(conv_bias_tensor, | |||
conv_bias_cpu.data(), false)); | |||
// create conv param and conv op | |||
cnmlBaseOp_t conv_op; | |||
cnmlConvOpParam_t conv_param; | |||
// create relu op | |||
cnmlBaseOp_t relu_op; | |||
// setup conv param | |||
MGB_CNML_CHECK(cnmlCreateConvOpParam(&conv_param, stride_h, stride_w, | |||
dilation, dilation, pad_h, pad_w)); | |||
// setup conv operation | |||
MGB_CNML_CHECK(cnmlCreateConvOp(&conv_op, conv_param, conv_input_tensor, | |||
conv_output_tensor, conv_filter_tensor, | |||
conv_bias_tensor)); | |||
        // the quantization position/scale/offset should be chosen based on
        // the data or your own needs
int input_position = -6; | |||
float input_scale = 1, input_offset = 0; | |||
// prepare input tensor quant param for conv op | |||
cnmlQuantizedParam_t input_quant_param; | |||
MGB_CNML_CHECK(cnmlCreateQuantizedParam( | |||
&input_quant_param, input_position, input_scale, input_offset)); | |||
// setup conv op computing datatype | |||
MGB_CNML_CHECK(cnmlSetOperationComputingDataType( | |||
conv_op, conv_input_tensor, CNML_DATA_INT8, input_quant_param)); | |||
// prepare filter tensor quant param for conv op | |||
cnmlQuantizedParam_t filter_compute_quant; | |||
MGB_CNML_CHECK(cnmlCreateQuantizedParam(&filter_compute_quant, | |||
filter_position, filter_scale, | |||
filter_offset)); | |||
// setup conv op computing datatype | |||
MGB_CNML_CHECK(cnmlSetOperationComputingDataType( | |||
conv_op, conv_filter_tensor, CNML_DATA_INT8, | |||
filter_compute_quant)); | |||
// setup conv op computing layout | |||
MGB_CNML_CHECK(cnmlSetOperationComputingLayout(conv_op, CNML_NCHW)); | |||
        // setup active op using the relu function
MGB_CNML_CHECK(cnmlCreateActiveOp(&relu_op, CNML_ACTIVE_RELU, | |||
conv_output_tensor, | |||
relu_output_tensor)); | |||
        // setup fusion op: fuse the conv op and the relu op into one fusion op
MGB_CNML_CHECK(cnmlCreateFusionOp(&fusion_op)); | |||
MGB_CNML_CHECK(cnmlFuseOp(conv_op, fusion_op)); | |||
MGB_CNML_CHECK(cnmlFuseOp(relu_op, fusion_op)); | |||
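        // mark the tensor dims mutable so the compiled fusion op can accept a
        // different batch size at runtime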
MGB_CNML_CHECK(cnmlSetTensorDimMutable(conv_input_tensor, | |||
&batch_size_changable, 4)); | |||
MGB_CNML_CHECK(cnmlSetTensorDimMutable(relu_output_tensor, | |||
&batch_size_changable, 4)); | |||
// setup the input and output of the fusion op | |||
MGB_CNML_CHECK(cnmlAddFusionInput(fusion_op, conv_input_tensor)); | |||
MGB_CNML_CHECK(cnmlAddFusionOutput(fusion_op, relu_output_tensor)); | |||
// set operation corenum | |||
MGB_CNML_CHECK(cnmlSetFusionOpCorenum(fusion_op, core_num)); | |||
// set operation coreversion | |||
MGB_CNML_CHECK(cnmlSetFusionOpCoreVersion( | |||
fusion_op, static_cast<cnmlCoreVersion_t>(core_version))); | |||
// set batch size changable | |||
MGB_CNML_CHECK(cnmlSetFusionOpBatchsizeChangable(fusion_op, | |||
batch_size_changable)); | |||
// compile fusion op | |||
MGB_CNML_CHECK(cnmlCompileFusionOp_V2(fusion_op)); | |||
// delete tensors | |||
MGB_CNML_CHECK(cnmlDestroyTensor(&conv_filter_tensor)); | |||
MGB_CNML_CHECK(cnmlDestroyTensor(&conv_bias_tensor)); | |||
MGB_CNML_CHECK(cnmlDestroyTensor(&conv_output_tensor)); | |||
// delete quant param | |||
MGB_CNML_CHECK(cnmlDestroyQuantizedParam(&input_quant_param)); | |||
        // destroy filter compute quant param
MGB_CNML_CHECK(cnmlDestroyQuantizedParam(&filter_compute_quant)); | |||
// delete conv params | |||
MGB_CNML_CHECK(cnmlDestroyConvOpParam(&conv_param)); | |||
// delete base ops and fusion op | |||
MGB_CNML_CHECK(cnmlDestroyBaseOp(&conv_op)); | |||
MGB_CNML_CHECK(cnmlDestroyBaseOp(&relu_op)); | |||
built = true; | |||
} | |||
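    // serialize by saving the model to a file and reading the file back into
    // memory; CambriconRuntimeOpr consumes the raw buffer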
SmallVector<uint8_t> get_serialized_model() { | |||
if (!built) | |||
build(); | |||
MGB_CNML_CHECK(cnmlCreateModel(&model, "mlp")); | |||
MGB_CNML_CHECK(cnmlAddFusionOpToModel(model, fusion_op, "subnet0")); | |||
std::string fname = | |||
ssprintf("./output/CambriconRuntimeOprTest.%s.mlu", | |||
batch_size_changable ? "MutableBatchSize" | |||
: "ImmutableBatchSize"); | |||
MGB_CNML_CHECK(cnmlSaveModel(model, fname.c_str())); | |||
int len = 0; | |||
MGB_CNRT_CHECK(cnrtGetModelSize(fname.c_str(), &len)); | |||
SmallVector<uint8_t> buf(len); | |||
        FILE* fstream = fopen(fname.c_str(), "rb");
        mgb_assert(fstream != nullptr, "failed to open %s", fname.c_str());
        auto fstream_close = [](FILE* fp) { fclose(fp); };
        std::unique_ptr<FILE, decltype(fstream_close)> fstream_holder{
                fstream, fstream_close};
        auto nr_read = fread(buf.data(), 1, len, fstream);
        mgb_assert(static_cast<int>(nr_read) == len);
        return buf;
} | |||
void do_inference(void** input_mlu_ptrs, void** output_mlu_ptrs) { | |||
if (!built) | |||
build(); | |||
auto&& cnrt_env = CompNodeEnv::from_comp_node(cn).cnrt_env(); | |||
cnrt_env.activate(); | |||
auto&& queue = cnrt_env.queue; | |||
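        // time the fusion op with a pair of notifiers placed around it on the
        // queue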
cnrtNotifier_t start, end; | |||
MGB_CNRT_CHECK(cnrtCreateNotifier(&start)); | |||
MGB_CNRT_CHECK(cnrtCreateNotifier(&end)); | |||
MGB_CNRT_CHECK(cnrtPlaceNotifier(start, queue)); | |||
MGB_CNML_CHECK(cnmlComputeFusionOpForward_V4( | |||
fusion_op, &conv_input_tensor, input_mlu_ptrs, 1, | |||
&relu_output_tensor, output_mlu_ptrs, 1, queue, nullptr)); | |||
MGB_CNRT_CHECK(cnrtPlaceNotifier(end, queue)); | |||
MGB_CNRT_CHECK(cnrtSyncQueue(queue)); | |||
float time = 0.f; | |||
MGB_CNRT_CHECK(cnrtNotifierDuration(start, end, &time)); | |||
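        // cnrtNotifierDuration reports the interval in microseconds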
printf("inference time = %.2fs\n", time * 1e-3); | |||
MGB_CNRT_CHECK(cnrtDestroyNotifier(&start)); | |||
MGB_CNRT_CHECK(cnrtDestroyNotifier(&end)); | |||
} | |||
}; | |||
} // namespace | |||
TEST(TestCambriconRuntimeOpr, Basic) { | |||
REQUIRE_CAMBRICON_DEVICE(1); | |||
auto cn = CompNode::load("cambricon0"); | |||
CnmlModelContext ctx{cn, false}; | |||
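    // run the fusion op directly through CNML first, then replay the
    // serialized model through CambriconRuntimeOpr and compare both outputs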
    // tensor dimensions for conv input and relu output
const int ni = 16, ci = 64, hi = 32, wi = 32; | |||
const int no = 16, co = 64, ho = 32, wo = 32; | |||
    // element counts of input and output tensors
int conv_input_count = ni * hi * wi * ci; | |||
int relu_output_count = no * ho * wo * co; | |||
// prepare cpu origin data | |||
std::vector<float> conv_input_cpu_data(conv_input_count); | |||
std::vector<float> relu_output_cpu_data(relu_output_count); | |||
    // prepare random input data for conv
unsigned int seed = time(0); | |||
for (int index = 0; index < conv_input_count; ++index) { | |||
conv_input_cpu_data[index] = ((rand_r(&seed) % 100 / 100.0) - 0.5) / 2; | |||
} | |||
    // prepare cpu buffers holding the float16 data exchanged with the mlu
std::vector<int16_t> conv_input_cpu(conv_input_count); | |||
std::vector<int16_t> relu_output_cpu(relu_output_count); | |||
MGB_CNRT_CHECK(cnrtCastDataType(conv_input_cpu_data.data(), CNRT_FLOAT32, | |||
conv_input_cpu.data(), CNRT_FLOAT16, | |||
conv_input_count, nullptr)); | |||
auto mlu_deleter = [](void* p) { MGB_CNRT_CHECK(cnrtFree(p)); }; | |||
void* input_mlu_ptr; | |||
void* output_mlu_ptr; | |||
// malloc mlu mem for fusion input and output | |||
MGB_CNRT_CHECK( | |||
cnrtMalloc(&input_mlu_ptr, conv_input_count * sizeof(int16_t))); | |||
MGB_CNRT_CHECK( | |||
cnrtMalloc(&output_mlu_ptr, relu_output_count * sizeof(int16_t))); | |||
// memory copy cpu->mlu | |||
MGB_CNRT_CHECK(cnrtMemcpy(input_mlu_ptr, conv_input_cpu.data(), | |||
conv_input_count * sizeof(int16_t), | |||
CNRT_MEM_TRANS_DIR_HOST2DEV)); | |||
std::unique_ptr<void, decltype(mlu_deleter)> input_holder{input_mlu_ptr, | |||
mlu_deleter}; | |||
std::unique_ptr<void, decltype(mlu_deleter)> output_holder{output_mlu_ptr, | |||
mlu_deleter}; | |||
ctx.do_inference(&input_mlu_ptr, &output_mlu_ptr); | |||
    // copy the cnml result back: mlu->cpu
MGB_CNRT_CHECK(cnrtMemcpy(relu_output_cpu.data(), output_mlu_ptr, | |||
relu_output_count * sizeof(int16_t), | |||
CNRT_MEM_TRANS_DIR_DEV2HOST)); | |||
MGB_CNRT_CHECK(cnrtCastDataType(relu_output_cpu.data(), CNRT_FLOAT16, | |||
relu_output_cpu_data.data(), CNRT_FLOAT32, | |||
relu_output_count, nullptr)); | |||
auto buf = ctx.get_serialized_model(); | |||
std::shared_ptr<HostTensorND> input = std::make_shared<HostTensorND>( | |||
cn, TensorLayout{{ni, ci, hi, wi}, dtype::Float16()}); | |||
memcpy(reinterpret_cast<void*>(input->ptr<dt_float16>()), | |||
conv_input_cpu.data(), conv_input_count * sizeof(int16_t)); | |||
auto graph = ComputingGraph::make(); | |||
auto x = opr::Host2DeviceCopy::make(*graph, input); | |||
auto y = opr::CambriconRuntimeOpr::make(buf.data(), buf.size(), "subnet0", | |||
{x}, false)[0]; | |||
HostTensorND output(cn, {no, co, ho, wo}, dtype::Float16()); | |||
auto func = graph->compile({make_callback_copy(y, output)}); | |||
func->execute(); | |||
HostTensorND out_cnml(cn, {no, co, ho, wo}, dtype::Float32()), | |||
out_mgb(cn, {no, co, ho, wo}, dtype::Float32()); | |||
memcpy(out_cnml.ptr<float>(), relu_output_cpu_data.data(), | |||
relu_output_count * sizeof(float)); | |||
MGB_CNRT_CHECK(cnrtCastDataType( | |||
reinterpret_cast<void*>(output.ptr<dt_float16>()), CNRT_FLOAT16, | |||
out_mgb.ptr<float>(), CNRT_FLOAT32, relu_output_count, nullptr)); | |||
MGB_ASSERT_TENSOR_NEAR(out_cnml, out_mgb, 1e-4); | |||
} | |||
TEST(TestCambriconRuntimeOpr, BatchSizeChangable) { | |||
REQUIRE_CAMBRICON_DEVICE(1); | |||
auto cn = CompNode::load("cambricon0"); | |||
CnmlModelContext ctx{cn, true}; | |||
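    // the model is built with batch size 16; the blocks below feed batch
    // sizes 32 and 1 through CambriconRuntimeOpr to exercise the
    // mutable-batch path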
    // tensor dimensions for conv input and relu output
size_t ni = 16, ci = 64, hi = 32, wi = 32; | |||
size_t no = 16, co = 64, ho = 32, wo = 32; | |||
    // element counts of input and output tensors
int conv_input_count = ni * hi * wi * ci; | |||
int relu_output_count = no * ho * wo * co; | |||
// prepare cpu origin data | |||
std::vector<float> conv_input_cpu_data(conv_input_count); | |||
std::vector<float> relu_output_cpu_data(relu_output_count); | |||
    // prepare random input data for conv
unsigned int seed = time(0); | |||
for (int index = 0; index < conv_input_count; ++index) { | |||
conv_input_cpu_data[index] = ((rand_r(&seed) % 100 / 100.0) - 0.5) / 2; | |||
} | |||
    // prepare cpu buffers holding the float16 data exchanged with the mlu
std::vector<int16_t> conv_input_cpu(conv_input_count); | |||
std::vector<int16_t> relu_output_cpu(relu_output_count); | |||
MGB_CNRT_CHECK(cnrtCastDataType(conv_input_cpu_data.data(), CNRT_FLOAT32, | |||
conv_input_cpu.data(), CNRT_FLOAT16, | |||
conv_input_count, nullptr)); | |||
auto mlu_deleter = [](void* p) { MGB_CNRT_CHECK(cnrtFree(p)); }; | |||
void* input_mlu_ptr; | |||
void* output_mlu_ptr; | |||
// malloc mlu mem for fusion input and output | |||
MGB_CNRT_CHECK( | |||
cnrtMalloc(&input_mlu_ptr, conv_input_count * sizeof(int16_t))); | |||
MGB_CNRT_CHECK( | |||
cnrtMalloc(&output_mlu_ptr, relu_output_count * sizeof(int16_t))); | |||
// memory copy cpu->mlu | |||
MGB_CNRT_CHECK(cnrtMemcpy(input_mlu_ptr, conv_input_cpu.data(), | |||
conv_input_count * sizeof(int16_t), | |||
CNRT_MEM_TRANS_DIR_HOST2DEV)); | |||
std::unique_ptr<void, decltype(mlu_deleter)> input_holder{input_mlu_ptr, | |||
mlu_deleter}; | |||
std::unique_ptr<void, decltype(mlu_deleter)> output_holder{output_mlu_ptr, | |||
mlu_deleter}; | |||
ctx.do_inference(&input_mlu_ptr, &output_mlu_ptr); | |||
    // copy the cnml result back: mlu->cpu
MGB_CNRT_CHECK(cnrtMemcpy(relu_output_cpu.data(), output_mlu_ptr, | |||
relu_output_count * sizeof(int16_t), | |||
CNRT_MEM_TRANS_DIR_DEV2HOST)); | |||
MGB_CNRT_CHECK(cnrtCastDataType(relu_output_cpu.data(), CNRT_FLOAT16, | |||
relu_output_cpu_data.data(), CNRT_FLOAT32, | |||
relu_output_count, nullptr)); | |||
// cnml inference finished | |||
{ | |||
// change batch size | |||
ni = 32, no = 32; | |||
auto buf = ctx.get_serialized_model(); | |||
std::shared_ptr<HostTensorND> input = std::make_shared<HostTensorND>( | |||
cn, TensorLayout{{ni, ci, hi, wi}, dtype::Float16()}); | |||
memcpy(reinterpret_cast<void*>(input->ptr<dt_float16>()), | |||
conv_input_cpu.data(), conv_input_count * sizeof(int16_t)); | |||
memcpy(reinterpret_cast<void*>(input->ptr<dt_float16>() + | |||
conv_input_count), | |||
conv_input_cpu.data(), conv_input_count * sizeof(int16_t)); | |||
auto graph = ComputingGraph::make(); | |||
auto x = opr::Host2DeviceCopy::make(*graph, input); | |||
auto y = opr::CambriconRuntimeOpr::make(buf.data(), buf.size(), | |||
"subnet0", {x}, true)[0]; | |||
HostTensorND output(cn, {no, co, ho, wo}, dtype::Float16()); | |||
auto func = graph->compile({make_callback_copy(y, output)}); | |||
func->execute(); | |||
HostTensorND out_cnml(cn, {no, co, ho, wo}, dtype::Float32()), | |||
out_mgb(cn, {no, co, ho, wo}, dtype::Float32()); | |||
memcpy(out_cnml.ptr<float>(), relu_output_cpu_data.data(), | |||
relu_output_count * sizeof(float)); | |||
memcpy(out_cnml.ptr<float>() + relu_output_count, | |||
relu_output_cpu_data.data(), relu_output_count * sizeof(float)); | |||
MGB_CNRT_CHECK(cnrtCastDataType( | |||
reinterpret_cast<void*>(output.ptr<dt_float16>()), CNRT_FLOAT16, | |||
out_mgb.ptr<float>(), CNRT_FLOAT32, 2 * relu_output_count, | |||
nullptr)); | |||
MGB_ASSERT_TENSOR_NEAR(out_cnml, out_mgb, 1e-4); | |||
} | |||
{ | |||
// change batch size | |||
ni = 1, no = 1; | |||
conv_input_count = ni * hi * wi * ci; | |||
relu_output_count = no * ho * wo * co; | |||
auto buf = ctx.get_serialized_model(); | |||
std::shared_ptr<HostTensorND> input = std::make_shared<HostTensorND>( | |||
cn, TensorLayout{{ni, ci, hi, wi}, dtype::Float16()}); | |||
memcpy(reinterpret_cast<void*>(input->ptr<dt_float16>()), | |||
conv_input_cpu.data(), conv_input_count * sizeof(int16_t)); | |||
auto graph = ComputingGraph::make(); | |||
auto x = opr::Host2DeviceCopy::make(*graph, input); | |||
auto y = opr::CambriconRuntimeOpr::make(buf.data(), buf.size(), | |||
"subnet0", {x}, true)[0]; | |||
HostTensorND output(cn, {no, co, ho, wo}, dtype::Float16()); | |||
auto func = graph->compile({make_callback_copy(y, output)}); | |||
func->execute(); | |||
HostTensorND out_cnml(cn, {no, co, ho, wo}, dtype::Float32()), | |||
out_mgb(cn, {no, co, ho, wo}, dtype::Float32()); | |||
memcpy(out_cnml.ptr<float>(), relu_output_cpu_data.data(), | |||
relu_output_count * sizeof(float)); | |||
MGB_CNRT_CHECK(cnrtCastDataType( | |||
reinterpret_cast<void*>(output.ptr<dt_float16>()), CNRT_FLOAT16, | |||
out_mgb.ptr<float>(), CNRT_FLOAT32, relu_output_count, | |||
nullptr)); | |||
MGB_ASSERT_TENSOR_NEAR(out_cnml, out_mgb, 1e-4); | |||
} | |||
} | |||
TEST(TestCambriconRuntimeOpr, Serialization) { | |||
using namespace serialization; | |||
REQUIRE_CAMBRICON_DEVICE(1); | |||
auto cn = CompNode::load("cambricon0"); | |||
CnmlModelContext ctx{cn, true}; | |||
auto buf = ctx.get_serialized_model(); | |||
// prepare parameter for addpad and conv | |||
const int ni = 1, ci = 64, hi = 32, wi = 32; | |||
std::shared_ptr<HostTensorND> input = std::make_shared<HostTensorND>( | |||
cn, TensorLayout{{ni, ci, hi, wi}, dtype::Float16()}); | |||
auto graph = ComputingGraph::make(); | |||
auto x = opr::Host2DeviceCopy::make(*graph, input); | |||
auto y = opr::CambriconRuntimeOpr::make(buf.data(), buf.size(), "subnet0", | |||
{x}, true)[0]; | |||
auto fname = output_file("CambriconRuntimeOprTest"); | |||
auto dump = [&]() { | |||
auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str())); | |||
auto rst = dumper->dump({y}); | |||
ASSERT_EQ(rst.outputs.size(), 1u); | |||
}; | |||
auto load = [&]() { | |||
auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str())); | |||
auto rst = loader->load(); | |||
ASSERT_EQ(rst.output_var_list.size(), 1u); | |||
}; | |||
dump(); | |||
load(); | |||
} | |||
// TODO: this test will be improved later because peer copy for cambricon is
// not correct yet
TEST(TestCambriconRuntimeOpr, MultipleDevice) { | |||
REQUIRE_CAMBRICON_DEVICE(2); | |||
auto cn0 = CompNode::load("cambricon0"); | |||
auto cn1 = CompNode::load("cambricon1"); | |||
CnmlModelContext ctx{cn0, true}; | |||
auto buf = ctx.get_serialized_model(); | |||
const int ni = 8, ci = 64, hi = 32, wi = 32; | |||
auto graph = ComputingGraph::make(); | |||
auto xv = std::make_shared<DeviceTensorND>(cn0, TensorShape{ni, ci, hi, wi}, | |||
dtype::Float16()); | |||
auto x = opr::SharedDeviceTensor::make(*graph, xv), | |||
x1 = opr::Copy::make(x, cn1); | |||
auto y = opr::CambriconRuntimeOpr::make(buf.data(), buf.size(), "subnet0", | |||
{x}, true)[0], | |||
y1 = opr::CambriconRuntimeOpr::make(buf.data(), buf.size(), "subnet0", | |||
{x1}, true)[0]; | |||
HostTensorND host_y, host_y1; | |||
auto func = graph->compile( | |||
{make_callback_copy(y, host_y), make_callback_copy(y1, host_y1)}); | |||
func->execute(); | |||
} | |||
TEST(TestCambriconRuntimeOpr, Profiling) { | |||
REQUIRE_CAMBRICON_DEVICE(1); | |||
auto cn = CompNode::load("cambricon0"); | |||
CnmlModelContext ctx{cn, true}; | |||
auto buf = ctx.get_serialized_model(); | |||
const int ni = 8, ci = 64, hi = 32, wi = 32; | |||
HostTensorGenerator<dtype::Float16, RandomDistribution::GAUSSIAN> gen( | |||
dt_float16(0.f), dt_float16(1.f)); | |||
auto input = gen({ni, ci, hi, wi}, cn); | |||
auto graph = ComputingGraph::make(); | |||
GraphProfiler profiler{graph.get()}; | |||
auto x = opr::Host2DeviceCopy::make(*graph, input); | |||
auto y = opr::CambriconRuntimeOpr::make(buf.data(), buf.size(), "subnet0", | |||
{x}, true)[0]; | |||
HostTensorND output; | |||
graph->options().var_sanity_check_first_run = false; | |||
auto func = graph->compile({make_callback_copy(y, output)}); | |||
func->execute(); | |||
profiler.to_json_full(func.get()) | |||
->writeto_fpath(output_file("cambricon_runtime_opr_profile.json")); | |||
} | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,508 @@ | |||
/** | |||
* \file src/core/impl/comp_node/atlas/comp_node.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#include "./comp_node.h" | |||
#include "megbrain/comp_node_env.h" | |||
#include <memory> | |||
#include <string> | |||
using namespace mgb; | |||
#if MGB_ATLAS | |||
#include "megbrain/common.h" | |||
#include "megbrain/comp_node/alloc.h" | |||
#include "megbrain/utils//timer.h" | |||
#include "megcore_atlas.h" | |||
#include <cctype> | |||
#include <cstdio> | |||
#include <acl/acl.h> | |||
#include <limits> | |||
using AtlasCompNodeImpl = AtlasCompNode::CompNodeImpl; | |||
/* ===================== AtlasCompNodeImpl ===================== */ | |||
class AtlasCompNode::CompNodeImpl final : public CompNode::Impl { | |||
MGB_DYN_TYPE_OBJ_FINAL_DECL; | |||
friend class EventImpl; | |||
friend class AtlasCompNode; | |||
struct DeviceInfo; | |||
struct StaticData; | |||
static StaticData* sd; | |||
static Spinlock sd_mtx; | |||
//! set to true when m_locator is assigned; set to false if async init | |||
//! failed | |||
bool m_initialized = false; | |||
Locator m_locator, m_locator_logical; | |||
DeviceInfo* m_device_info; | |||
std::unique_ptr<Event> m_sync_event; | |||
Spinlock m_sync_event_mtx; | |||
void activate() { m_env.atlas_env().activate(); } | |||
void init(const Locator& locator, const Locator& locator_logical); | |||
void fini(); | |||
//! return whether global finalized, and print warning in such case | |||
static inline bool check_global_finalized(); | |||
//! enable peer copy from dev0 to dev1 | |||
static void enable_peer_access(int dev0, int dev1); | |||
static void static_free_device(ImplBase* self, void* ptr) { | |||
static_cast<CompNodeImpl*>(self)->free_device(ptr); | |||
} | |||
static void static_free_host(ImplBase* self, void* ptr) { | |||
static_cast<CompNodeImpl*>(self)->free_host(ptr); | |||
} | |||
public: | |||
CompNodeImpl() : Impl(static_free_device, static_free_host) {} | |||
void* alloc_device(size_t size) override { | |||
activate(); | |||
void* addr; | |||
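        // ACL_MEM_MALLOC_HUGE_FIRST prefers huge-page memory and falls back
        // to normal pages when huge pages are unavailable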
MGB_ATLAS_CHECK(aclrtMalloc(&addr, size, ACL_MEM_MALLOC_HUGE_FIRST)); | |||
return addr; | |||
} | |||
void free_device(void* ptr) { | |||
if (check_global_finalized()) | |||
return; | |||
activate(); | |||
MGB_ATLAS_CHECK(aclrtFree(ptr)); | |||
} | |||
void* alloc_host(size_t size) override { | |||
void* ptr; | |||
MGB_ATLAS_CHECK(aclrtMallocHost(&ptr, size)); | |||
return ptr; | |||
} | |||
void free_host(void* ptr) { MGB_ATLAS_CHECK(aclrtFreeHost(ptr)); } | |||
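    //! \note copy_to_host is asynchronous on the comp node stream, so the
    //! host buffer must stay valid until the stream is synchronized;
    //! copy_to_device below uses the synchronous aclrtMemcpy instead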
void copy_to_host(void* host_ptr, const void* device_ptr, | |||
size_t size) override { | |||
activate(); | |||
MGB_ATLAS_CHECK(aclrtMemcpyAsync(host_ptr, size, device_ptr, size, | |||
ACL_MEMCPY_DEVICE_TO_HOST, | |||
m_env.atlas_env().stream)); | |||
} | |||
void copy_to_device(void* device_ptr, const void* host_ptr, | |||
size_t size) override { | |||
activate(); | |||
MGB_ATLAS_CHECK(aclrtMemcpy(device_ptr, size, host_ptr, size, | |||
ACL_MEMCPY_HOST_TO_DEVICE)); | |||
} | |||
void peer_copy_to(Impl* dest_impl, void* dest, const void* src, | |||
size_t size) override; | |||
size_t get_mem_addr_alignment() override { | |||
return m_env.property().mem_alignment; | |||
} | |||
std::unique_ptr<Event> create_event(size_t flags) override; | |||
void sync() override; | |||
MemNode mem_node() override; | |||
size_t get_mem_padding() override { return 32; } | |||
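    //! no memory usage query is wired up here, so report an effectively
    //! unlimited budget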
std::pair<size_t, size_t> get_mem_status_bytes() override { | |||
return {std::numeric_limits<size_t>::max(), | |||
std::numeric_limits<size_t>::max()}; | |||
} | |||
Locator locator() override { return m_locator; } | |||
Locator locator_logical() override { return m_locator_logical; } | |||
}; | |||
MGB_DYN_TYPE_OBJ_FINAL_IMPL(AtlasCompNode::CompNodeImpl); | |||
struct AtlasCompNodeImpl::DeviceInfo { | |||
int dev_num = -1; | |||
void init(const CompNodeEnv& env) { | |||
auto&& atlas_env = env.atlas_env(); | |||
atlas_env.activate(); | |||
dev_num = atlas_env.device; | |||
} | |||
void fini() { | |||
MGB_ATLAS_CHECK(aclrtResetDevice(dev_num)); | |||
} | |||
}; | |||
struct AtlasCompNodeImpl::StaticData { | |||
static constexpr int MAX_NR_COMP_NODE = 1024, MAX_NR_DEVICE = 64; | |||
std::recursive_mutex mtx; | |||
AtlasCompNode::CompNodeImpl node[MAX_NR_COMP_NODE]; | |||
DeviceInfo dev_info[MAX_NR_DEVICE]; | |||
int nr_node = 0, //!< number of loaded node[] | |||
nr_dev_used = 0; //!< number of used dev_info[] | |||
StaticData() {} | |||
~StaticData() { | |||
for (int i = 0; i < nr_node; ++i) | |||
node[i].fini(); | |||
for (int i = 0; i < nr_dev_used; ++i) | |||
dev_info[i].fini(); | |||
} | |||
}; | |||
AtlasCompNodeImpl::StaticData* AtlasCompNodeImpl::sd = nullptr; | |||
Spinlock AtlasCompNodeImpl::sd_mtx; | |||
void AtlasCompNodeImpl::init(const Locator& locator, | |||
const Locator& locator_logical) { | |||
m_locator = locator; | |||
m_locator_logical = locator_logical; | |||
m_initialized = true; | |||
CompNodeEnv::AtlasEnv atlas_env; | |||
atlas_env.device = locator.device; | |||
m_env.init_atlas(make_comp_node_from_impl(this), atlas_env); | |||
DeviceInfo* dev_info = nullptr; | |||
for (int i = 0; i < sd->nr_dev_used; ++i) { | |||
if (sd->dev_info[i].dev_num == locator.device) { | |||
dev_info = &sd->dev_info[i]; | |||
break; | |||
} | |||
} | |||
if (!dev_info) { | |||
dev_info = &sd->dev_info[sd->nr_dev_used]; | |||
dev_info->init(m_env); | |||
// note: add nr_dev_used only after init succeeds | |||
++sd->nr_dev_used; | |||
} | |||
m_device_info = dev_info; | |||
} | |||
void AtlasCompNodeImpl::fini() { | |||
if (!m_initialized) | |||
return; | |||
m_sync_event.reset(); | |||
m_env.fini(); | |||
m_initialized = false; | |||
m_device_info = nullptr; | |||
} | |||
void AtlasCompNodeImpl::peer_copy_to(Impl* dest_impl, void* dest, | |||
const void* src, size_t size) { | |||
if (dest_impl->same_type<AtlasCompNodeImpl>()) { | |||
auto&& dst_env = | |||
static_cast<AtlasCompNodeImpl*>(dest_impl)->m_env.atlas_env(); | |||
auto&& src_env = m_env.atlas_env(); | |||
activate(); | |||
if (dst_env.device == src_env.device) { | |||
MGB_ATLAS_CHECK(aclrtMemcpyAsync(dest, size, src, size, | |||
ACL_MEMCPY_DEVICE_TO_DEVICE, | |||
dst_env.stream)); | |||
} else { | |||
            mgb_throw(MegBrainError,
                      "Atlas does not support peer copy between different "
                      "devices.");
} | |||
return; | |||
} | |||
    mgb_assert(dest_impl->env().property().type == DeviceType::CPU,
               "atlas peer_copy_to is only implemented for CPU");
auto copy = [this, dest, src, size]() { | |||
auto stream = m_env.atlas_env().stream; | |||
m_env.atlas_env().activate(); | |||
MGB_ATLAS_CHECK(aclrtMemcpyAsync(dest, size, src, size, | |||
ACL_MEMCPY_DEVICE_TO_HOST, | |||
m_env.atlas_env().stream)); | |||
MGB_ATLAS_CHECK(aclrtSynchronizeStream(stream)); | |||
}; | |||
dest_impl->env().cpu_env().dispatch(copy); | |||
} | |||
MemNode AtlasCompNodeImpl::mem_node() { | |||
    // m_device_info may be null before async init finishes, so we just return
    // a private pointer derived from the device number here
return MemNode{sd->dev_info + m_locator.device}; | |||
} | |||
void AtlasCompNodeImpl::sync() { | |||
activate(); | |||
Event* event; | |||
{ | |||
MGB_LOCK_GUARD(m_sync_event_mtx); | |||
if (!m_sync_event) | |||
m_sync_event = create_event(0); | |||
event = m_sync_event.get(); | |||
} | |||
event->record(); | |||
event->host_wait(); | |||
} | |||
void AtlasCompNodeImpl::enable_peer_access(int dev0, int dev1) { | |||
MGB_MARK_USED_VAR(dev0); | |||
MGB_MARK_USED_VAR(dev1); | |||
    mgb_throw(MegBrainError,
              "Atlas does not support peer copy between different "
              "devices.");
} | |||
bool AtlasCompNodeImpl::check_global_finalized() { | |||
if (!sd) { | |||
static std::atomic_flag warn_printed = ATOMIC_FLAG_INIT; | |||
if (!warn_printed.test_and_set()) { | |||
mgb_log_debug( | |||
"atlas comp node method called after global finalize"); | |||
} | |||
return true; | |||
} | |||
return false; | |||
} | |||
/* ===================== EventImpl ===================== */ | |||
/**
 * \warning Currently we just use the cpu timer to do the record; later, when
 * the api of the ddk is ready, we will change to a normal event.
 */
class AtlasCompNode::EventImpl final : public EventImplHelper { | |||
AtlasCompNodeImpl* const m_comp_node_impl; | |||
aclrtEvent m_atlas_event; | |||
bool m_init_finished = false; | |||
void do_record() override { | |||
m_comp_node_impl->activate(); | |||
auto &&env = m_comp_node_impl->m_env.atlas_env(); | |||
MGB_ATLAS_CHECK(aclrtRecordEvent(m_atlas_event, env.stream)); | |||
} | |||
bool do_finished() override { | |||
m_comp_node_impl->activate(); | |||
aclrtEventStatus status; | |||
MGB_ATLAS_CHECK(aclrtQueryEvent(m_atlas_event, &status)); | |||
if (status == ACL_EVENT_STATUS_COMPLETE) | |||
return true; | |||
if (status == ACL_EVENT_STATUS_NOT_READY) | |||
return false; | |||
mgb_throw(AtlasError, "invalid event status: %d", int(status)); | |||
} | |||
void host_wait_cv() override { | |||
MGB_ATLAS_CHECK(aclrtSynchronizeEvent(m_atlas_event)); | |||
} | |||
double do_elapsed_time_until(EventImplHelper& end) override { | |||
m_comp_node_impl->activate(); | |||
float ret = 0.0; | |||
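        // aclrtEventElapsedTime reports milliseconds, so scale to seconds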
MGB_ATLAS_CHECK(aclrtEventElapsedTime(&ret, m_atlas_event, | |||
static_cast<EventImpl&>(end).m_atlas_event)); | |||
return static_cast<double>(ret) * 1e-3; | |||
} | |||
void do_device_wait_by(Impl* cn_impl) override; | |||
public: | |||
EventImpl(AtlasCompNodeImpl* comp_node_impl, size_t create_flags) | |||
: EventImplHelper(comp_node_impl, create_flags), | |||
m_comp_node_impl{comp_node_impl} { | |||
m_comp_node_impl->activate(); | |||
MGB_ATLAS_CHECK(aclrtCreateEvent(&m_atlas_event)); | |||
m_init_finished = true; | |||
} | |||
~EventImpl() { | |||
if (m_init_finished) { | |||
MGB_TRY { MGB_ATLAS_CHECK(aclrtDestroyEvent(m_atlas_event)); } | |||
MGB_CATCH(MegBrainError & exc, { | |||
mgb_log_error("failed to destroy cuda event: %s", exc.what()); | |||
}) | |||
} | |||
} | |||
}; | |||
std::unique_ptr<CompNode::Event> AtlasCompNodeImpl::create_event(size_t flags) { | |||
return std::make_unique<EventImpl>(this, flags); | |||
} | |||
void AtlasCompNode::EventImpl::do_device_wait_by(Impl* cn_impl) { | |||
if (cn_impl->dyn_typeinfo() == AtlasCompNodeImpl::typeinfo()) { | |||
auto imp = static_cast<AtlasCompNodeImpl*>(cn_impl); | |||
auto stream = imp->m_env.atlas_env().stream; | |||
imp->activate(); | |||
MGB_ATLAS_CHECK(aclrtStreamWaitEvent(stream, m_atlas_event)); | |||
return; | |||
} | |||
if (cn_impl->env().property().type == DeviceType::CPU) { | |||
auto waiter = [this]() { | |||
MGB_ATLAS_CHECK(aclrtSynchronizeEvent(m_atlas_event)); | |||
}; | |||
cn_impl->add_callback(std::move(waiter)); | |||
return; | |||
} | |||
mgb_throw(MegBrainError, "unimplemented event device_wait_by config"); | |||
} | |||
/* ===================== AtlasCompNode static methods ===================== */ | |||
bool AtlasCompNode::available() { | |||
return true; | |||
} | |||
void AtlasCompNode::finalize() { | |||
if (AtlasCompNodeImpl::sd) { | |||
sync_all(); | |||
auto ptr = AtlasCompNodeImpl::sd; | |||
AtlasCompNodeImpl::sd = nullptr; | |||
ptr->~StaticData(); | |||
} | |||
} | |||
CompNode::Impl* AtlasCompNode::load_atlas(const Locator& locator, | |||
const Locator& locator_logical) { | |||
auto&& sdptr = AtlasCompNodeImpl::sd; | |||
{ | |||
MGB_LOCK_GUARD(AtlasCompNodeImpl::sd_mtx); | |||
if (!sdptr) { | |||
// use static storage so object can be safely accessed even after | |||
// global finalize | |||
using T = AtlasCompNodeImpl::StaticData; | |||
static std::aligned_storage_t<sizeof(T), alignof(T)> storage; | |||
sdptr = new (&storage) T; | |||
} | |||
} | |||
auto&& sd = *sdptr; | |||
MGB_LOCK_GUARD(sd.mtx); | |||
CompNodeImpl* available_node = nullptr; | |||
for (int i = 0; i < sd.nr_node; ++i) { | |||
auto&& cur = sd.node[i]; | |||
if (cur.m_initialized) { | |||
if (cur.m_locator_logical == locator_logical) { | |||
return &cur; | |||
} | |||
} else { | |||
available_node = &cur; | |||
} | |||
} | |||
if (!available_node) { | |||
mgb_assert(sd.nr_node < sd.MAX_NR_COMP_NODE, | |||
"too many CompNode allocated"); | |||
        mgb_assert(locator.device < sd.MAX_NR_DEVICE,
                   "device number too large");
available_node = &sd.node[sd.nr_node++]; | |||
} | |||
mgb_assert(!available_node->m_initialized); | |||
available_node->init(locator, locator_logical); | |||
log_comp_node_created(locator, locator_logical); | |||
return available_node; | |||
} | |||
void AtlasCompNode::sync_all() { | |||
auto sd = AtlasCompNodeImpl::sd; | |||
if (!sd) | |||
return; | |||
for (int i = 0;; ++i) { | |||
// ensure async init finished | |||
CompNodeEnv* env; | |||
{ | |||
MGB_LOCK_GUARD(sd->mtx); | |||
if (i >= sd->nr_node) { | |||
break; | |||
} | |||
env = &sd->node[i].env(); | |||
} | |||
env->atlas_env(); | |||
} | |||
MGB_LOCK_GUARD(sd->mtx); | |||
MGB_ATLAS_CHECK(aclrtSynchronizeDevice()); | |||
} | |||
void AtlasCompNode::foreach (thin_function<void(CompNode)> callback) { | |||
auto sd = AtlasCompNodeImpl::sd; | |||
if (!sd) | |||
return; | |||
for (int i = 0;; ++i) { | |||
CompNode cur; | |||
{ | |||
MGB_LOCK_GUARD(sd->mtx); | |||
if (i >= sd->nr_node) | |||
return; | |||
cur = make_comp_node_from_impl(&sd->node[i]); | |||
} | |||
callback(cur); | |||
} | |||
} | |||
size_t AtlasCompNode::get_device_count() { | |||
static uint32_t cnt = 0; | |||
static Spinlock mtx; | |||
MGB_LOCK_GUARD(mtx); | |||
if (cnt == 0) { | |||
uint32_t dev_cnt = 0; | |||
auto ret = aclrtGetDeviceCount(&dev_cnt); | |||
        if (ret != ACL_ERROR_NONE) {
            mgb_log_error("aclrtGetDeviceCount failed: %s (err %d)",
                          ::megcore::atlas::get_error_str(ret),
                          static_cast<int>(ret));
        }
        cnt = dev_cnt;
} | |||
return cnt; | |||
} | |||
#else | |||
bool AtlasCompNode::available() { | |||
return false; | |||
} | |||
void AtlasCompNode::foreach (thin_function<void(CompNode)>) {} | |||
void AtlasCompNode::finalize() {} | |||
size_t AtlasCompNode::get_device_count() { | |||
return 0; | |||
} | |||
AtlasCompNode::Impl* AtlasCompNode::load_atlas(const Locator&, const Locator&) { | |||
mgb_throw(MegBrainError, "atlas disabled at compile time"); | |||
} | |||
void AtlasCompNode::sync_all() {} | |||
#endif // MGB_ATLAS | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,40 @@ | |||
/** | |||
* \file src/core/impl/comp_node/atlas/comp_node.h | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#pragma once | |||
#include <mutex> | |||
#include "../impl_helper.h" | |||
namespace mgb { | |||
class AtlasCompNode final : public CompNodeImplHelper { | |||
public: | |||
static constexpr Flag sm_flag = Flag::QUEUE_LIMITED | Flag::HAS_COPY_STREAM; | |||
class CompNodeImpl; | |||
class EventImpl; | |||
    //! whether atlas comp node is available
static bool available(); | |||
static void foreach (thin_function<void(CompNode)> callback); | |||
static void finalize(); | |||
static size_t get_device_count(); | |||
static Impl* load_atlas(const Locator& locator, | |||
const Locator& locator_logical); | |||
static void sync_all(); | |||
}; | |||
} // namespace mgb | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,759 @@ | |||
/** | |||
* \file src/core/impl/comp_node/cambricon/comp_node.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
*/ | |||
#include "./comp_node.h" | |||
#include "megbrain/comp_node_env.h" | |||
#include "megbrain/utils/thread.h" | |||
#include <string> | |||
using namespace mgb; | |||
#if MGB_CAMBRICON | |||
#include "megbrain/comp_node/alloc.h" | |||
#include <cctype> | |||
#include <cstdio> | |||
#include <thread> | |||
#include <cndev.h> | |||
#include <cnrt.h> | |||
using CambriconCompNodeImpl = CambriconCompNode::CompNodeImpl; | |||
namespace { | |||
size_t get_min_system_memory(size_t available) { | |||
// taken from src/core/impl/cuda/comp_node.cpp | |||
if (available < (1u << 31)) { | |||
// 225MiB | |||
return 225 * 1024 * 1024; | |||
} else { | |||
// max(300 MiB, 0.05 * available) | |||
return std::max<size_t>(300 * 1024 * 1024, available / 20); | |||
} | |||
} | |||
} // anonymous namespace | |||
/* ======================= CambriconRawAlloctor ======================*/ | |||
namespace mgb { | |||
namespace mem_alloc { | |||
class CambriconRawAlloctor final : public RawAllocator { | |||
public: | |||
void* alloc(size_t size) override { | |||
void* addr; | |||
cnrtRet_t ret = cnrtMalloc(&addr, size); | |||
if (ret == CNRT_RET_SUCCESS) { | |||
mgb_assert(addr); | |||
return addr; | |||
} | |||
auto msg = mgb_ssprintf_log( | |||
"cnrtMalloc failed while requesting %zd bytes (%.3fMiB) of " | |||
"memory; error: %s", | |||
size, size / (1024.0 * 1024), cnrtGetErrorStr(ret)); | |||
msg.append(CnrtError::get_cnrt_extra_info()); | |||
mgb_throw_raw(MemAllocError{msg}); | |||
} | |||
void free(void* ptr) override { | |||
cnrtRet_t ret = cnrtFree(ptr); | |||
if (ret == CNRT_RET_SUCCESS) | |||
return; | |||
auto msg = ssprintf("cnrtFree failed for %p: %s", ptr, | |||
cnrtGetErrorStr(ret)); | |||
msg.append(CnrtError::get_cnrt_extra_info()); | |||
mgb_throw_raw(MemAllocError{msg}); | |||
} | |||
void get_mem_info(size_t& free, size_t& tot) override; | |||
}; | |||
class CambriconDeviceRuntimePolicy : public DeviceRuntimePolicy { | |||
public: | |||
CompNode::DeviceType device_type() override { | |||
return CompNode::DeviceType::CAMBRICON; | |||
} | |||
void set_device(int device) override { | |||
cnrtDev_t dev; | |||
MGB_CNRT_CHECK(cnrtGetDeviceHandle(&dev, device)); | |||
MGB_CNRT_CHECK(cnrtSetCurrentDevice(dev)); | |||
} | |||
void device_synchronize(int device) override { | |||
cnrtDev_t dev; | |||
MGB_CNRT_CHECK(cnrtGetDeviceHandle(&dev, device)); | |||
MGB_CNRT_CHECK(cnrtSetCurrentDevice(dev)); | |||
MGB_CNRT_CHECK(cnrtSyncDevice()); | |||
} | |||
}; | |||
/* ====================== DevMemAlloc ================================*/ | |||
std::unique_ptr<DevMemAlloc> DevMemAlloc::make_cambricon_alloc() { | |||
return std::make_unique<FwdDevMemAlloc>( | |||
std::make_shared<CambriconRawAlloctor>()); | |||
} | |||
} // namespace mem_alloc | |||
} // namespace mgb | |||
/* ====================== CambriconCompNodeImpl ======================*/ | |||
class CambriconCompNode::CompNodeImpl final : public CompNode::Impl { | |||
MGB_DYN_TYPE_OBJ_FINAL_DECL; | |||
friend class EventImpl; | |||
friend class CambriconCompNode; | |||
friend class mgb::mem_alloc::CambriconRawAlloctor; | |||
struct DeviceInfo; | |||
struct StaticData; | |||
static StaticData* sd; | |||
static Spinlock sd_mtx; | |||
//! set to true when m_locator is assigned; set to false if init | |||
//! failed | |||
bool m_initialized = false; | |||
Locator m_locator, m_locator_logical; | |||
mem_alloc::StreamMemAlloc* m_mem_alloc; | |||
DeviceInfo* m_device_info; | |||
cnrtDev_t m_dev; | |||
void activate() { m_env.cnrt_env().activate(); } | |||
void init(const Locator& locator, const Locator& locator_logical); | |||
void fini(); | |||
static inline bool check_global_finalized(); | |||
//! enable peer copy from dev0 to dev1 | |||
static bool enable_peer_access(int dev0, int dev1); | |||
static void static_free_device(ImplBase* self, void* ptr) { | |||
static_cast<CompNodeImpl*>(self)->free_device(ptr); | |||
} | |||
static void static_free_host(ImplBase* self, void* ptr) { | |||
static_cast<CompNodeImpl*>(self)->free_host(ptr); | |||
} | |||
public: | |||
CompNodeImpl() : Impl(static_free_device, static_free_host) {} | |||
void* alloc_device(size_t size) override { | |||
activate(); | |||
return m_mem_alloc->alloc(size); | |||
} | |||
void free_device(void* ptr); | |||
void* alloc_host(size_t size) override { | |||
activate(); | |||
void* ptr; | |||
MGB_CNRT_CHECK(cnrtMallocHost(&ptr, size, CNRT_MEMTYPE_DEFAULT)); | |||
return ptr; | |||
} | |||
void free_host(void* ptr) { | |||
if (!check_global_finalized()) { | |||
activate(); | |||
} | |||
MGB_CNRT_CHECK(cnrtSetCurrentDevice(m_dev)); | |||
MGB_CNRT_CHECK(cnrtFreeHost(ptr)); | |||
} | |||
void copy_to_host(void* host_ptr, const void* device_ptr, | |||
size_t size) override { | |||
activate(); | |||
MGB_CNRT_CHECK(cnrtMemcpyAsync(host_ptr, const_cast<void*>(device_ptr), | |||
size, m_env.cnrt_env().queue, | |||
CNRT_MEM_TRANS_DIR_DEV2HOST)); | |||
} | |||
void copy_to_device(void* device_ptr, const void* host_ptr, | |||
size_t size) override { | |||
activate(); | |||
MGB_CNRT_CHECK(cnrtMemcpyAsync(device_ptr, const_cast<void*>(host_ptr), | |||
size, m_env.cnrt_env().queue, | |||
CNRT_MEM_TRANS_DIR_HOST2DEV)); | |||
} | |||
void peer_copy_to(Impl* dest_impl, void* dest, const void* src, | |||
size_t size) override; | |||
size_t get_mem_addr_alignment() override { | |||
return m_env.property().mem_alignment; | |||
} | |||
std::unique_ptr<Event> create_event(size_t flags) override; | |||
void sync() override; | |||
MemNode mem_node() override; | |||
std::pair<size_t, size_t> get_mem_status_bytes() override { | |||
m_env.cnrt_env().activate(); | |||
cndevMemoryInfo_t mem_info; | |||
MGB_CNDEV_CHECK( | |||
cndevGetMemoryUsage(&mem_info, m_env.cnrt_env().device)); | |||
size_t tot, used, free; | |||
constexpr size_t mb2size = 1024 * 1024; | |||
tot = static_cast<size_t>(mem_info.PhysicalMemoryTotal) * mb2size; | |||
used = static_cast<size_t>(mem_info.PhysicalMemoryUsed) * mb2size; | |||
free = tot - used + m_mem_alloc->get_free_memory_dev().tot; | |||
return {tot, free}; | |||
} | |||
Locator locator() override { return m_locator; } | |||
Locator locator_logical() override { return m_locator_logical; } | |||
}; | |||
MGB_DYN_TYPE_OBJ_FINAL_IMPL(CambriconCompNode::CompNodeImpl); | |||
struct CambriconCompNodeImpl::DeviceInfo { | |||
int dev_num = -1; | |||
cnrtDev_t dev; | |||
std::unique_ptr<mem_alloc::DevMemAlloc> mem_alloc; | |||
bool init_done() const { return mem_alloc.get(); } | |||
void init(const CompNodeEnv& env); | |||
// unlike cuda, we have to set device first, then release device memory | |||
void fini() { | |||
        cnrtSetCurrentDevice(dev);
        mem_alloc.reset();
} | |||
size_t get_mem_reserve_size(); | |||
}; | |||
struct CambriconCompNodeImpl::StaticData { | |||
static constexpr int MAX_NR_COMP_NODE = 4096, MAX_NR_DEVICE = 64; | |||
std::recursive_mutex mtx; | |||
mem_alloc::DevMemAlloc::PreAllocConfig prealloc_config; | |||
CambriconCompNode::CompNodeImpl node[MAX_NR_COMP_NODE]; | |||
DeviceInfo dev_info[MAX_NR_DEVICE]; | |||
int nr_node = 0, nr_dev_used = 0; | |||
StaticData() { | |||
prealloc_config.max_overhead = 0; | |||
prealloc_config.alignment = 1; | |||
} | |||
~StaticData() { | |||
for (int i = 0; i < nr_node; ++i) | |||
node[i].fini(); | |||
for (int i = 0; i < nr_dev_used; ++i) | |||
dev_info[i].fini(); | |||
} | |||
}; | |||
CambriconCompNodeImpl::StaticData* CambriconCompNodeImpl::sd = nullptr; | |||
Spinlock CambriconCompNodeImpl::sd_mtx; | |||
void CambriconCompNodeImpl::init(const Locator& locator, | |||
const Locator& locator_logical) { | |||
m_locator = locator; | |||
m_locator_logical = locator_logical; | |||
m_initialized = true; | |||
auto on_succ = [this](cnrtQueue_t queue) { | |||
auto locator = m_locator; | |||
log_comp_node_created(locator, m_locator_logical); | |||
MGB_LOCK_GUARD(sd->mtx); | |||
DeviceInfo* dev_info = nullptr; | |||
for (int i = 0; i < sd->nr_dev_used; ++i) { | |||
if (sd->dev_info[i].dev_num == locator.device) { | |||
dev_info = &sd->dev_info[i]; | |||
break; | |||
} | |||
} | |||
if (!dev_info) { | |||
dev_info = &sd->dev_info[sd->nr_dev_used]; | |||
dev_info->init(m_env); | |||
++sd->nr_dev_used; | |||
} | |||
m_device_info = dev_info; | |||
m_mem_alloc = | |||
dev_info->mem_alloc->add_stream(static_cast<void*>(queue)); | |||
m_dev = m_device_info->dev; | |||
}; | |||
auto on_error = [this](std::exception&) { | |||
MGB_LOCK_GUARD(sd->mtx); | |||
m_initialized = false; | |||
}; | |||
m_env.init_cnrt(locator.device, make_comp_node_from_impl(this), | |||
{on_succ, on_error}); | |||
} | |||
void CambriconCompNodeImpl::fini() { | |||
if (!m_initialized) | |||
return; | |||
m_env.fini(); | |||
m_mem_alloc = nullptr; | |||
m_device_info = nullptr; | |||
m_initialized = false; | |||
} | |||
void CambriconCompNodeImpl::free_device(void* ptr) { | |||
if (check_global_finalized()) | |||
return; | |||
activate(); | |||
m_mem_alloc->free(ptr); | |||
} | |||
void CambriconCompNodeImpl::peer_copy_to(Impl* dest_impl, void* dest, | |||
const void* src, size_t size) { | |||
if (dest_impl->same_type<CambriconCompNodeImpl>()) { | |||
auto&& dst_env = static_cast<CambriconCompNodeImpl*>(dest_impl) | |||
->m_env.cnrt_env(); | |||
auto&& src_env = m_env.cnrt_env(); | |||
activate(); | |||
if (dst_env.device == src_env.device) { | |||
            // remark: transferring data from device to device does not
            // support async copy
sync(); | |||
dest_impl->sync(); | |||
MGB_CNRT_CHECK(cnrtMemcpy(dest, const_cast<void*>(src), size, | |||
CNRT_MEM_TRANS_DIR_DEV2DEV)); | |||
} else { | |||
mgb_throw_if( | |||
!enable_peer_access(src_env.device, dst_env.device) || | |||
!enable_peer_access(dst_env.device, src_env.device), | |||
CnrtError, | |||
"directly memory access is not available for " | |||
"src=%d,dst=%d", | |||
src_env.device, dst_env.device); | |||
sync(); | |||
dest_impl->sync(); | |||
MGB_CNRT_CHECK(cnrtMemcpyPeer(dest, dst_env.device, | |||
const_cast<void*>(src), | |||
src_env.device, size)); | |||
} | |||
return; | |||
} | |||
mgb_assert(dest_impl->env().property().type == DeviceType::CPU, | |||
"cnrt peer_copy_to only implemented for CPU"); | |||
auto copy = [this, dest, src, size]() { | |||
m_env.cnrt_env().activate(); | |||
auto queue = m_env.cnrt_env().queue; | |||
MGB_CNRT_CHECK(cnrtMemcpyAsync(dest, const_cast<void*>(src), size, | |||
queue, CNRT_MEM_TRANS_DIR_DEV2HOST)); | |||
MGB_CNRT_CHECK(cnrtSyncQueue(queue)); | |||
}; | |||
dest_impl->env().cpu_env().dispatch(copy); | |||
} | |||
MemNode CambriconCompNodeImpl::mem_node() { | |||
return MemNode{sd->dev_info + m_locator.device}; | |||
} | |||
void CambriconCompNodeImpl::sync() { | |||
activate(); | |||
    // remark: CNRT does not provide an interface like cudaEventQuery to test
    // whether an event has finished, so we just call cnrtSyncQueue
MGB_CNRT_CHECK(cnrtSyncQueue(m_env.cnrt_env().queue)); | |||
} | |||
bool CambriconCompNodeImpl::enable_peer_access(int dev0, int dev1) { | |||
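    // cache peer-accessibility results so cnrtGetPeerAccessibility only runs
    // once per (dev0, dev1) pair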
static bool queried_enabled[StaticData::MAX_NR_DEVICE] | |||
[StaticData::MAX_NR_DEVICE]; | |||
if (queried_enabled[dev0][dev1]) | |||
return queried_enabled[dev0][dev1]; | |||
static std::mutex global_lock; | |||
MGB_LOCK_GUARD(global_lock); | |||
unsigned int can = 0; | |||
MGB_CNRT_CHECK(cnrtGetPeerAccessibility(&can, dev0, dev1)); | |||
if (can) | |||
mgb_log("device(%d) can directly access memories on device(%d)", dev0, | |||
dev1); | |||
queried_enabled[dev0][dev1] = can; | |||
return can; | |||
} | |||
/* ================== CambriconCompNodeImpl::DeviceInfo ===============*/ | |||
void CambriconCompNodeImpl::DeviceInfo::init(const CompNodeEnv& env) { | |||
mgb_assert(!mem_alloc); | |||
auto&& cnenv = env.cnrt_env(); | |||
cnenv.activate(); | |||
dev_num = cnenv.device; | |||
MGB_CNRT_CHECK(cnrtGetDeviceHandle(&dev, dev_num)); | |||
    // remark: because free_device may be called after global finalize, the
    // implementation of mem_alloc should handle the deallocation of the
    // memory it allocated. As a result, we should use DevMemAlloc instead of
    // FwdDevMemAlloc.
#if 0 | |||
// forward cnrtMalloc | |||
mem_alloc = mem_alloc::DevMemAlloc::make_cambricon_alloc(); | |||
#else | |||
auto reserve_size = get_mem_reserve_size(); | |||
mem_alloc = mem_alloc::DevMemAlloc::make( | |||
dev_num, reserve_size, | |||
std::make_shared<mem_alloc::CambriconRawAlloctor>(), | |||
std::make_shared<mem_alloc::CambriconDeviceRuntimePolicy>()); | |||
mem_alloc->prealloc_config(sd->prealloc_config); | |||
auto align = env.property().mem_alignment; | |||
mem_alloc->alignment(align); | |||
cnrtDeviceInfo_t device_info; | |||
MGB_CNRT_CHECK(cnrtGetDeviceInfo(&device_info, dev_num)); | |||
mgb_log("cambricon: card%d: name=`%s' dyn_mem_reserve=%.2fMiB " | |||
"alignment=0x%zx", | |||
dev_num, device_info.device_name, reserve_size / 1024.0 / 1024, | |||
align); | |||
#endif | |||
} | |||
size_t CambriconCompNodeImpl::DeviceInfo::get_mem_reserve_size() { | |||
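    // MGB_CAMBRICON_RESERVE_MEMORY: "b:<bytes>" reserves an explicit byte
    // count; any other non-empty value reserves all currently free memory
    // minus a small system margin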
if (auto setting = MGB_GETENV("MGB_CAMBRICON_RESERVE_MEMORY")) { | |||
if (!strncmp(setting, "b:", 2)) { | |||
return std::stoull(setting + 2); | |||
} | |||
size_t tot, free; | |||
cndevMemoryInfo_t mem_info; | |||
MGB_CNDEV_CHECK(cndevGetMemoryUsage(&mem_info, dev_num)); | |||
constexpr size_t mb2size = 1024 * 1024; | |||
tot = static_cast<size_t>(mem_info.PhysicalMemoryTotal) * mb2size; | |||
size_t used = | |||
static_cast<size_t>(mem_info.PhysicalMemoryUsed) * mb2size; | |||
free = tot - used; | |||
return free - get_min_system_memory(free); | |||
} else { | |||
return 0; | |||
} | |||
} | |||
bool CambriconCompNodeImpl::check_global_finalized() { | |||
if (!sd) { | |||
static std::atomic_flag warn_printed = ATOMIC_FLAG_INIT; | |||
if (!warn_printed.test_and_set()) { | |||
mgb_log_warn( | |||
"cambricon comp node method called after global finalize"); | |||
} | |||
return true; | |||
} | |||
return false; | |||
} | |||
/* ================== CambriconCompNodeImpl::EventImpl ================*/ | |||
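/**
 * \warning CNRT provides no non-blocking query for notifier completion, so
 * do_finished() and host_wait_cv() conservatively sync the whole queue.
 */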
class CambriconCompNode::EventImpl final : public EventImplHelper { | |||
bool m_placed_notifier = false; | |||
bool m_sync_queue_called = false; | |||
bool m_init_finished = false; | |||
cnrtNotifier_t m_cnrt_notifier; | |||
CambriconCompNodeImpl* cambricon_comp_node_impl() const { | |||
return static_cast<CambriconCompNodeImpl*>(m_comp_node_impl); | |||
} | |||
void do_record() override { | |||
m_sync_queue_called = false; | |||
cambricon_comp_node_impl()->activate(); | |||
auto&& env = cambricon_comp_node_impl()->m_env.cnrt_env(); | |||
if (!m_placed_notifier) { | |||
MGB_CNRT_CHECK(cnrtPlaceNotifier(m_cnrt_notifier, env.queue)); | |||
m_placed_notifier = true; | |||
} | |||
} | |||
void call_sync_queue() { | |||
mgb_assert(m_placed_notifier); | |||
if (!m_sync_queue_called) { | |||
cambricon_comp_node_impl()->activate(); | |||
auto&& env = cambricon_comp_node_impl()->m_env.cnrt_env(); | |||
MGB_CNRT_CHECK(cnrtSyncQueue(env.queue)); | |||
m_sync_queue_called = true; | |||
} | |||
} | |||
bool do_finished() override { | |||
call_sync_queue(); | |||
return true; | |||
} | |||
void host_wait_cv() override { | |||
mgb_assert(m_placed_notifier); | |||
cambricon_comp_node_impl()->activate(); | |||
auto&& env = cambricon_comp_node_impl()->m_env.cnrt_env(); | |||
MGB_CNRT_CHECK(cnrtSyncQueue(env.queue)); | |||
} | |||
double do_elapsed_time_until(EventImplHelper& end) override { | |||
cambricon_comp_node_impl()->activate(); | |||
auto&& env = cambricon_comp_node_impl()->m_env.cnrt_env(); | |||
MGB_CNRT_CHECK(cnrtSyncQueue(env.queue)); | |||
float ret = 0.f; | |||
MGB_CNRT_CHECK(cnrtNotifierDuration( | |||
m_cnrt_notifier, static_cast<EventImpl&>(end).m_cnrt_notifier, | |||
&ret)); | |||
return static_cast<double>(ret) * 1e-3; | |||
} | |||
void do_device_wait_by(Impl* cn_impl) override; | |||
public: | |||
EventImpl(CambriconCompNodeImpl* comp_node_impl, size_t create_flags) | |||
: EventImplHelper(comp_node_impl, create_flags) { | |||
cambricon_comp_node_impl()->activate(); | |||
MGB_CNRT_CHECK(cnrtCreateNotifier(&m_cnrt_notifier)); | |||
m_init_finished = true; | |||
} | |||
~EventImpl() { | |||
if (m_init_finished) { | |||
MGB_TRY { MGB_CNRT_CHECK(cnrtDestroyNotifier(&m_cnrt_notifier)); } | |||
MGB_CATCH(MegBrainError & exc, { | |||
mgb_log_error("failed to destroy cnrt notifier: %s", | |||
exc.what()); | |||
}) | |||
} | |||
} | |||
}; | |||
std::unique_ptr<CompNode::Event> CambriconCompNodeImpl::create_event( | |||
size_t flags) { | |||
return std::make_unique<EventImpl>(this, flags); | |||
} | |||
void CambriconCompNode::EventImpl::do_device_wait_by(Impl* cn_impl) { | |||
if (cn_impl->env().property().type == DeviceType::CAMBRICON) { | |||
auto imp = static_cast<CambriconCompNodeImpl*>(cn_impl); | |||
auto queue = imp->m_env.cnrt_env().queue; | |||
imp->activate(); | |||
MGB_CNRT_CHECK(cnrtSyncQueue(queue)); | |||
return; | |||
} | |||
if (cn_impl->env().property().type == DeviceType::CPU) { | |||
auto waiter = [this]() { | |||
cambricon_comp_node_impl()->activate(); | |||
auto queue = cambricon_comp_node_impl()->m_env.cnrt_env().queue; | |||
MGB_CNRT_CHECK(cnrtSyncQueue(queue)); | |||
}; | |||
cn_impl->add_callback(std::move(waiter)); | |||
return; | |||
} | |||
mgb_throw(MegBrainError, "unimplemented event device_wait_by config"); | |||
} | |||
/* ================== CambriconCompNode static methods ================*/ | |||
bool CambriconCompNode::available() { | |||
CompNodeEnv::CnrtEnv::init(); | |||
static int result = -1; | |||
static Spinlock mtx; | |||
MGB_LOCK_GUARD(mtx); | |||
if (result == -1) { | |||
unsigned int dev_num = 0; | |||
auto err = cnrtGetDeviceCount(&dev_num); | |||
result = err == CNRT_RET_SUCCESS && dev_num >= 1; | |||
if (!result) { | |||
mgb_log_warn("cambricon unavailable: %d(%s) dev_num=%u", | |||
static_cast<int>(err), cnrtGetErrorStr(err), dev_num); | |||
} | |||
} | |||
return result; | |||
} | |||
void CambriconCompNode::finalize() { | |||
if (CambriconCompNodeImpl::sd) { | |||
sync_all(); | |||
auto ptr = CambriconCompNodeImpl::sd; | |||
CambriconCompNodeImpl::sd = nullptr; | |||
ptr->~StaticData(); | |||
} | |||
} | |||
CompNode::Impl* CambriconCompNode::load_cambricon( | |||
const Locator& locator, const Locator& locator_logical) { | |||
int nr_devs = get_device_count(); | |||
mgb_assert(locator.device >= 0 && locator.device < nr_devs, | |||
"request device%d out of range [0, %d)", locator.device, | |||
nr_devs); | |||
auto&& sdptr = CambriconCompNodeImpl::sd; | |||
{ | |||
MGB_LOCK_GUARD(CambriconCompNodeImpl::sd_mtx); | |||
if (!sdptr) { | |||
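            // use static storage so object can be safely accessed even after
            // global finalize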
using T = CambriconCompNodeImpl::StaticData; | |||
static std::aligned_storage_t<sizeof(T), alignof(T)> storage; | |||
sdptr = new(&storage)T; | |||
} | |||
} | |||
auto&& sd = *sdptr; | |||
MGB_LOCK_GUARD(sd.mtx); | |||
CompNodeImpl* available_node = nullptr; | |||
for (int i = 0; i < sd.nr_node; ++i) { | |||
auto&& cur = sd.node[i]; | |||
if (cur.m_initialized) { | |||
if (cur.m_locator_logical == locator_logical) { | |||
return &cur; | |||
} | |||
} else { | |||
available_node = &cur; | |||
} | |||
} | |||
if (!available_node) { | |||
mgb_assert(sd.nr_node < sd.MAX_NR_COMP_NODE, | |||
"too many CompNode allocated"); | |||
        mgb_assert(locator.device < sd.MAX_NR_DEVICE,
                   "device number too large");
available_node = &sd.node[sd.nr_node++]; | |||
} | |||
mgb_assert(!available_node->m_initialized); | |||
available_node->init(locator, locator_logical); | |||
return available_node; | |||
} | |||
void CambriconCompNode::try_coalesce_all_free_memory() { | |||
auto sd = CambriconCompNodeImpl::sd; | |||
if (!sd) | |||
return; | |||
size_t size = 0; | |||
for (int i = 0; i < sd->nr_dev_used; ++i) { | |||
size += sd->dev_info[i] | |||
.mem_alloc->gather_stream_free_blk_and_release_full(); | |||
} | |||
if (size) { | |||
mgb_log_debug("%zu bytes freed by try_coalesce_all_free_memory()", | |||
size); | |||
} | |||
} | |||
void CambriconCompNode::sync_all() { | |||
auto sd = CambriconCompNodeImpl::sd; | |||
if (!sd) | |||
return; | |||
for (int i = 0;; ++i) { | |||
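        // ensure async init finished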
CompNodeEnv* env; | |||
{ | |||
MGB_LOCK_GUARD(sd->mtx); | |||
if (i >= sd->nr_node) { | |||
break; | |||
} | |||
env = &sd->node[i].env(); | |||
} | |||
env->cnrt_env(); | |||
} | |||
MGB_LOCK_GUARD(sd->mtx); | |||
for (int i = 0; i < sd->nr_dev_used; ++i) { | |||
cnrtDev_t dev; | |||
MGB_CNRT_CHECK(cnrtGetDeviceHandle(&dev, sd->dev_info[i].dev_num)); | |||
MGB_CNRT_CHECK(cnrtSetCurrentDevice(dev)); | |||
MGB_CNRT_CHECK(cnrtSyncDevice()); | |||
} | |||
} | |||
void CambriconCompNode::foreach (thin_function<void(CompNode)> callback) { | |||
auto sd = CambriconCompNodeImpl::sd; | |||
if (!sd) | |||
return; | |||
for (int i = 0;; ++i) { | |||
CompNode cur; | |||
{ | |||
MGB_LOCK_GUARD(sd->mtx); | |||
if (i >= sd->nr_node) | |||
return; | |||
cur = make_comp_node_from_impl(&sd->node[i]); | |||
} | |||
callback(cur); | |||
} | |||
} | |||
size_t CambriconCompNode::get_device_count() { | |||
CompNodeEnv::CnrtEnv::init(); | |||
static int cnt = -1; | |||
static Spinlock mtx; | |||
MGB_LOCK_GUARD(mtx); | |||
if (cnt == -1) { | |||
unsigned int dev_cnt = 0; | |||
auto ret = cnrtGetDeviceCount(&dev_cnt); | |||
        if (ret != CNRT_RET_SUCCESS) {
            mgb_log_error("cnrtGetDeviceCount failed: %s (err %d)",
                          cnrtGetErrorStr(ret), int(ret));
        }
        cnt = dev_cnt;
mgb_assert(cnt >= 0); | |||
} | |||
return cnt; | |||
} | |||
void mgb::mem_alloc::CambriconRawAlloctor::get_mem_info(size_t& free, | |||
size_t& tot) { | |||
auto sd = CambriconCompNodeImpl::sd; | |||
int device = -1; | |||
{ | |||
cnrtDev_t dev; | |||
MGB_CNRT_CHECK(cnrtGetCurrentDevice(&dev)); | |||
for (int i = 0; i < sd->nr_dev_used; ++i) { | |||
if (sd->dev_info[i].dev == dev) { | |||
device = sd->dev_info[i].dev_num; | |||
break; | |||
} | |||
} | |||
} | |||
mgb_assert(device >= 0, | |||
"current device has not been initialized in static data"); | |||
cndevMemoryInfo_t mem_info; | |||
auto ret = cndevGetMemoryUsage(&mem_info, device); | |||
if (ret == CNDEV_SUCCESS) { | |||
constexpr size_t mb2size = 1024 * 1024; | |||
tot = static_cast<size_t>(mem_info.PhysicalMemoryTotal) * mb2size; | |||
size_t used = | |||
static_cast<size_t>(mem_info.PhysicalMemoryUsed) * mb2size; | |||
free = tot - used; | |||
return; | |||
} | |||
auto msg = | |||
ssprintf("cndevGetMemoryUsage faild %s", cndevGetErrorString(ret)); | |||
mgb_throw_raw(MemAllocError{msg}); | |||
} | |||
#else | |||
bool CambriconCompNode::available() { | |||
return false; | |||
} | |||
void CambriconCompNode::try_coalesce_all_free_memory() {} | |||
void CambriconCompNode::foreach (thin_function<void(CompNode)>) {} | |||
void CambriconCompNode::finalize() {} | |||
size_t CambriconCompNode::get_device_count() { | |||
return 0; | |||
} | |||
CambriconCompNode::Impl* CambriconCompNode::load_cambricon(const Locator&, | |||
const Locator&) { | |||
mgb_throw(MegBrainError, "cambricon disabled at compile time"); | |||
} | |||
void CambriconCompNode::sync_all() {} | |||
#undef err | |||
#endif // MGB_CAMBRICON | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
@@ -0,0 +1,39 @@ | |||
/** | |||
* \file src/core/impl/comp_node/cambricon/comp_node.h | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
*/ | |||
#pragma once | |||
#include "../impl_helper.h" | |||
namespace mgb { | |||
class CambriconCompNode final: public CompNodeImplHelper { | |||
public: | |||
static constexpr Flag sm_flag = | |||
Flag::QUEUE_LIMITED | Flag::HAS_COPY_STREAM; | |||
class CompNodeImpl; | |||
class EventImpl; | |||
//! whether cambricon comp node is available | |||
static bool available(); | |||
static void try_coalesce_all_free_memory(); | |||
static void foreach(thin_function<void(CompNode)> callback); | |||
static void finalize(); | |||
static size_t get_device_count(); | |||
static Impl* load_cambricon( | |||
const Locator &locator, const Locator &locator_logical); | |||
static void sync_all(); | |||
}; | |||
} | |||
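// Typical use goes through the public CompNode API rather than this helper | |||
// class directly; a hedged sketch, assuming cambricon devices are present: | |||
// | |||
//   auto cn = CompNode::load("cambricon0");      // device 0, default stream | |||
//   auto cn01 = CompNode::load("cambricon0:1");  // device 0, stream 1 | |||
//   auto ev = cn.create_event(); | |||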
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
@@ -15,6 +15,8 @@ | |||
#include "./cuda/comp_node.h" | |||
#include "./cpu/comp_node.h" | |||
#include "./cambricon/comp_node.h" | |||
#include "./atlas/comp_node.h" | |||
#include <cstring> | |||
#include <atomic> | |||
@@ -40,6 +42,10 @@ namespace { | |||
return "gpu"; | |||
case DT::CPU: | |||
return "cpu"; | |||
case DT::ATLAS: | |||
return "atlas"; | |||
case DT::CAMBRICON: | |||
return "cambricon"; | |||
case DT::MULTITHREAD: | |||
return "multithread"; | |||
default: | |||
@@ -145,7 +151,20 @@ CompNode::Locator CompNode::Locator::parse(const std::string &id) { | |||
DeviceType dev_type; | |||
// parse dev_type | |||
if (ptr[0] == 'a') { | |||
if (strncmp(ptr, "atlas", 5)) { | |||
err(); | |||
} | |||
dev_type = DeviceType::ATLAS; | |||
ptr += 5; | |||
} | |||
else if (ptr[2] == 'm') { | |||
if (strncmp(ptr, "cambricon", 9)) { | |||
err(); | |||
} | |||
dev_type = DeviceType::CAMBRICON; | |||
ptr += 9; | |||
} else if (ptr[0] == 'm') { | |||
if (strncmp(ptr, "multithread", 11)) { | |||
err(); | |||
} | |||
@@ -478,6 +497,13 @@ CompNode CompNode::load(const Locator& locator_physical, | |||
case DeviceType::CPU: | |||
ret = CpuCompNode::load_cpu(locator_physical, locator_logical); | |||
break; | |||
case DeviceType::ATLAS: | |||
ret = AtlasCompNode::load_atlas(locator_physical, locator_logical); | |||
break; | |||
case DeviceType::CAMBRICON: | |||
ret = CambriconCompNode::load_cambricon(locator_physical, | |||
locator_logical); | |||
break; | |||
default: | |||
mgb_throw(MegBrainError, "bad device type"); | |||
} | |||
@@ -496,20 +522,27 @@ void CompNode::finalize() { | |||
comp_node_detail::DepedentObjList::invoke_callback_and_clean(); | |||
CudaCompNode::finalize(); | |||
CpuCompNode::finalize(); | |||
CambriconCompNode::finalize(); | |||
AtlasCompNode::finalize(); | |||
} | |||
void CompNode::try_coalesce_all_free_memory() { | |||
CudaCompNode::try_coalesce_all_free_memory(); | |||
CambriconCompNode::try_coalesce_all_free_memory(); | |||
} | |||
void CompNode::sync_all() { | |||
CudaCompNode::sync_all(); | |||
CpuCompNode::sync_all(); | |||
CambriconCompNode::sync_all(); | |||
AtlasCompNode::sync_all(); | |||
} | |||
void CompNode::foreach(thin_function<void(CompNode)> callback) { | |||
CudaCompNode::foreach(callback); | |||
CpuCompNode::foreach(callback); | |||
CambriconCompNode::foreach(callback); | |||
AtlasCompNode::foreach(callback); | |||
} | |||
size_t CompNode::get_device_count(DeviceType type, bool warn) { | |||
@@ -519,6 +552,10 @@ size_t CompNode::get_device_count(DeviceType type, bool warn) { | |||
case DeviceType::MULTITHREAD: | |||
case DeviceType::CPU: | |||
return CpuCompNode::get_device_count(); | |||
case DeviceType::CAMBRICON: | |||
return CambriconCompNode::get_device_count(); | |||
case DeviceType::ATLAS: | |||
return AtlasCompNode::get_device_count(); | |||
default: | |||
mgb_throw(MegBrainError, "bad device type"); | |||
} | |||
@@ -534,6 +571,12 @@ bool CompNode::contain_flag(DeviceType device_type, Flag flag) { | |||
case DeviceType::CPU: | |||
cn_flag = CpuCompNode::sm_flag; | |||
break; | |||
case DeviceType::CAMBRICON: | |||
cn_flag = CambriconCompNode::sm_flag; | |||
break; | |||
case DeviceType::ATLAS: | |||
cn_flag = AtlasCompNode::sm_flag; | |||
break; | |||
default: | |||
mgb_throw(MegBrainError, "unexpected device type"); | |||
} | |||
@@ -528,9 +528,23 @@ class CpuCompNode::CompNodeImpl final: public CpuDispatchableBase { | |||
Impl *dest_impl, void *dest, | |||
const void *src, size_t size) override { | |||
if (!dest_impl->same_type<CpuCompNode::CompNodeImpl>()) { | |||
if (dest_impl->env().property().type == DeviceType::ATLAS) { | |||
#if MGB_ATLAS | |||
dest_impl->copy_to_device(dest, src, size); | |||
return; | |||
#else | |||
mgb_throw(MegBrainError, | |||
"Atlas comp_node used but " | |||
"MGB_ATLAS not enabled"); | |||
#endif | |||
} else { | |||
mgb_assert(locator().device == Locator::DEVICE_CPU_DEFAULT, | |||
"currently only peer copy from default cpu comp nodes " | |||
"is implemented"); | |||
"currently only peer copy from default cpu comp " | |||
"nodes " | |||
"is implemented"); | |||
} | |||
} | |||
dest_impl->copy_to_device(dest, src, size); | |||
} | |||
@@ -841,12 +855,22 @@ void CpuCompNode::CpuDispatchableBase::EventImpl::do_device_wait_by( | |||
auto type = cn_impl->env().property().type; | |||
mgb_throw_if(type != CompNode::DeviceType::CPU | |||
&& type != CompNode::DeviceType::CUDA | |||
&& type != CompNode::DeviceType::ATLAS | |||
, | |||
MegBrainError, | |||
"currently CPU can only wait for CPU, CUDA" | |||
"currently CPU can only wait for CPU, CUDA, ATLAS" | |||
); | |||
} | |||
if (cn_impl->env().property().type == CompNode::DeviceType::ATLAS) { | |||
#if MGB_ATLAS | |||
return m_comp_node_impl->sync(); | |||
#else | |||
mgb_throw(MegBrainError, | |||
"Atlas comp_node used but MGB_ATLAS not enabled"); | |||
#endif | |||
} | |||
auto version = m_record_nr_req.load(std::memory_order_relaxed); | |||
mgb_assert(version, "device wait on non-recorded event"); | |||
@@ -22,6 +22,15 @@ | |||
#endif | |||
#endif | |||
#if MGB_CAMBRICON | |||
#include "megcore_cambricon.h" | |||
#endif | |||
#if MGB_ATLAS | |||
#include "acl/acl.h" | |||
#include "megcore_atlas.h" | |||
#endif | |||
using namespace mgb; | |||
/* =================== MegDNNHandle =================== */ | |||
@@ -54,6 +63,28 @@ MegDNNHandle::MegDNNHandle(const CompNodeEnv& env) { | |||
init = true; | |||
} | |||
#endif | |||
#if MGB_CAMBRICON | |||
if (env.property().type == CompNode::DeviceType::CAMBRICON) { | |||
CompNodeEnv::CnrtEnv::init_status.init(); | |||
megcore::createDeviceHandleWithGlobalInitStatus( | |||
&m_dev_hdl, env.cnrt_env().device, 0, true); | |||
megcore::createComputingHandleWithCambriconContext( | |||
&m_comp_hdl, m_dev_hdl, 0, {env.cnrt_env().queue}); | |||
init = true; | |||
} | |||
#endif | |||
#if MGB_ATLAS | |||
if (env.property().type == CompNode::DeviceType::ATLAS) { | |||
CompNodeEnv::AtlasEnv::init_status.init(); | |||
megcore::createAtlasDeviceHandleWithGlobalInitStatus( | |||
&m_dev_hdl, env.atlas_env().device, 0, true); | |||
megcore::createComputingHandleWithAtlasContext( | |||
&m_comp_hdl, m_dev_hdl, 0, {env.atlas_env().stream}); | |||
init = true; | |||
} | |||
#endif | |||
if (env.property().type == CompNode::DeviceType::CPU) { | |||
megcoreCreateDeviceHandle(&m_dev_hdl, megcorePlatformCPU); | |||
@@ -175,6 +206,73 @@ void CompNodeEnv::init_cuda_async(int dev, CompNode comp_node, | |||
} | |||
#endif | |||
#if MGB_ATLAS | |||
void mgb::_on_atlas_error(const char* expr, int err, const char* file, | |||
const char* func, int line) { | |||
mgb_throw(AtlasError, "atlas error %d: %s (%s at %s:%s:%d)", int(err), | |||
megcore::atlas::get_error_str(err), expr, file, func, line); | |||
} | |||
CompNodeEnv::AtlasEnv::InitStatus CompNodeEnv::AtlasEnv::init_status; | |||
void CompNodeEnv::init_atlas(CompNode comp_node, const AtlasEnv& env) { | |||
m_comp_node = comp_node; | |||
m_atlas_env = env; | |||
m_property.type = DeviceType::ATLAS; | |||
m_property.mem_alignment = 64; | |||
m_atlas_env.activate(); | |||
MGB_ATLAS_CHECK(aclrtCreateStream(&m_atlas_env.stream)); | |||
m_user_data_container = std::make_unique<UserDataContainer>(); | |||
mgb_assert(m_property.mem_alignment == | |||
MegDNNHandle::get(*this).handle()->alignment_requirement()); | |||
} | |||
#endif | |||
#if MGB_CAMBRICON | |||
const char* mgb::cnml_get_error_string(cnmlStatus_t err) { | |||
switch (err) { | |||
#define cb(_err) \ | |||
case _err: \ | |||
return #_err | |||
cb(CNML_STATUS_SUCCESS); | |||
cb(CNML_STATUS_NODEVICE); | |||
cb(CNML_STATUS_DOMAINERR); | |||
cb(CNML_STATUS_INVALIDARG); | |||
cb(CNML_STATUS_LENGTHERR); | |||
cb(CNML_STATUS_OUTOFRANGE); | |||
cb(CNML_STATUS_RANGEERR); | |||
cb(CNML_STATUS_OVERFLOWERR); | |||
cb(CNML_STATUS_UNDERFLOWERR); | |||
cb(CNML_STATUS_INVALIDPARAM); | |||
cb(CNML_STATUS_BADALLOC); | |||
cb(CNML_STATUS_BADTYPEID); | |||
cb(CNML_STATUS_BADCAST); | |||
cb(CNML_STATUS_UNSUPPORT); | |||
#undef cb | |||
} | |||
return "Unknown CNML error"; | |||
} | |||
void mgb::_on_cnrt_error(const char* expr, cnrtRet_t err, const char* file, | |||
const char* func, int line) { | |||
mgb_throw(CnrtError, "cnrt error %d: %s (%s at %s:%s:%d)", int(err), | |||
cnrtGetErrorStr(err), expr, file, func, line); | |||
} | |||
void mgb::_on_cndev_error(const char* expr, cndevRet_t err, const char* file, | |||
const char* func, int line) { | |||
mgb_throw(CndevError, "cndev error %d: %s (%s at %s:%s:%d)", int(err), | |||
cndevGetErrorString(err), expr, file, func, line); | |||
} | |||
void mgb::_on_cnml_error(const char* expr, cnmlStatus_t err, const char* file, | |||
const char* func, int line) { | |||
mgb_throw(CnmlError, "cnml error %d: %s (%s at %s:%s:%d)", int(err), | |||
cnml_get_error_string(err), expr, file, func, line); | |||
} | |||
#endif | |||
void CompNodeEnv::init_cpu(const CpuEnv& env, CompNode comp_node) { | |||
m_comp_node = comp_node; | |||
@@ -188,6 +286,41 @@ void CompNodeEnv::init_cpu(const CpuEnv& env, CompNode comp_node) { | |||
} | |||
#if MGB_CAMBRICON | |||
void CompNodeEnv::init_cnrt(int dev, CompNode comp_node, | |||
const ContinuationCtx<cnrtQueue_t>& cont) { | |||
m_comp_node = comp_node; | |||
m_cnrt_env.device = dev; | |||
m_property.type = DeviceType::CAMBRICON; | |||
MGB_CNRT_CHECK(cnrtGetDeviceInfo(&m_cnrt_env.device_info, dev)); | |||
// FIXME: doc doesn't describe the alignment requirement for device memory | |||
// address | |||
m_property.mem_alignment = 1u; | |||
// ensure exception safe | |||
bool queue_created = false; | |||
MGB_MARK_USED_VAR(queue_created); | |||
MGB_TRY { | |||
m_cnrt_env.activate(); | |||
MGB_CNRT_CHECK(cnrtCreateQueue(&m_cnrt_env.queue)); | |||
queue_created = true; | |||
m_user_data_container = std::make_unique<UserDataContainer>(); | |||
cont.next(m_cnrt_env.queue); | |||
// TODO: initialize megdnn handle | |||
mgb_assert(m_property.mem_alignment == | |||
MegDNNHandle::get(*this).handle()->alignment_requirement()); | |||
} | |||
MGB_CATCH(std::exception & exc, { | |||
mgb_log_error("cnrt init failed: %s", exc.what()); | |||
if (queue_created) { | |||
MGB_CNRT_CHECK(cnrtDestroyQueue(m_cnrt_env.queue)); | |||
} | |||
cont.err(exc); | |||
throw; | |||
}) | |||
} | |||
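/* | |||
 * The queue_created flag above implements a common acquire/rollback shape: | |||
 * if anything after cnrtCreateQueue() throws, the queue is destroyed before | |||
 * the error is propagated. The same shape in plain C++ (sketch with | |||
 * hypothetical names): | |||
 * | |||
 *   bool acquired = false; | |||
 *   try { | |||
 *       acquire();        // e.g. cnrtCreateQueue | |||
 *       acquired = true; | |||
 *       use();            // may throw | |||
 *   } catch (...) { | |||
 *       if (acquired) | |||
 *           release();    // e.g. cnrtDestroyQueue | |||
 *       throw; | |||
 *   } | |||
 */ | |||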
CompNodeEnv::CnrtEnv::InitStatus CompNodeEnv::CnrtEnv::init_status; | |||
#endif | |||
void CompNodeEnv::fini() { | |||
ensure_async_init_finished(); | |||
m_user_data_container.reset(); | |||
@@ -197,6 +330,19 @@ void CompNodeEnv::fini() { | |||
MGB_CUDA_CHECK(cudaStreamDestroy(m_cuda_env.stream)); | |||
} | |||
#endif | |||
#if MGB_CAMBRICON | |||
if (m_property.type == DeviceType::CAMBRICON) { | |||
m_cnrt_env.activate(); | |||
MGB_CNRT_CHECK(cnrtDestroyQueue(m_cnrt_env.queue)); | |||
} | |||
#endif | |||
#if MGB_ATLAS | |||
if (m_property.type == DeviceType::ATLAS) { | |||
m_atlas_env.activate(); | |||
MGB_ATLAS_CHECK(aclrtDestroyStream(m_atlas_env.stream)); | |||
} | |||
#endif | |||
} | |||
#if MGB_ENABLE_COMP_NODE_ASYNC_INIT | |||
@@ -71,6 +71,29 @@ std::string CudaError::get_cuda_extra_info() { | |||
#endif | |||
} | |||
AtlasError::AtlasError(const std::string &msg): | |||
SystemError(msg) | |||
{ | |||
} | |||
CnrtError::CnrtError(const std::string& msg) : SystemError(msg) { | |||
m_msg.append(get_cnrt_extra_info()); | |||
} | |||
std::string CnrtError::get_cnrt_extra_info() { | |||
#if MGB_CAMBRICON | |||
// get last error | |||
auto err = cnrtGetLastErr(); | |||
return ssprintf("(last_err=%d(%s))", err, cnrtGetErrorStr(err)); | |||
#else | |||
return "cnrt disabled at compile time"; | |||
#endif | |||
} | |||
CndevError::CndevError(const std::string& msg) : SystemError(msg) {} | |||
CnmlError::CnmlError(const std::string& msg) : SystemError(msg) {} | |||
bool mgb::has_uncaught_exception() { | |||
#if MGB_ENABLE_EXCEPTION | |||
@@ -124,7 +124,7 @@ StaticDeviceMemoryManager::make_default_impl() { | |||
#endif // MGB_THREAD_SAFE | |||
/* ==================== AsyncVarReleaser ==================== */ | |||
#if MGB_CUDA | |||
#if MGB_CUDA || MGB_ATLAS | |||
class VarNodeMemManager::AsyncVarReleaser { | |||
struct WaiterParam { | |||
CompNode cn; | |||
@@ -247,7 +247,7 @@ bool VarNodeMemManager::ImpureMemPlanManager::check_need_realloc() { | |||
VarNodeMemManager::VarNodeMemManager(ComputingGraphImpl *graph): | |||
m_owner_graph(graph), | |||
m_seq_mem_opt(graph) | |||
#if MGB_CUDA | |||
#if MGB_CUDA || MGB_ATLAS | |||
,m_asyn_var_releaser(new AsyncVarReleaser) | |||
#endif | |||
{ | |||
@@ -255,7 +255,7 @@ VarNodeMemManager::VarNodeMemManager(ComputingGraphImpl *graph): | |||
MGB_MARK_USED_VAR(ev); | |||
// async release is only used for sync between multiple comp nodes, and | |||
// does not wait for device to finish | |||
#if MGB_CUDA | |||
#if MGB_CUDA || MGB_ATLAS | |||
m_asyn_var_releaser->wait_release_finish(); | |||
#endif | |||
m_cpu_async_release_barrier.wait_zero(); | |||
@@ -296,8 +296,7 @@ VarNodeMemManager::VarNodeMemManager(ComputingGraphImpl *graph): | |||
graph->event().register_receiver_permanent<event::CompSeqExecError>( | |||
on_comp_seq_error); | |||
#if MGB_ENABLE_VAR_DEV_MEM_DEFRAGMENTER && (MGB_CUDA || MGB_ATLAS) | |||
auto on_mem_defrag_start = [this](const event::BeforeMemDefrag&) { | |||
m_asyn_var_releaser->wait_release_finish(); | |||
}; | |||
@@ -1351,6 +1350,13 @@ void VarNodeMemManager::decr_var_mem_refcnt( | |||
m_asyn_var_releaser->add(dispatch_cn, var); | |||
break; | |||
#endif | |||
#if MGB_ATLAS | |||
case DT::ATLAS: | |||
{ | |||
m_asyn_var_releaser->add(dispatch_cn, var); | |||
break; | |||
} | |||
#endif | |||
default: | |||
mgb_throw(MegBrainError, | |||
"unsupported comp node in dynamic var shape: %s", | |||
@@ -437,7 +437,7 @@ class VarNodeMemManager { | |||
SyncableCounter m_cpu_async_release_barrier; | |||
#if MGB_CUDA | |||
#if MGB_CUDA || MGB_ATLAS | |||
//! release dynamic var on after compnode event finishes | |||
class AsyncVarReleaser; | |||
std::unique_ptr<AsyncVarReleaser> m_asyn_var_releaser; | |||
@@ -612,6 +612,12 @@ void mgb::dev_tensor_memset(const DeviceTensorND& tensor, int val) { | |||
cudaMemsetAsync(ptr, val, size, env.cuda_env().stream)); | |||
break; | |||
#endif | |||
#if MGB_ATLAS | |||
case CompNode::DeviceType::ATLAS: | |||
MGB_ATLAS_CHECK(aclrtMemsetAsync(ptr, -1, val, size, | |||
env.atlas_env().stream)); | |||
break; | |||
#endif | |||
case CompNode::DeviceType::CPU: { | |||
auto fill = [ptr, size, val]() { std::memset(ptr, val, size); }; | |||
env.cpu_env().dispatch(fill); | |||
@@ -112,6 +112,8 @@ class CompNode { | |||
CUDA = 1, | |||
CPU = 2, | |||
CAMBRICON = 3, | |||
ATLAS = 9, | |||
MULTITHREAD, | |||
MAX_DEVICE_ID, | |||
}; | |||
@@ -44,6 +44,31 @@ | |||
#endif //MGB_ENABLE_LOGGING | |||
#endif //MGB_CUDA | |||
#if MGB_ATLAS | |||
#include "acl/acl.h" | |||
#include <atomic> | |||
#if MGB_ENABLE_LOGGING | |||
#define MGB_ATLAS_CHECK(expr) \ | |||
do { \ | |||
aclError __acl_check_code = (expr); \ | |||
if (!mgb_likely(__acl_check_code == ACL_ERROR_NONE)) { \ | |||
::mgb::_on_atlas_error(#expr, __acl_check_code, __FILE__, \ | |||
__func__, __LINE__); \ | |||
} \ | |||
} while (0) | |||
#else | |||
#define MGB_ATLAS_CHECK(expr) \ | |||
do { \ | |||
aclError __acl_check_code = (expr); \ | |||
if (!mgb_likely(__acl_check_code == ACL_ERROR_NONE)) { \ | |||
::mgb::_on_atlas_error(#expr, __acl_check_code, "", "", 1); \ | |||
} \ | |||
} while (0) | |||
#endif //MGB_ENABLE_LOGGING | |||
#endif // MGB_ATLAS | |||
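// Hedged usage sketch: the macro is meant to wrap every aclError-returning | |||
// runtime call, turning any status other than ACL_ERROR_NONE into an | |||
// AtlasError with call-site information, e.g.: | |||
// | |||
//   MGB_ATLAS_CHECK(aclrtSetDevice(0)); | |||
//   aclrtStream stream; | |||
//   MGB_ATLAS_CHECK(aclrtCreateStream(&stream)); | |||
//   MGB_ATLAS_CHECK(aclrtSynchronizeStream(stream)); | |||
//   MGB_ATLAS_CHECK(aclrtDestroyStream(stream)); | |||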
//! whether to enable asynchronous initialization for CompNode and CompNodeEnv | |||
#define MGB_ENABLE_COMP_NODE_ASYNC_INIT (MGB_CUDA) | |||
@@ -61,6 +86,10 @@ | |||
#include "megbrain/utils/thin/function.h" | |||
namespace mgb { | |||
#if MGB_ATLAS | |||
[[noreturn]] void _on_atlas_error(const char* expr, aclError err, | |||
const char* file, const char* func, int line); | |||
#endif | |||
#if MGB_CUDA | |||
[[noreturn]] void _on_cuda_error(const char* expr, cudaError_t err, | |||
@@ -68,6 +97,16 @@ namespace mgb { | |||
#endif | |||
#if MGB_CAMBRICON | |||
const char* cnml_get_error_string(cnmlStatus_t err); | |||
[[noreturn]] void _on_cnrt_error(const char* expr, cnrtRet_t err, | |||
const char* file, const char* func, int line); | |||
[[noreturn]] void _on_cndev_error(const char* expr, cndevRet_t err, | |||
const char* file, const char* func, int line); | |||
[[noreturn]] void _on_cnml_error(const char* expr, cnmlStatus_t err, | |||
const char* file, const char* func, int line); | |||
#endif | |||
class CPUDispatcher : public MegcoreCPUDispatcher { | |||
public: | |||
using AffinityCallBack = thin_function<void(size_t)>; | |||
@@ -80,6 +119,7 @@ public: | |||
} | |||
}; | |||
using AtlasDispatcher = CPUDispatcher; | |||
/*! | |||
* \brief CompNode environment | |||
@@ -158,6 +198,17 @@ public: | |||
m_cuda_env.activate(); | |||
} | |||
#endif | |||
#if MGB_CAMBRICON | |||
if (m_property.type == DeviceType::CAMBRICON) { | |||
m_cnrt_env.activate(); | |||
} | |||
#endif | |||
#if MGB_ATLAS | |||
if (m_property.type == DeviceType::ATLAS) { | |||
m_atlas_env.activate(); | |||
} | |||
#endif | |||
} | |||
/*! | |||
@@ -199,6 +250,113 @@ public: | |||
const ContinuationCtx<cudaStream_t>& cont); | |||
#endif | |||
#if MGB_ATLAS | |||
struct AtlasEnv { | |||
int device = -1; | |||
aclrtStream stream = 0; | |||
struct InitStatus { | |||
bool initialized; | |||
Spinlock mtx; | |||
InitStatus() : initialized{false} {} | |||
void init() { | |||
MGB_LOCK_GUARD(mtx); | |||
if (!initialized) { | |||
auto acl_err = aclInit(nullptr); | |||
initialized = acl_err == ACL_ERROR_NONE; | |||
mgb_throw_if(!initialized, AtlasError, | |||
"acl initialize failed: (acl: %d)", | |||
static_cast<int>(acl_err)); | |||
} | |||
} | |||
~InitStatus() { | |||
MGB_LOCK_GUARD(mtx); | |||
if (initialized) { | |||
initialized = false; | |||
} | |||
} | |||
}; | |||
static InitStatus init_status; | |||
static void init() { | |||
init_status.init(); | |||
} | |||
void activate() const { | |||
init(); | |||
MGB_ATLAS_CHECK(aclrtSetDevice(device)); | |||
} | |||
}; | |||
const AtlasEnv& atlas_env() const { | |||
if (mgb_unlikely(m_property.type != DeviceType::ATLAS)) | |||
on_bad_device_type(DeviceType::ATLAS); | |||
ensure_async_init_finished(); | |||
return m_atlas_env; | |||
} | |||
//! init this as an atlas env synchronously | |||
void init_atlas(CompNode comp_node, const AtlasEnv& env); | |||
#endif | |||
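// A hedged sketch of how AtlasEnv is typically reached from an ATLAS comp | |||
// node cn (cn is assumed to exist): | |||
// | |||
//   const auto& env = CompNodeEnv::from_comp_node(cn).atlas_env(); | |||
//   env.activate();  // aclInit() once globally, then aclrtSetDevice() | |||
//   MGB_ATLAS_CHECK(aclrtSynchronizeStream(env.stream)); | |||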
#if MGB_CAMBRICON | |||
struct CnrtEnv { | |||
int device = -1; | |||
cnrtQueue_t queue = nullptr; | |||
cnrtDeviceInfo_t device_info; | |||
struct InitStatus { | |||
bool initialized; | |||
Spinlock mtx; | |||
InitStatus() : initialized{false} {} | |||
void init() { | |||
MGB_LOCK_GUARD(mtx); | |||
if (!initialized) { | |||
auto cnrt_err = cnrtInit(0); | |||
initialized = cnrt_err == CNRT_RET_SUCCESS; | |||
auto cndev_err = cndevInit(0); | |||
initialized &= cndev_err == CNDEV_SUCCESS; | |||
auto cnml_err = cnmlInit(0); | |||
initialized &= cnml_err == CNML_STATUS_SUCCESS; | |||
mgb_throw_if(!initialized, CnrtError, | |||
"cnrt/cndev/cnml initialize failed: (cnrt:%d, " | |||
"cndev:%d, cnml: %d)", | |||
static_cast<int>(cnrt_err), | |||
static_cast<int>(cndev_err), | |||
static_cast<int>(cnml_err)); | |||
} | |||
} | |||
~InitStatus() { | |||
if (initialized) { | |||
MGB_CNML_CHECK(cnmlExit()); | |||
MGB_CNDEV_CHECK(cndevRelease()); | |||
cnrtDestroy(); | |||
initialized = false; | |||
} | |||
} | |||
}; | |||
static InitStatus init_status; | |||
static void init() { | |||
init_status.init(); | |||
} | |||
void activate() const { | |||
init(); | |||
cnrtDev_t dev; | |||
MGB_CNRT_CHECK(cnrtGetDeviceHandle(&dev, device)); | |||
MGB_CNRT_CHECK(cnrtSetCurrentDevice(dev)); | |||
} | |||
}; | |||
const CnrtEnv& cnrt_env() const { | |||
if (mgb_unlikely(m_property.type != DeviceType::CAMBRICON)) | |||
on_bad_device_type(DeviceType::CAMBRICON); | |||
return m_cnrt_env; | |||
} | |||
void init_cnrt(int dev, CompNode comp_node, | |||
const ContinuationCtx<cnrtQueue_t>& cont); | |||
#endif | |||
struct CpuEnv { | |||
using Task = CPUDispatcher::Task; | |||
@@ -239,6 +397,12 @@ private: | |||
#if MGB_CUDA | |||
CudaEnv m_cuda_env; | |||
#endif | |||
#if MGB_ATLAS | |||
AtlasEnv m_atlas_env; | |||
#endif | |||
#if MGB_CAMBRICON | |||
CnrtEnv m_cnrt_env; | |||
#endif | |||
CpuEnv m_cpu_env; | |||
std::unique_ptr<UserDataContainer> m_user_data_container; | |||
@@ -139,6 +139,32 @@ public: | |||
CudaError(const std::string& msg); | |||
}; | |||
class AtlasError final: public SystemError { | |||
public: | |||
AtlasError(const std::string& msg); | |||
}; | |||
class CnrtError final : public SystemError { | |||
public: | |||
/*! | |||
* \brief get extra info for current cnrt status, to be appended in | |||
* error message | |||
*/ | |||
static std::string get_cnrt_extra_info(); | |||
CnrtError(const std::string& msg); | |||
}; | |||
class CndevError final : public SystemError { | |||
public: | |||
CndevError(const std::string& msg); | |||
}; | |||
class CnmlError final : public SystemError { | |||
public: | |||
CnmlError(const std::string& msg); | |||
}; | |||
class AssertionError final : public MegBrainError { | |||
public: | |||
@@ -40,6 +40,13 @@ TEST(TestCompNode, Parse) { | |||
ASSERT_EQ(L::parse("cpu2:23"), make_lc(D::CPU, 2, 23)); | |||
ASSERT_EQ(L::parse("cpu21:23"), make_lc(D::CPU, 21, 23)); | |||
ASSERT_EQ(L::parse("cambriconx"), make_lc(D::CAMBRICON, -1, 0)); | |||
ASSERT_EQ(L::parse("cambricon2"), make_lc(D::CAMBRICON, 2, 0)); | |||
ASSERT_EQ(L::parse("cambricon2:3"), make_lc(D::CAMBRICON, 2, 3)); | |||
ASSERT_EQ(L::parse("atlasx"), make_lc(D::ATLAS, -1, 0)); | |||
ASSERT_EQ(L::parse("atlas2"), make_lc(D::ATLAS, 2, 0)); | |||
ASSERT_EQ(L::parse("atlas2:3"), make_lc(D::ATLAS, 2, 3)); | |||
ASSERT_EQ(L::parse("xpu"), make_lc(D::UNSPEC, -1, 0)); | |||
ASSERT_EQ(L::parse("xpux"), make_lc(D::UNSPEC, -1, 0)); | |||
ASSERT_EQ(L::parse("xpu23"), make_lc(D::UNSPEC, 23, 0)); | |||
@@ -59,6 +66,8 @@ TEST(TestCompNode, Parse) { | |||
ASSERT_THROW(L::parse("cpu0:"), MegBrainError); | |||
ASSERT_THROW(L::parse("cpu0:x"), MegBrainError); | |||
ASSERT_THROW(L::parse("cpu2:23x"), MegBrainError); | |||
ASSERT_THROW(L::parse("cmabricon0"), MegBrainError); | |||
ASSERT_THROW(L::parse("atlast0"), MegBrainError); | |||
ASSERT_THROW(L::parse("multithread"), MegBrainError); | |||
ASSERT_THROW(L::parse("multithread1:"), MegBrainError); | |||
ASSERT_THROW(L::parse("multithread1:default"), MegBrainError); | |||
@@ -129,13 +138,20 @@ TEST(TestCompNode, Load) { | |||
ASSERT_EQ(CompNode::load("gpux"), cn2); | |||
ASSERT_EQ(CompNode::load("gpu1"), cn3); | |||
} | |||
#if MGB_ATLAS | |||
auto atlas0 = CompNode::load("atlas0"); | |||
auto atlas1 = CompNode::load("atlas1"); | |||
ASSERT_NE(atlas0, atlas1); | |||
#endif | |||
} | |||
TEST(TestCompNode, FreeAfterFinalize) { | |||
CompNode::finalize(); | |||
for (size_t i = 0; i < CompNode::NR_DEVICE_TYPE; ++i) { | |||
auto type = static_cast<CompNode::DeviceType>(i); | |||
if (!CompNode::get_device_count(type)) | |||
if (!check_device_type_avaiable(type) || | |||
!CompNode::get_device_count(type)) | |||
continue; | |||
auto cn = CompNode::load(CompNode::Locator{type, -1, {0}}); | |||
auto ptr = cn.alloc_device(123); | |||
@@ -275,6 +291,30 @@ TEST(TestCompNodeCuda, Uid) { | |||
} | |||
#if MGB_CAMBRICON | |||
TEST(TestCompNodeCambricon, MemNode) { | |||
REQUIRE_CAMBRICON_DEVICE(2); | |||
auto cn00 = CompNode::load("cambricon0"), | |||
cn1 = CompNode::load("cambricon1"), | |||
cn01 = CompNode::load("cambricon0:1"); | |||
ASSERT_EQ(cn00, CompNode::load("cambricon0")); | |||
ASSERT_EQ(cn00.mem_node(), cn01.mem_node()); | |||
ASSERT_NE(cn00.mem_node(), cn1.mem_node()); | |||
} | |||
#endif | |||
#if MGB_ATLAS | |||
TEST(TestCompNodeAtlas, MemNode) { | |||
auto cn00 = CompNode::load("atlas0"), | |||
cn1 = CompNode::load("atlas1"), | |||
cn01 = CompNode::load("atlas0:1"); | |||
ASSERT_EQ(cn00, CompNode::load("atlas0")); | |||
ASSERT_EQ(cn00.mem_node(), cn01.mem_node()); | |||
ASSERT_NE(cn00.mem_node(), cn1.mem_node()); | |||
} | |||
#endif | |||
TEST(TestCompNodeCPU, PhysicalDispatch) { | |||
constexpr int ID = 0x2a6453e0; | |||
using L = CompNode::Locator; | |||
@@ -421,6 +461,41 @@ TEST(TestCompNodeCPU, PeerCopyFromCUDA) { | |||
} | |||
#if MGB_CAMBRICON | |||
TEST(TestCompNodeCPU, PeerCopyFromCambricon) { | |||
REQUIRE_CAMBRICON_DEVICE(1); | |||
REQUIRE_THREAD(); | |||
auto cn_gpu = CompNode::load("cambriconx"); | |||
auto cn_cpu = CompNode::load("cpux"); | |||
HostTensorGenerator<> gen; | |||
auto a = gen({20, 3, 112, 112}); | |||
auto b = gen({20, 3, 112, 112}); | |||
auto c = gen({20, 3, 112, 112}); | |||
DeviceTensorND dev_a{cn_gpu}, dev_b{cn_cpu}, dev_c{cn_gpu}; | |||
dev_a.copy_from(*a).sync(); | |||
dev_b.copy_from(*b).sync(); | |||
dev_c.copy_from(*c).sync(); | |||
auto wait_event = cn_gpu.create_event(); | |||
dev_a.copy_from(dev_c); | |||
wait_event->record(); | |||
cn_cpu.device_wait_event(*wait_event); | |||
dev_b.copy_from(dev_a); | |||
dev_b.sync(); | |||
HostTensorND result; | |||
result.copy_from(dev_b); | |||
CompNode::sync_all(); | |||
MGB_ASSERT_TENSOR_EQ(result, *c); | |||
} | |||
#endif | |||
TEST(TestCompNodeSyncManager, HostWait) { | |||
REQUIRE_THREAD(); | |||
CompNodeSyncManager mgr(CompNode::load("xpu0")); | |||
@@ -542,6 +617,8 @@ TEST(TestCompNode, MultipleLoad) { | |||
}; | |||
for (size_t i = 1; i < CompNode::NR_DEVICE_TYPE; ++i) { | |||
auto dt = static_cast<CompNode::DeviceType>(i); | |||
if (!check_device_type_avaiable(dt)) | |||
continue; | |||
if (CompNode::get_device_count(dt)) { | |||
auto cn = CompNode::load({dt, 0, {0}}); | |||
mgb_log("comp node %s is available", cn.to_string().c_str()); | |||
@@ -552,6 +629,111 @@ TEST(TestCompNode, MultipleLoad) { | |||
} | |||
} | |||
#if MGB_CAMBRICON | |||
TEST(TestCompNodeCambricon, D2DCopy) { | |||
auto run = [](CompNode cn) { | |||
constexpr size_t size = 100 * 1024 * 1024; | |||
HostTensorND a(cn, {size}, dtype::Int32{}), b; | |||
auto pa = a.ptr<int>(); | |||
for (size_t i = 0; i < size; ++i) { | |||
pa[i] = i; | |||
} | |||
DeviceTensorND tmp, tmp1; | |||
tmp.copy_from(a); | |||
tmp1.copy_from(tmp); | |||
b.copy_from(tmp1).sync(); | |||
auto pb = b.ptr<int>(); | |||
for (size_t i = 0; i < size; ++i) { | |||
ASSERT_EQ(static_cast<int>(i), pb[i]); | |||
} | |||
CompNode::finalize(); | |||
}; | |||
REQUIRE_CAMBRICON_DEVICE(1); | |||
auto cn = CompNode::load("cambricon0"); | |||
run(cn); | |||
cn = CompNode::load("cambricon1"); | |||
run(cn); | |||
} | |||
// peer copy between different cambricon devices is not correct for now, | |||
// so this testcase is disabled | |||
#if 0 | |||
TEST(TestCompNodeCambricon, P2PCopy) { | |||
auto run_raw = []() { | |||
int v0 = 0, v1 = 1; | |||
cnrtDev_t dev0, dev1; | |||
MGB_CNRT_CHECK(cnrtGetDeviceHandle(&dev0, 0)); | |||
MGB_CNRT_CHECK(cnrtGetDeviceHandle(&dev1, 1)); | |||
int *dp0, *dp1; | |||
MGB_CNRT_CHECK(cnrtSetCurrentDevice(dev0)); | |||
MGB_CNRT_CHECK(cnrtMalloc((void**)(&dp0), sizeof(int))); | |||
MGB_CNRT_CHECK( | |||
cnrtMemcpy(dp0, &v0, sizeof(int), CNRT_MEM_TRANS_DIR_HOST2DEV)); | |||
MGB_CNRT_CHECK(cnrtSetCurrentDevice(dev1)); | |||
MGB_CNRT_CHECK(cnrtMalloc((void**)(&dp1), sizeof(int))); | |||
MGB_CNRT_CHECK( | |||
cnrtMemcpy(dp1, &v1, sizeof(int), CNRT_MEM_TRANS_DIR_HOST2DEV)); | |||
unsigned int can = 0; | |||
MGB_CNRT_CHECK(cnrtGetPeerAccessibility(&can, 0, 1)); | |||
printf("can = %s\n", can ? "TRUE" : "FALSE"); | |||
if (can) { | |||
MGB_CNRT_CHECK(cnrtMemcpyPeer(dp1, 1, dp0, 0, sizeof(int))); | |||
int get; | |||
MGB_CNRT_CHECK(cnrtMemcpy(&get, dp1, sizeof(int), | |||
CNRT_MEM_TRANS_DIR_DEV2HOST)); | |||
ASSERT_EQ(0, get); | |||
} | |||
}; | |||
auto run = [](CompNode cn0, CompNode cn1) { | |||
constexpr size_t size = 100; | |||
HostTensorND a(cn0, {size}, dtype::Int32{}), b; | |||
auto pa = a.ptr<int>(); | |||
for (size_t i = 0; i < size; ++i) { | |||
pa[i] = i; | |||
} | |||
DeviceTensorND tmp(cn0, {size}, dtype::Int32{}), | |||
tmp1(cn1, {size}, dtype::Int32{}); | |||
tmp.copy_from(a); | |||
tmp1.copy_from(tmp); | |||
b.copy_from(tmp1).sync(); | |||
auto pb = b.ptr<int>(); | |||
for (size_t i = 0; i < size; ++i) { | |||
ASSERT_EQ(static_cast<int>(i), pb[i]); | |||
} | |||
CompNode::finalize(); | |||
}; | |||
REQUIRE_CAMBRICON_DEVICE(2); | |||
auto cn0 = CompNode::load("cambricon0"), cn1 = CompNode::load("cambricon1"); | |||
run_raw(); | |||
run(cn0, cn1); | |||
} | |||
#endif | |||
#endif // MGB_CAMBRICON | |||
#if MGB_ATLAS | |||
TEST(TestCompNodeAtlas, D2DCopy) { | |||
auto run = [](CompNode cn) { | |||
constexpr size_t size = 10 * 1024 * 1024; | |||
HostTensorND a(cn, {size}, dtype::Int32{}), b; | |||
auto pa = a.ptr<int>(); | |||
for (size_t i = 0; i < size; ++i) { | |||
pa[i] = i; | |||
} | |||
DeviceTensorND tmp, tmp1; | |||
tmp.copy_from(a); | |||
tmp1.copy_from(tmp); | |||
b.copy_from(tmp1).sync(); | |||
auto pb = b.ptr<int>(); | |||
for (size_t i = 0; i < size; ++i) { | |||
ASSERT_EQ(static_cast<int>(i), pb[i]); | |||
} | |||
CompNode::finalize(); | |||
}; | |||
auto cn = CompNode::load("atlas0"); | |||
run(cn); | |||
} | |||
#endif | |||
namespace { | |||
class CompNodeDepedentObjectInst final : public CompNodeDepedentObject { | |||
@@ -13,6 +13,8 @@ | |||
#define _HEADER_MGB_BUILD_CONFIG | |||
#cmakedefine01 MGB_CUDA | |||
#cmakedefine01 MGB_CAMBRICON | |||
#cmakedefine01 MGB_ATLAS | |||
#cmakedefine01 MGB_ASSERT_LOC | |||
#cmakedefine01 MGB_ENABLE_DEBUG_UTIL | |||
#cmakedefine01 MGB_ENABLE_LOGGING | |||
@@ -54,6 +56,10 @@ | |||
#cmakedefine01 MEGDNN_THREADS_512 | |||
#cmakedefine01 MEGDNN_ENABLE_MULTI_THREADS | |||
// whether atlas is available | |||
#ifndef MGB_ATLAS | |||
#define MGB_ATLAS 0 | |||
#endif | |||
// whether cuda is available | |||
#ifndef MGB_CUDA | |||
@@ -135,6 +141,15 @@ | |||
#endif | |||
#ifndef MEGDNN_WITH_CAMBRICON | |||
#define MEGDNN_WITH_CAMBRICON 0 | |||
#endif | |||
#ifndef MGB_CAMBRICON | |||
#define MGB_CAMBRICON MEGDNN_WITH_CAMBRICON | |||
#endif | |||
// whether to enable TensorRT support | |||
#ifndef MGB_ENABLE_TENSOR_RT | |||
#define MGB_ENABLE_TENSOR_RT MGB_CUDA | |||
@@ -0,0 +1,486 @@ | |||
/** | |||
* \file src/opr/impl/atlas_runtime_op.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#include "megbrain/opr/atlas_runtime_op.h" | |||
#include <memory> | |||
#include "megbrain/common.h" | |||
#include "megbrain/graph/operator_node.h" | |||
#include "megdnn/basic_types.h" | |||
#include "megdnn/dtype.h" | |||
#if MGB_ATLAS | |||
#include "acl/acl_mdl.h" | |||
using namespace mgb; | |||
using namespace opr; | |||
namespace { | |||
/** | |||
* \brief get mgb output shape from acl shape, with batch taken from mgb | |||
*/ | |||
TensorShape acl_shape_to_mgb_shape_for_output(aclmdlIODims acl_shape, | |||
size_t batch) { | |||
TensorShape ret; | |||
ret.ndim = acl_shape.dimCount; | |||
for (size_t i = 0; i < ret.ndim; ++i) { | |||
ret[i] = acl_shape.dims[i]; | |||
} | |||
ret[0] = batch; | |||
return ret; | |||
} | |||
/** | |||
* \brief deduce the input shape from aclFormat and aipp config. | |||
* | |||
* \param acl_shape shape from om file | |||
* \param batch batchsize from mgb | |||
* \param enable_dynamic_batch True if dynamic batch size is enabled | |||
* \param om_format layout format from om file | |||
* \param aipp_input_fmt input_format in static aipp config of om file | |||
*/ | |||
TensorShape acl_shape_to_mgb_shape_for_input( | |||
aclmdlIODims acl_shape, size_t batch, bool enable_dynamic_batch, | |||
aclFormat om_format, AtlasRuntimeOpr::AippInputFormat aipp_input_fmt) { | |||
TensorShape ret; | |||
ret.ndim = acl_shape.dimCount; | |||
mgb_assert(ret.ndim == 4, | |||
"Unexpected ndim form aclmdlIODims expected 4, but got %zu", | |||
ret.ndim); | |||
for (size_t i = 0; i < ret.ndim; ++i) { | |||
ret[i] = acl_shape.dims[i]; | |||
} | |||
if (enable_dynamic_batch) { | |||
mgb_assert(ret[0] == static_cast<size_t>(-1), | |||
"batch size expected to be -1 when enable dynamic " | |||
"batchsize, got: %zu\n", | |||
ret[0]); | |||
ret[0] = batch; | |||
} else { | |||
mgb_assert(ret[0] == batch, | |||
"batchsize mismatch if no dynamic batchsize enabled, " | |||
"expected: %zu got: %zu\n", | |||
ret[0], batch); | |||
} | |||
if (aipp_input_fmt != AtlasRuntimeOpr::AippInputFormat::NO_AIPP) { | |||
mgb_assert(om_format == ACL_FORMAT_NHWC, | |||
"om format should be NHWC if enable aipp"); | |||
} | |||
return ret; | |||
} | |||
DType acl_dtype_to_mgb_dtype(aclDataType data_type) { | |||
switch (data_type) { | |||
case ACL_UINT8: | |||
return dtype::Uint8(); | |||
case ACL_FLOAT16: | |||
#if !MEGDNN_DISABLE_FLOAT16 | |||
return dtype::Float16(); | |||
#else | |||
mgb_throw(MegBrainError, | |||
"Float16 support is disabled at compile time."); | |||
#endif | |||
case ACL_FLOAT: | |||
return dtype::Float32(); | |||
case ACL_INT8: | |||
return dtype::Int8(); | |||
case ACL_INT16: | |||
return dtype::Int16(); | |||
case ACL_INT32: | |||
return dtype::Int32(); | |||
default: | |||
mgb_throw(MegBrainError, | |||
"aclDataType %x is not supported by MegBrain.", | |||
static_cast<int>(data_type)); | |||
} | |||
} | |||
/** | |||
* \brief greedily split origin_batch into batch sizes drawn from batch_choices | |||
*/ | |||
SmallVector<size_t> gen_batch_vec(size_t origin_batch, | |||
const SmallVector<size_t>& batch_choices) { | |||
SmallVector<size_t> ret; | |||
size_t idx = 0; | |||
size_t nr_batch_choices = batch_choices.size(); | |||
size_t batch = origin_batch; | |||
while (idx < nr_batch_choices) { | |||
size_t val = batch_choices[idx]; | |||
while (batch >= batch_choices[idx]) { | |||
ret.push_back(val); | |||
batch -= val; | |||
} | |||
idx++; | |||
} | |||
mgb_assert(batch == 0, | |||
"Invalid batch size %zu, can not be generate by batch choices", | |||
origin_batch); | |||
return ret; | |||
} | |||
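/* | |||
 * Worked example (assuming batch_choices is sorted in descending order, as | |||
 * the constructor below arranges): origin_batch = 7, batch_choices = {4, 2, 1} | |||
 *   -> takes 4 (7 - 4 = 3), then 2 (3 - 2 = 1), then 1 (1 - 1 = 0) | |||
 *   -> returns {4, 2, 1}; a batch that cannot be reduced to exactly 0 trips | |||
 *      the assertion above. | |||
 */ | |||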
class PtrGetter { | |||
public: | |||
PtrGetter(const VarNodeArray& vars) { | |||
for (auto&& var : vars) { | |||
m_ptrs.push_back(var->dev_tensor().raw_ptr()); | |||
m_batch_in_bytes.push_back(var->layout().stride[0] * | |||
var->layout().dtype.size()); | |||
} | |||
} | |||
std::pair<void*, size_t> get(size_t batch, size_t idx) { | |||
std::pair<void*, size_t> ret; | |||
ret.first = m_ptrs[idx]; | |||
ret.second = batch * m_batch_in_bytes[idx]; | |||
m_ptrs[idx] = reinterpret_cast<void*>( | |||
reinterpret_cast<uintptr_t>(ret.first) + ret.second); | |||
return ret; | |||
} | |||
private: | |||
SmallVector<void*> m_ptrs; | |||
SmallVector<size_t> m_batch_in_bytes; | |||
}; | |||
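// PtrGetter::get() returns the current pointer together with the byte size | |||
// of the requested sub-batch and then advances the internal cursor, so | |||
// repeated calls walk consecutive batch slices of the same tensor; a hedged | |||
// sketch: | |||
// | |||
//   PtrGetter getter(input()); | |||
//   auto first = getter.get(4, 0);   // bytes of batches [0, 4) of input 0 | |||
//   auto second = getter.get(2, 0);  // bytes of batches [4, 6) of input 0 | |||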
} // namespace | |||
/* ====================== AtlasRuntimeOpr ==================== */ | |||
MGB_DYN_TYPE_OBJ_FINAL_IMPL(AtlasRuntimeOpr); | |||
AtlasRuntimeOpr::AtlasRuntimeOpr(SharedBuffer buf, | |||
const std::pair<uint32_t, aclmdlDesc*>& model, | |||
const VarNodeArray& inputs, | |||
const OperatorNodeConfig& config) | |||
: Super(inputs[0]->owner_graph(), config, "atlas_runtime", inputs), | |||
m_buffer{std::move(buf)}, | |||
m_model_id{model.first}, | |||
m_model_desc{model.second} { | |||
mgb_assert( | |||
inputs[0]->comp_node().device_type() == CompNode::DeviceType::ATLAS, | |||
"AtlasRuntimeOpr can only be used on atlas comp node; " | |||
"got %s", | |||
inputs[0]->comp_node().to_string().c_str()); | |||
mgb_assert(m_buffer.data() != nullptr || | |||
(m_model_id != INVALID_MODEL_ID && m_model_desc != nullptr)); | |||
for (auto i : inputs) { | |||
add_input({i}); | |||
} | |||
if (m_model_id == INVALID_MODEL_ID && m_model_desc == nullptr) { | |||
MGB_ATLAS_CHECK(aclmdlLoadFromMem(m_buffer.data(), m_buffer.size(), | |||
&m_model_id)); | |||
m_model_desc = aclmdlCreateDesc(); | |||
MGB_ATLAS_CHECK(aclmdlGetDesc(m_model_desc, m_model_id)); | |||
m_is_model_holder = true; | |||
} | |||
//! aipp input format | |||
m_aipp_input_format = SmallVector<AippInputFormat>(inputs.size()); | |||
aclAippInfo aipp_info; | |||
for (size_t i = 0; i < inputs.size(); ++i) { | |||
aclError acl_err = aclmdlGetFirstAippInfo(m_model_id, i, &aipp_info); | |||
if (ACL_ERROR_NONE == acl_err) { | |||
switch (aipp_info.inputFormat) { | |||
case ACL_YUV420SP_U8: | |||
m_aipp_input_format[i] = AippInputFormat::YUV420SP_U8; | |||
break; | |||
case ACL_RGB888_U8: | |||
m_aipp_input_format[i] = AippInputFormat::RGB888_U8; | |||
break; | |||
default: | |||
mgb_throw(MegBrainError, | |||
"Unsupported aclAippInputFormat for input %zu. ", | |||
i); | |||
} | |||
} else if (ACL_ERROR_NOT_STATIC_AIPP == acl_err) { | |||
m_aipp_input_format[i] = AippInputFormat::NO_AIPP; | |||
} else { | |||
MGB_ATLAS_CHECK(acl_err); | |||
} | |||
} | |||
size_t dynamic_index; | |||
auto errcode = aclmdlGetInputIndexByName( | |||
m_model_desc, ACL_DYNAMIC_TENSOR_NAME, &dynamic_index); | |||
if (errcode == ACL_ERROR_NONE) { | |||
aclmdlHW hw_info; | |||
MGB_ATLAS_CHECK( | |||
aclmdlGetDynamicHW(m_model_desc, dynamic_index, &hw_info)); | |||
mgb_assert(hw_info.hwCount == 0, "Currently not support dynamic HW"); | |||
} | |||
//! dynamic batch size | |||
aclmdlBatch acl_batch; | |||
MGB_ATLAS_CHECK(aclmdlGetDynamicBatch(m_model_desc, &acl_batch)); | |||
if (acl_batch.batchCount) { | |||
size_t dynamic_data_size = | |||
aclmdlGetInputSizeByIndex(m_model_desc, dynamic_index); | |||
m_dyn_batch_tensor = DeviceTensorND( | |||
inputs[0]->comp_node(), {{dynamic_data_size}, dtype::Uint8()}); | |||
for (size_t i = 0; i < acl_batch.batchCount; ++i) { | |||
m_dyn_batch_choices.push_back( | |||
static_cast<size_t>(acl_batch.batch[i])); | |||
} | |||
std::sort(m_dyn_batch_choices.begin(), m_dyn_batch_choices.end(), | |||
std::greater<>()); | |||
} | |||
//! add output | |||
size_t nr_outputs = aclmdlGetNumOutputs(m_model_desc); | |||
using F = VarNode::Flag; | |||
if (nr_outputs == 1) { | |||
add_output(None); | |||
} else { | |||
for (size_t i = 0; i < nr_outputs; ++i) { | |||
add_output(ssprintf("o%zu", i)); | |||
} | |||
} | |||
if (!m_dyn_batch_choices.empty()) { | |||
/** | |||
* \warning If dynamic batchsize is enabled, the output memory must be | |||
* allocated with the size of the largest batch_size, so we set the | |||
* NO_SYS_MEM_ALLOC flag and allocate it manually in scn_do_execute. | |||
*/ | |||
for (size_t i = 0; i < nr_outputs; ++i) { | |||
output(i)->add_flag(F::NO_SYS_MEM_ALLOC); | |||
} | |||
} | |||
add_equivalence_component<mgb::ScalarHash<const void*>>(m_buffer.data()); | |||
} | |||
AtlasRuntimeOpr::~AtlasRuntimeOpr() { | |||
if (m_is_model_holder) { | |||
MGB_ATLAS_CHECK(aclmdlUnload(m_model_id)); | |||
MGB_ATLAS_CHECK(aclmdlDestroyDesc(m_model_desc)); | |||
} | |||
} | |||
void AtlasRuntimeOpr::scn_do_execute() { | |||
auto&& acl_env = | |||
CompNodeEnv::from_comp_node(input(0)->comp_node()).atlas_env(); | |||
acl_env.activate(); | |||
if (!m_dyn_batch_choices.empty()) { | |||
for (size_t i = 0; i < output().size(); i++) { | |||
auto output_size = aclmdlGetOutputSizeByIndex(m_model_desc, i); | |||
auto ovar = output(i); | |||
ovar->shape_alloc(ovar->shape(), output_size); | |||
} | |||
} | |||
PtrGetter input_getter(input()); | |||
PtrGetter output_getter(output()); | |||
bool enable_dynamic_batch = !m_dyn_batch_choices.empty(); | |||
size_t nr_inputs = aclmdlGetNumInputs(m_model_desc); | |||
size_t nr_outputs = aclmdlGetNumOutputs(m_model_desc); | |||
size_t input_batch = input(0)->layout()[0]; | |||
if (enable_dynamic_batch) { | |||
mgb_assert(nr_inputs == input().size() + 1, | |||
"nr inputs got from om model should be one more than got " | |||
"from megbrain"); | |||
} | |||
SmallVector<size_t> batches_each_run; | |||
if (enable_dynamic_batch) { | |||
batches_each_run = gen_batch_vec(input_batch, m_dyn_batch_choices); | |||
} else { | |||
batches_each_run.push_back(input_batch); | |||
} | |||
for (auto&& batch : batches_each_run) { | |||
//! prepare input | |||
auto model_inputs = aclmdlCreateDataset(); | |||
mgb_assert(model_inputs != nullptr, | |||
"failed to create atlas input dataset."); | |||
for (size_t i = 0; i < input().size(); i++) { | |||
auto value_pair = input_getter.get(batch, i); | |||
auto input_size = aclmdlGetInputSizeByIndex(m_model_desc, i); | |||
if (enable_dynamic_batch) { | |||
mgb_assert(input_size == value_pair.second / batch * | |||
m_dyn_batch_choices[0], | |||
"input %zu size mismatch, expected: %zu got: %zu", i, | |||
input_size, | |||
value_pair.second / batch * | |||
m_dyn_batch_choices[0]); | |||
} | |||
aclDataBuffer* input_db = | |||
aclCreateDataBuffer(value_pair.first, value_pair.second); | |||
mgb_assert(input_db != nullptr, | |||
"failed to create atlas input data buffer for input " | |||
"%zu:%s.", | |||
i, input(i)->cname()); | |||
aclmdlAddDatasetBuffer(model_inputs, input_db); | |||
} | |||
//! append unit tensor for dynamic batch | |||
if (enable_dynamic_batch) { | |||
aclDataBuffer* input_db = aclCreateDataBuffer( | |||
reinterpret_cast<void*>(m_dyn_batch_tensor.raw_ptr()), | |||
m_dyn_batch_tensor.layout().span().dist_byte()); | |||
mgb_assert(input_db != nullptr, | |||
"failed to create atlas input data buffer for dynamic " | |||
"batch tensor."); | |||
MGB_ATLAS_CHECK(aclmdlAddDatasetBuffer(model_inputs, input_db)); | |||
MGB_ATLAS_CHECK(aclmdlSetDynamicBatchSize( | |||
m_model_id, model_inputs, input().size(), | |||
static_cast<uint64_t>(batch))); | |||
} | |||
//! prepare output | |||
auto model_outputs = aclmdlCreateDataset(); | |||
mgb_assert(model_outputs != nullptr, | |||
"failed to create atlas output dataset."); | |||
for (size_t i = 0; i < nr_outputs; i++) { | |||
auto value_pair = output_getter.get(batch, i); | |||
aclDataBuffer* output_db = | |||
aclCreateDataBuffer(value_pair.first, value_pair.second); | |||
mgb_assert(output_db != nullptr, | |||
"failed to create atlas output data buffer for output " | |||
"%zu:%s.", | |||
i, output(i)->cname()); | |||
aclmdlAddDatasetBuffer(model_outputs, output_db); | |||
} | |||
MGB_ATLAS_CHECK(aclmdlExecute(m_model_id, model_inputs, model_outputs)); | |||
for (size_t i = 0; i < nr_inputs; ++i) { | |||
aclDataBuffer* db_ptr = aclmdlGetDatasetBuffer(model_inputs, i); | |||
MGB_ATLAS_CHECK(aclDestroyDataBuffer(db_ptr)); | |||
} | |||
for (size_t i = 0; i < nr_outputs; ++i) { | |||
aclDataBuffer* db_ptr = aclmdlGetDatasetBuffer(model_outputs, i); | |||
MGB_ATLAS_CHECK(aclDestroyDataBuffer(db_ptr)); | |||
} | |||
MGB_ATLAS_CHECK(aclmdlDestroyDataset(model_inputs)); | |||
MGB_ATLAS_CHECK(aclmdlDestroyDataset(model_outputs)); | |||
} | |||
} | |||
void AtlasRuntimeOpr::get_output_var_shape(const TensorShapeArray& inp_shape, | |||
TensorShapeArray& out_shape) const { | |||
size_t nr_inputs = aclmdlGetNumInputs(m_model_desc); | |||
size_t batch_size = inp_shape[0][0]; | |||
//! enable dynamic batchsize | |||
if (!m_dyn_batch_choices.empty()) { | |||
mgb_assert(!gen_batch_vec(batch_size, m_dyn_batch_choices).empty()); | |||
mgb_assert(nr_inputs == inp_shape.size() + 1, | |||
"nr inputs got from om model should be one more than got " | |||
"from megbrain"); | |||
} | |||
for (size_t i = 0; i < inp_shape.size(); ++i) { | |||
aclmdlIODims input_dims; | |||
MGB_ATLAS_CHECK(aclmdlGetInputDimsV2(m_model_desc, i, &input_dims)); | |||
auto om_format = aclmdlGetInputFormat(m_model_desc, i); | |||
TensorShape shape_from_om = acl_shape_to_mgb_shape_for_input( | |||
input_dims, batch_size, !m_dyn_batch_choices.empty(), om_format, | |||
m_aipp_input_format[i]); | |||
mgb_assert(shape_from_om.eq_shape(inp_shape[i]), | |||
"shape mismatch of input %zu, expected: %s got: %s", i, | |||
shape_from_om.to_string().c_str(), | |||
inp_shape[i].to_string().c_str()); | |||
} | |||
for (size_t i = 0; i < out_shape.size(); ++i) { | |||
aclmdlIODims output_dims; | |||
MGB_ATLAS_CHECK(aclmdlGetOutputDims(m_model_desc, i, &output_dims)); | |||
out_shape[i] = | |||
acl_shape_to_mgb_shape_for_output(output_dims, batch_size); | |||
} | |||
} | |||
void AtlasRuntimeOpr::add_input_layout_constraint() { | |||
//! default contiguous | |||
for (auto i : input()) { | |||
i->add_layout_constraint_contiguous(); | |||
} | |||
} | |||
void AtlasRuntimeOpr::init_output_dtype() { | |||
DType dt_acl, dt_input; | |||
for (size_t i = 0; i < input().size(); ++i) { | |||
dt_acl = | |||
acl_dtype_to_mgb_dtype(aclmdlGetInputDataType(m_model_desc, i)); | |||
dt_input = input(i)->dtype(); | |||
mgb_assert(dt_acl.valid() && dt_input.valid() && | |||
dt_acl.enumv() == dt_input.enumv(), | |||
"dtype mismatch of input %zu: expected %s, " | |||
"got %s", | |||
i, dt_acl.name(), dt_input.name()); | |||
} | |||
for (size_t i = 0; i < output().size(); ++i) { | |||
dt_acl = acl_dtype_to_mgb_dtype( | |||
aclmdlGetOutputDataType(m_model_desc, i)); | |||
mgb_assert(dt_acl.valid(), | |||
"output dtype checking failed: invalid dtype returned."); | |||
if (dt_acl.enumv() == DTypeEnum::QuantizedS8) { | |||
mgb_assert(output(i)->dtype().valid(), | |||
"user should specify scale of output tensor of " | |||
"AtlasRuntimeOpr."); | |||
} | |||
if (!output(i)->dtype().valid()) | |||
output(i)->dtype(dt_acl); | |||
} | |||
} | |||
SymbolVarArray AtlasRuntimeOpr::make(SharedBuffer buf, | |||
const SymbolVarArray& src, | |||
const OperatorNodeConfig& config) { | |||
VarNodeArray var_node_array = cg::to_var_node_array(src); | |||
auto atlas_runtime_opr = std::make_unique<AtlasRuntimeOpr>( | |||
std::move(buf), | |||
std::pair<uint32_t, aclmdlDesc*>{INVALID_MODEL_ID, nullptr}, | |||
var_node_array, config); | |||
auto ret = cg::to_symbol_var_array( | |||
src[0].node() | |||
->owner_graph() | |||
->insert_opr(std::move(atlas_runtime_opr)) | |||
->output()); | |||
return ret; | |||
} | |||
SymbolVarArray AtlasRuntimeOpr::make(const void* buf, size_t size, | |||
const SymbolVarArray& src, | |||
const OperatorNodeConfig& config) { | |||
mgb_throw_if(!CompNode::get_device_count(CompNode::DeviceType::ATLAS), | |||
SystemError, | |||
"can not create AtlasRuntimeOpr when atlas is not " | |||
"available"); | |||
std::shared_ptr<uint8_t> shptr{new uint8_t[size], | |||
[](uint8_t* p) { delete[] p; }}; | |||
memcpy(shptr.get(), buf, size); | |||
SharedBuffer buffer{std::move(shptr), size}; | |||
return make(std::move(buffer), src, config); | |||
} | |||
SymbolVarArray AtlasRuntimeOpr::make( | |||
const SharedBuffer buf, const std::pair<uint32_t, aclmdlDesc*>& model, | |||
const SymbolVarArray& src, const OperatorNodeConfig& config) { | |||
VarNodeArray var_node_array = cg::to_var_node_array(src); | |||
auto atlas_runtime_opr = std::make_unique<AtlasRuntimeOpr>( | |||
buf, model, var_node_array, config); | |||
auto ret = cg::to_symbol_var_array( | |||
src[0].node() | |||
->owner_graph() | |||
->insert_opr(std::move(atlas_runtime_opr)) | |||
->output()); | |||
return ret; | |||
} | |||
constexpr uint32_t AtlasRuntimeOpr::INVALID_MODEL_ID; | |||
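// Hedged usage sketch (read_file is a hypothetical helper; x is an input | |||
// SymbolVar on an atlas comp node): | |||
// | |||
//   std::vector<uint8_t> om = read_file("model.om"); | |||
//   auto ys = opr::AtlasRuntimeOpr::make(om.data(), om.size(), {x}); | |||
//   // ys[i] are the model output SymbolVars | |||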
#endif // MGB_ATLAS | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,17 @@ | |||
decl_raw_opr( | |||
'atlas_runtime', | |||
desc='create an operator that could load and run acl offline model', | |||
inputs=[ | |||
Doc('inputs', 'input vars', 'list of :class:`.SymbolVar`'), | |||
Doc('data_bytes', 'serialized acl model'), | |||
], | |||
body=[ | |||
'assert isinstance(data_bytes, bytes), ' | |||
'"data must be bytes; got {}".format(type(data_bytes))', | |||
'output = _mgb._Opr.atlas_runtime(inputs, data_bytes, config)', | |||
'cvt_result_kwargs["explode_single"] = False', | |||
], | |||
) | |||
# vim: ft=python | |||
@@ -0,0 +1,68 @@ | |||
/** | |||
* \file src/opr/impl/atlas_runtime_opr.sereg.h | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#include "megbrain/opr/atlas_runtime_op.h" | |||
#include "megbrain/serialization/sereg.h" | |||
#if MGB_ATLAS | |||
namespace mgb { | |||
namespace serialization { | |||
template <> | |||
struct OprLoadDumpImpl<opr::AtlasRuntimeOpr, 0> { | |||
static void dump(OprDumpContext& ctx, const cg::OperatorNodeBase& opr_) { | |||
auto&& opr = opr_.cast_final_safe<opr::AtlasRuntimeOpr>(); | |||
auto&& buf = opr.buffer(); | |||
ctx.dump_buf_with_len(buf.data(), buf.size()); | |||
} | |||
static cg::OperatorNodeBase* load(OprLoadContext& ctx, | |||
const cg::VarNodeArray& inputs, | |||
const OperatorNodeConfig& config) { | |||
inputs.at(0)->comp_node().activate(); | |||
auto buf = ctx.load_shared_buf_with_len(); | |||
return opr::AtlasRuntimeOpr::make( | |||
std::move(buf), cg::to_symbol_var_array(inputs), | |||
config) | |||
.at(0) | |||
.node() | |||
->owner_opr(); | |||
} | |||
}; | |||
} // namespace serialization | |||
namespace opr { | |||
cg::OperatorNodeBase* opr_shallow_copy_atlas_runtime_opr( | |||
const serialization::OprShallowCopyContext& ctx, | |||
const cg::OperatorNodeBase& opr_, const VarNodeArray& inputs, | |||
const OperatorNodeConfig& config) { | |||
MGB_MARK_USED_VAR(ctx); | |||
auto&& opr = opr_.cast_final_safe<AtlasRuntimeOpr>(); | |||
return AtlasRuntimeOpr::make(opr.buffer(), opr.model(), | |||
cg::to_symbol_var_array(inputs), | |||
config) | |||
.at(0) | |||
.node() | |||
->owner_opr(); | |||
} | |||
MGB_SEREG_OPR(AtlasRuntimeOpr, 0); | |||
MGB_REG_OPR_SHALLOW_COPY(AtlasRuntimeOpr, opr_shallow_copy_atlas_runtime_opr); | |||
} // namespace opr | |||
} // namespace mgb | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,87 @@ | |||
/** | |||
* \file src/opr/include/megbrain/opr/atlas_runtime_op.h | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#pragma once | |||
#include <memory> | |||
#include "megbrain/comp_node_env.h" | |||
#include "megbrain/graph.h" | |||
#include "megbrain/serialization/file.h" | |||
#if MGB_ATLAS | |||
#include "acl/acl.h" | |||
namespace mgb { | |||
namespace opr { | |||
MGB_DEFINE_OPR_CLASS(AtlasRuntimeOpr, | |||
cg::SingleCNOutshapePureByInshapeOprBase) // { | |||
public: | |||
using SharedBuffer = mgb::serialization::SharedBuffer; | |||
enum AippInputFormat {NO_AIPP, YUV420SP_U8, RGB888_U8}; | |||
void scn_do_execute() override; | |||
void get_output_var_shape(const TensorShapeArray& inp_shape, | |||
TensorShapeArray& out_shape) const override; | |||
void add_input_layout_constraint() override; | |||
void init_output_dtype() override; | |||
/** | |||
* \brief create AtlasRuntimeOpr from a serialized model buffer, or | |||
* attach to an existing loaded model. | |||
* | |||
* Either \p buf is set, or model_id and model_desc are set. | |||
*/ | |||
AtlasRuntimeOpr(SharedBuffer buf, | |||
const std::pair<uint32_t, aclmdlDesc*>& model, | |||
const VarNodeArray& inputs, | |||
const OperatorNodeConfig& config); | |||
~AtlasRuntimeOpr(); | |||
const SharedBuffer& buffer() const { return m_buffer; } | |||
std::pair<uint32_t, aclmdlDesc*> model() const { | |||
return {m_model_id, m_model_desc}; | |||
} | |||
static SymbolVarArray make(SharedBuffer buf, const SymbolVarArray& src, | |||
const OperatorNodeConfig& config = {}); | |||
static SymbolVarArray make(SharedBuffer buf, | |||
const std::pair<uint32_t, aclmdlDesc*>& model, | |||
const SymbolVarArray& src, | |||
const OperatorNodeConfig& config = {}); | |||
static SymbolVarArray make(const void* buf, size_t size, | |||
const SymbolVarArray& src, | |||
const OperatorNodeConfig& config = {}); | |||
private: | |||
SharedBuffer m_buffer; | |||
constexpr static uint32_t INVALID_MODEL_ID = -1; | |||
uint32_t m_model_id = INVALID_MODEL_ID; | |||
aclmdlDesc* m_model_desc = nullptr; | |||
//! if true, this opr owns the model and unloads it on destruction | |||
bool m_is_model_holder = false; | |||
SmallVector<AippInputFormat> m_aipp_input_format; | |||
//! Atlas needs a 64bit device tensor to hold the dynamic batch state | |||
DeviceTensorND m_dyn_batch_tensor; | |||
SmallVector<size_t> m_dyn_batch_choices; | |||
}; | |||
} // namespace opr | |||
} // namespace mgb | |||
#endif // MGB_ATLAS | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,202 @@ | |||
/** | |||
* \file src/opr/test/atlas_runtime_op.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#include "megbrain/opr/tensor_manip.h" | |||
#include "megdnn/dtype.h" | |||
#if MGB_ATLAS | |||
#include "megbrain/comp_node_env.h" | |||
#include "megbrain/opr/io.h" | |||
#include "megbrain/opr/basic_arith.h" | |||
#include "megbrain/test/helper.h" | |||
#include "megbrain/opr/atlas_runtime_op.h" | |||
#include "megbrain/serialization/serializer.h" | |||
#include "megbrain/plugin/profiler.h" | |||
#include <random> | |||
#include <vector> | |||
#include <stdio.h> | |||
#include "./atlas_models.h" | |||
using namespace mgb; | |||
using namespace opr; | |||
using namespace serialization; | |||
TEST(TestOprAtlas, Basic) { | |||
HostTensorGenerator<> gen; | |||
const auto& graph = ComputingGraph::make(); | |||
const auto& host_x = gen({4, 3, 16, 16}); | |||
//! run om model | |||
const auto& om_buffer = ATLAS_MODEL.at("model_om"); | |||
auto cn = CompNode::load("atlas0"); | |||
auto x = Host2DeviceCopy::make(*graph, host_x, cn); | |||
auto y = opr::AtlasRuntimeOpr::make(om_buffer.first, om_buffer.second, | |||
{x})[0]; | |||
HostTensorND host_om; | |||
auto om_func = graph->compile({make_callback_copy(y, host_om, true)}); | |||
om_func->execute().wait(); | |||
//! run mdl model | |||
const auto& mdl_buffer = ATLAS_MODEL.at("model_mdl"); | |||
auto loader = GraphLoader::make( | |||
InputFile::make_mem_proxy(mdl_buffer.first, mdl_buffer.second)); | |||
auto rst = loader->load(); | |||
auto input = rst.tensor_map.at("d"); | |||
input->copy_from(*host_x).sync(); | |||
HostTensorND host_mdl; | |||
auto mgb_func = rst.graph_compile( | |||
{make_callback_copy(rst.output_var_list[0], host_mdl)}); | |||
mgb_func->execute().wait(); | |||
    //! On Atlas the internal computation is fp16, hence the relaxed tolerance
MGB_ASSERT_TENSOR_NEAR(host_mdl, host_om, 1e-3); | |||
} | |||
TEST(TestOprAtlas, DynamicBatch) { | |||
for (size_t batch : {1, 6}) { | |||
HostTensorGenerator<> gen; | |||
const auto& graph = ComputingGraph::make(); | |||
const auto& host_x = gen({batch, 3, 16, 16}); | |||
//! run om model | |||
const auto& om_buffer = ATLAS_MODEL.at("model_dyn_om"); | |||
auto cn = CompNode::load("atlas0"); | |||
auto x = Host2DeviceCopy::make(*graph, host_x, cn); | |||
auto y = opr::AtlasRuntimeOpr::make(om_buffer.first, om_buffer.second, | |||
{x})[0]; | |||
HostTensorND host_om; | |||
auto om_func = graph->compile({make_callback_copy(y, host_om, true)}); | |||
om_func->execute().wait(); | |||
//! run mdl model | |||
const auto& mdl_buffer = ATLAS_MODEL.at("model_mdl"); | |||
auto loader = GraphLoader::make( | |||
InputFile::make_mem_proxy(mdl_buffer.first, mdl_buffer.second)); | |||
auto rst = loader->load(); | |||
auto input = rst.tensor_map.at("d"); | |||
input->copy_from(*host_x).sync(); | |||
HostTensorND host_mdl; | |||
auto mgb_func = rst.graph_compile( | |||
{make_callback_copy(rst.output_var_list[0], host_mdl)}); | |||
mgb_func->execute().wait(); | |||
        //! On Atlas the internal computation is fp16, hence the relaxed tolerance
MGB_ASSERT_TENSOR_NEAR(host_mdl, host_om, 1e-3); | |||
} | |||
} | |||
TEST(TestOprAtlas, Rgb888) { | |||
HostTensorGenerator<dtype::Uint8, RandomDistribution::UNIFORM> gen; | |||
const auto& graph = ComputingGraph::make(); | |||
    const auto& host_x = gen({1, 3, 16, 16});
//! run om model | |||
const auto& om_buffer = ATLAS_MODEL.at("model_rgb_om"); | |||
auto x = Host2DeviceCopy::make(*graph, host_x); | |||
x = opr::Dimshuffle::make(x, {0, 2, 3, 1}); | |||
auto cn = CompNode::load("atlas0"); | |||
auto atlas_x = Copy::make(x, {cn}); | |||
auto y = opr::AtlasRuntimeOpr::make(om_buffer.first, om_buffer.second, | |||
{atlas_x})[0]; | |||
HostTensorND host_om; | |||
auto om_func = graph->compile({make_callback_copy(y, host_om, true)}); | |||
om_func->execute().wait(); | |||
//! run mdl model | |||
const auto& mdl_buffer = ATLAS_MODEL.at("model_aipp_mdl"); | |||
auto loader = GraphLoader::make( | |||
InputFile::make_mem_proxy(mdl_buffer.first, mdl_buffer.second)); | |||
auto rst = loader->load(); | |||
auto input = rst.tensor_map.at("d"); | |||
input->copy_from(*host_x).sync(); | |||
HostTensorND host_mdl; | |||
auto mgb_func = rst.graph_compile( | |||
{make_callback_copy(rst.output_var_list[0], host_mdl)}); | |||
mgb_func->execute().wait(); | |||
    //! On Atlas the internal computation is fp16, hence the relaxed tolerance
    MGB_ASSERT_TENSOR_NEAR(host_mdl, host_om, 1e-3);
} | |||
TEST(TestOprAtlas, Yuv) { | |||
    //! YUV420SP input has to be preprocessed by AIPP, so here we only check
    //! that execution succeeds with the expected shapes.
HostTensorGenerator<dtype::Uint8, RandomDistribution::UNIFORM> gen; | |||
const auto& graph = ComputingGraph::make(); | |||
    const auto& host_x = gen({1, 24, 16, 1});
//! run om model | |||
const auto& om_buffer = ATLAS_MODEL.at("model_yuv_om"); | |||
auto cn = CompNode::load("atlas0"); | |||
auto x = Host2DeviceCopy::make(*graph, host_x, cn); | |||
auto y = opr::AtlasRuntimeOpr::make(om_buffer.first, om_buffer.second, | |||
{x})[0]; | |||
HostTensorND host_om; | |||
auto om_func = graph->compile({make_callback_copy(y, host_om, true)}); | |||
om_func->execute().wait(); | |||
} | |||
TEST(TestOprAtlas, Serialization) { | |||
using namespace serialization; | |||
HostTensorGenerator<> gen; | |||
const auto& graph = ComputingGraph::make(); | |||
const auto& host_x = gen({4, 3, 16, 16}); | |||
const auto& om_buffer = ATLAS_MODEL.at("model_om"); | |||
auto cn = CompNode::load("atlas0"); | |||
auto x = Host2DeviceCopy::make(*graph, host_x, cn); | |||
auto y = opr::AtlasRuntimeOpr::make(om_buffer.first, om_buffer.second, | |||
{x})[0]; | |||
auto fname = output_file("AtlasRuntimeOprTest"); | |||
auto dump = [&]() { | |||
auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str())); | |||
auto rst = dumper->dump({y}); | |||
ASSERT_EQ(rst.outputs.size(), 1u); | |||
}; | |||
auto load = [&]() { | |||
auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str())); | |||
auto rst = loader->load(); | |||
ASSERT_EQ(rst.output_var_list.size(), 1u); | |||
}; | |||
dump(); | |||
load(); | |||
} | |||
TEST(TestOprAtlas, Profiling) { | |||
HostTensorGenerator<> gen; | |||
const auto& graph = ComputingGraph::make(); | |||
GraphProfiler profiler{graph.get()}; | |||
const auto& host_x = gen({1, 3, 16, 16}); | |||
//! run om model | |||
const auto& om_buffer = ATLAS_MODEL.at("model_dyn_om"); | |||
auto cn = CompNode::load("atlas0"); | |||
auto x = Host2DeviceCopy::make(*graph, host_x, cn); | |||
auto y = opr::AtlasRuntimeOpr::make(om_buffer.first, om_buffer.second, | |||
{x})[0]; | |||
HostTensorND host_om; | |||
auto om_func = graph->compile({make_callback_copy(y, host_om, true)}); | |||
om_func->execute().wait(); | |||
profiler.to_json_full(om_func.get()) | |||
->writeto_fpath(output_file("atlas_runtime_opr_profile.json")); | |||
} | |||
#endif // MGB_ATLAS | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
@@ -118,6 +118,8 @@ void run_test(const PluginMaker& plugin_maker, | |||
const ResultChecker& result_checker) { | |||
for (size_t i = 1; i < CompNode::NR_DEVICE_TYPE; ++i) { | |||
auto type = static_cast<CompNode::DeviceType>(i); | |||
if (!check_device_type_avaiable(type)) | |||
continue; | |||
if (CompNode::get_device_count(type)) { | |||
auto cn = CompNode::load({type, -1, 0}); | |||
if (cn.contain_flag(CompNode::Flag::SUPPORT_RECORDER)) { | |||
@@ -188,4 +190,3 @@ TEST(TestOprIODump, Binary) { | |||
} | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
@@ -32,6 +32,12 @@ namespace mgb{void call_sereg(){}} | |||
#if MGB_ENABLE_TENSOR_RT | |||
#include "../../tensorrt/impl/tensorrt_opr.sereg.h" | |||
#endif | |||
#if MGB_ATLAS | |||
#include "../../opr/impl/atlas_runtime_op.sereg.h" | |||
#endif | |||
#if MGB_JIT | |||
#include "../../jit/impl/jit.sereg.h" | |||
#endif | |||
#if MGB_CAMBRICON | |||
#include "../../cambricon/impl/cambricon_runtime_opr.sereg.h" | |||
#endif
@@ -310,6 +310,28 @@ bool mgb::check_gpu_available(size_t num) { | |||
} | |||
bool mgb::check_cambricon_device_available(size_t num) { | |||
if (CompNode::get_device_count(CompNode::DeviceType::CAMBRICON) < num) { | |||
mgb_log_warn("skip test case that requires %zu cambricon device(s)", | |||
num); | |||
return false; | |||
} | |||
return true; | |||
} | |||
bool mgb::check_device_type_avaiable(CompNode::DeviceType device_type) { | |||
switch (device_type) { | |||
case mgb::CompNode::DeviceType::CUDA: | |||
case mgb::CompNode::DeviceType::CPU: | |||
case mgb::CompNode::DeviceType::CAMBRICON: | |||
case mgb::CompNode::DeviceType::ATLAS: | |||
case mgb::CompNode::DeviceType::MULTITHREAD: | |||
return true; | |||
default: | |||
return false; | |||
} | |||
return false; | |||
} | |||
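A minimal gating sketch (the device choice is illustrative; the loop in run_test above is the real call site):

    auto type = CompNode::DeviceType::ATLAS;
    if (!check_device_type_avaiable(type))
        return;  // this device type is not handled by the test helpers
    auto cn = CompNode::load({type, -1, 0});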
bool mgb::check_compute_capability(int major, int minor) { | |||
#if MGB_CUDA | |||
@@ -437,8 +437,6 @@ std::vector<CompNode> load_multiple_xpus(size_t num); | |||
//! check whether given number of GPUs is available | |||
bool check_gpu_available(size_t num); | |||
//! check whether given number of AMD GPUs is available | |||
bool check_amd_gpu_available(size_t num); | |||
//! check whether given number of cambricon devices is available | |||
bool check_cambricon_device_available(size_t num); | |||
@@ -446,6 +444,9 @@ bool check_cambricon_device_available(size_t num); | |||
//! check current capability >= major.minor | |||
bool check_compute_capability(int major, int minor); | |||
//! check whether the given comp node device type is handled by the test helpers
bool check_device_type_avaiable(CompNode::DeviceType device_type); | |||
//! hook persistent cache get calls during the lifetime | |||
class PersistentCacheHook { | |||
class HookedImpl; | |||
@@ -479,6 +480,12 @@ public: | |||
return; \ | |||
} while(0) | |||
//! skip a testcase if cambricon device not available | |||
#define REQUIRE_CAMBRICON_DEVICE(n) \ | |||
do { \ | |||
if (!check_cambricon_device_available(n)) \ | |||
return; \ | |||
} while (0) | |||
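A sketch of the intended usage, mirroring the existing REQUIRE_* macros (the test name and body are hypothetical):

    TEST(TestCambriconRuntime, Smoke) {
        REQUIRE_CAMBRICON_DEVICE(1);  // early-returns if no MLU device is present
        // ... build and run a graph on a cambricon comp node ...
    }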
#if MGB_HAVE_THREAD | |||
#define REQUIRE_THREAD() | |||