diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a9450588..1b6b588e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,6 +29,7 @@ jobs: uses: actions/checkout@v2 - name: Checkout submodules run: | + apt update&&apt install ninja-build ./third_party/prepare.sh ./third_party/install-mkl.sh - name: Build MegEngine @@ -57,6 +58,7 @@ jobs: uses: actions/checkout@v2 - name: Checkout submodules run: | + apt update&&apt install ninja-build ./third_party/prepare.sh ./third_party/install-mkl.sh - name: Build MegEngine diff --git a/CMakeLists.txt b/CMakeLists.txt index 2dbd6d28..e025b9b8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,11 +1,14 @@ cmake_minimum_required(VERSION 3.15.2) -message(STATUS "CMAKE_GENERATOR: ${CMAKE_GENERATOR}" ) -if (NOT ${CMAKE_GENERATOR} STREQUAL "Ninja") - message(WARNING "CMAKE_GENERATOR NOT EQUAL Ninja, which we do not recommend") +message(STATUS "CMAKE_GENERATOR: ${CMAKE_GENERATOR}") +if(NOT ${CMAKE_GENERATOR} STREQUAL "Ninja") + message(WARNING "CMAKE_GENERATOR NOT EQUAL Ninja, which we do not recommend") endif() -include (cmake/FetchMegBrainVersion.cmake) -project(MegEngine LANGUAGES C CXX VERSION ${MGB_VER_STRING}) +include(cmake/FetchMegBrainVersion.cmake) +project( + MegEngine + LANGUAGES C CXX + VERSION ${MGB_VER_STRING}) set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD_REQUIRED ON) @@ -15,43 +18,55 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) set(CMAKE_POLICY_DEFAULT_CMP0048 NEW) -if(NOT MSVC AND NOT APPLE AND NOT WIN32) - set(CMAKE_CXX_ARCHIVE_CREATE " Dqc ") - set(CMAKE_CXX_ARCHIVE_APPEND " Dq ") - set(CMAKE_CXX_ARCHIVE_FINISH " -D ") +if(NOT MSVC + AND NOT APPLE + AND NOT WIN32) + set(CMAKE_CXX_ARCHIVE_CREATE " Dqc ") + set(CMAKE_CXX_ARCHIVE_APPEND " Dq ") + set(CMAKE_CXX_ARCHIVE_FINISH " -D ") endif() include(GNUInstallDirs) include(CheckCXXCompilerFlag) include(CheckIPOSupported) -CHECK_CXX_COMPILER_FLAG(-Wclass-memaccess CXX_SUPPORT_WCLASS_MEMACCESS) - -set(MGE_ARCH AUTO CACHE STRING "Architecture on which MegEngine to be built.") -set_property(CACHE MGE_ARCH PROPERTY STRINGS AUTO - x86_64 i386 - armv7 aarch64 - naive fallback -) -set (MGE_EXPORT_TARGETS MegEngine-targets) +check_cxx_compiler_flag(-Wclass-memaccess CXX_SUPPORT_WCLASS_MEMACCESS) + +set(MGE_ARCH + AUTO + CACHE STRING "Architecture on which MegEngine to be built.") +set_property( + CACHE MGE_ARCH + PROPERTY STRINGS + AUTO + x86_64 + i386 + armv7 + aarch64 + naive + fallback) +set(MGE_EXPORT_TARGETS MegEngine-targets) if(NOT "$ENV{LD_LIBRARY_PATH}" STREQUAL "") - string(REPLACE ":" ";" ALTER_LD_LIBRARY_PATHS $ENV{LD_LIBRARY_PATH}) + string(REPLACE ":" ";" ALTER_LD_LIBRARY_PATHS $ENV{LD_LIBRARY_PATH}) else() - set(ALTER_LD_LIBRARY_PATHS "") + set(ALTER_LD_LIBRARY_PATHS "") endif() if(NOT "$ENV{LIBRARY_PATH}" STREQUAL "") - string(REPLACE ":" ";" ALTER_LIBRARY_PATHS $ENV{LIBRARY_PATH}) + string(REPLACE ":" ";" ALTER_LIBRARY_PATHS $ENV{LIBRARY_PATH}) else() - set(ALTER_LIBRARY_PATHS "") + set(ALTER_LIBRARY_PATHS "") endif() option(MGE_WITH_JIT "Build MegEngine with JIT." ON) option(MGE_WITH_JIT_MLIR "Build MegEngine with MLIR JIT." OFF) option(MGE_WITH_HALIDE "Build MegEngine with Halide JIT" OFF) option(MGE_WITH_MIDOUT_PROFILE "Build MegEngine with Midout profile." OFF) -option(MGE_WITH_MINIMUM_SIZE "Swith off MGE_ENABLE_RTTI、MGE_ENABLE_EXCEPTIONS、MGE_ENABLE_LOGGING and switch on MGE_INFERENCE_ONLY so that compile minimum load_and_run." 
OFF) +option( + MGE_WITH_MINIMUM_SIZE + "Swith off MGE_ENABLE_RTTI、MGE_ENABLE_EXCEPTIONS、MGE_ENABLE_LOGGING and switch on MGE_INFERENCE_ONLY so that compile minimum load_and_run." + OFF) option(MGE_ARMV8_2_FEATURE_FP16 "Enable armv8.2-a+fp16 support" OFF) option(MGE_DISABLE_FLOAT16 "Disable MegEngine float16 support." OFF) option(MGE_WITH_CUDA "Enable MegEngine CUDA support." ON) @@ -81,781 +96,906 @@ option(MGE_WITH_LARGE_ARCHIVE "Enable big archive link support" OFF) option(MGE_BUILD_WITH_ASAN "Enable build with ASAN, need compiler support" OFF) option(MGE_WITH_CUSTOM_OP "Build with Custom op" OFF) if(MSVC OR WIN32) - # FIXME: static link Windows vc runtime with some version from Visual Studio have - # some runtime issue at some call PATH, for example: _imperative_rt.pyd --> megengine_shared.dll - # for example c api flush can not find the fd args, I have no idea about this issue - # as a Workround, dynamic link vc runtime, but at some case, we will static link vcrt - # when MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP/MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2, so please - # use lite_static_all_in_one(lite/CMakeLists.txt) in Windows XP env as possible - # How to install VC runtime if you env do not install, refer to: - # https://docs.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-160 - option(MGE_STATIC_LINK_WITH_VC_RUNTIME "Enable mge static link with Windows vc runtime" OFF) - - option(MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP "Enable deploy inference on Windows xp" OFF) - # special MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2 for Windows XP sp2(32bit) - # internal behavior: - # 1: will force define MGB_HAVE_THREAD=0, which means only support single thread - # 2: some Feature will be disable, eg: MGB_ENABLE_JSON and var sanity check, do - # not too many care this!!, if you want to use this Feature to 'DEBUG', you can - # run same model at NON-XP-SP2 env, eg Win7 or XP-SP3(build without MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2) - # 3: we only support MegEngine(load_and_run) and MegEngineLite API work on XP SP2 - # some debug utils, eg, megbrain_test/megdnn_test not support run, most caused by gtest src code - # sdk caller: - # 1: as we remove mutex, when you use MSVC self API eg CreateThread to start several MegEngine instances - # in the same progress, please call MegEngine API(init/run) as serial as possible, also please - # do not use std::thread std::mutex/std::this_thread_id at SDK caller side!!! 
- # check dll/exe can deploy on Windows XP sp2 or not: - # please checkout scripts/misc/check_windows_xp_sp2_deploy.py - option(MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2 "Enable deploy inference on Windows xp sp2" OFF) - - # PE file linked by LLVM lld can not run at Windows XP env, so we force use link.exe - # which always locate in Microsoft Visual Studio/*/*/VC/Tools/MSVC/*/bin/*/*/link.exe - set(CMAKE_LINKER "link.exe") - if(MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP OR MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2) - set(MGE_STATIC_LINK_WITH_VC_RUNTIME ON) - message(STATUS "Force set MGE_STATIC_LINK_WITH_VC_RUNTIME ON when build for Windows XP") - - if(NOT ${MGE_ARCH} STREQUAL "i386") - message(FATAL_ERROR "only support 32bit when build for Windows xp") - endif() + # FIXME: static link Windows vc runtime with some version from Visual Studio have some + # runtime issue at some call PATH, for example: _imperative_rt.pyd --> + # megengine_shared.dll for example c api flush can not find the fd args, I have no + # idea about this issue as a Workround, dynamic link vc runtime, but at some case, we + # will static link vcrt when + # MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP/MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2, so please + # use lite_static_all_in_one(lite/CMakeLists.txt) in Windows XP env as possible How to + # install VC runtime if you env do not install, refer to: + # https://docs.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-160 + option(MGE_STATIC_LINK_WITH_VC_RUNTIME + "Enable mge static link with Windows vc runtime" OFF) + + option(MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP "Enable deploy inference on Windows xp" OFF) + # special MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2 for Windows XP sp2(32bit) internal + # behavior: 1: will force define MGB_HAVE_THREAD=0, which means only support single + # thread 2: some Feature will be disable, eg: MGB_ENABLE_JSON and var sanity check, do + # not too many care this!!, if you want to use this Feature to 'DEBUG', you can run + # same model at NON-XP-SP2 env, eg Win7 or XP-SP3(build without + # MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2) 3: we only support MegEngine(load_and_run) + # and MegEngineLite API work on XP SP2 some debug utils, eg, megbrain_test/megdnn_test + # not support run, most caused by gtest src code sdk caller: 1: as we remove mutex, + # when you use MSVC self API eg CreateThread to start several MegEngine instances in + # the same progress, please call MegEngine API(init/run) as serial as possible, also + # please do not use std::thread std::mutex/std::this_thread_id at SDK caller side!!! 
+ # check dll/exe can deploy on Windows XP sp2 or not: please checkout + # scripts/misc/check_windows_xp_sp2_deploy.py + option(MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2 + "Enable deploy inference on Windows xp sp2" OFF) + + # PE file linked by LLVM lld can not run at Windows XP env, so we force use link.exe + # which always locate in Microsoft Visual Studio/*/*/VC/Tools/MSVC/*/bin/*/*/link.exe + set(CMAKE_LINKER "link.exe") + if(MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP OR MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2) + set(MGE_STATIC_LINK_WITH_VC_RUNTIME ON) + message( + STATUS "Force set MGE_STATIC_LINK_WITH_VC_RUNTIME ON when build for Windows XP") + + if(NOT ${MGE_ARCH} STREQUAL "i386") + message(FATAL_ERROR "only support 32bit when build for Windows xp") + endif() - if(NOT MGE_INFERENCE_ONLY) - message(FATAL_ERROR "only support inference when build for Windows xp") - endif() + if(NOT MGE_INFERENCE_ONLY) + message(FATAL_ERROR "only support inference when build for Windows xp") + endif() - if(MGE_WITH_CUDA) - message(FATAL_ERROR "do not support CUDA when build for Windows xp") - endif() + if(MGE_WITH_CUDA) + message(FATAL_ERROR "do not support CUDA when build for Windows xp") + endif() - # Windows XP sp3 have thread issue, Workround for it - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_WIN32_WINNT=0x0501 /Zc:threadSafeInit-") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /D_WIN32_WINNT=0x0501 /Zc:threadSafeInit-") - # for Windows XP type - add_link_options("/SUBSYSTEM:CONSOLE,5.01") - # some old lib(for example mkl for xp) use legacy stdio, so we force link legacy_stdio_definitions - add_link_options("/DEFAULTLIB:legacy_stdio_definitions.lib") - - if(MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__DEPLOY_ON_XP_SP2__=1") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__DEPLOY_ON_XP_SP2__=1") - endif() - else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_WIN32_WINNT=0x0601") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /D_WIN32_WINNT=0x0601") + # Windows XP sp3 have thread issue, Workround for it + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_WIN32_WINNT=0x0501 /Zc:threadSafeInit-") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /D_WIN32_WINNT=0x0501 /Zc:threadSafeInit-") + # for Windows XP type + add_link_options("/SUBSYSTEM:CONSOLE,5.01") + # some old lib(for example mkl for xp) use legacy stdio, so we force link + # legacy_stdio_definitions + add_link_options("/DEFAULTLIB:legacy_stdio_definitions.lib") + + if(MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__DEPLOY_ON_XP_SP2__=1") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__DEPLOY_ON_XP_SP2__=1") endif() + else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_WIN32_WINNT=0x0601") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /D_WIN32_WINNT=0x0601") + endif() endif() if(MSVC OR WIN32) - message(STATUS "windows force cudnn static link") - set(MGE_WITH_CUDNN_SHARED OFF) + message(STATUS "windows force cudnn static link") + set(MGE_WITH_CUDNN_SHARED OFF) endif() if(MGE_WITH_NVRTC_STUB OR MGE_WITH_CUDA_STUB) - set(MGE_WITH_ANY_CUDA_STUB ON) + set(MGE_WITH_ANY_CUDA_STUB ON) else() - set(MGE_WITH_ANY_CUDA_STUB OFF) + set(MGE_WITH_ANY_CUDA_STUB OFF) endif() if(MGE_WITH_MIDOUT_PROFILE) - message(STATUS "build with MIDOUT PROFILE and force set MGE_WITH_MINIMUM_SIZE off and force rtti ON") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMIDOUT_PROFILING") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DMIDOUT_PROFILING") - set(MGE_WITH_MINIMUM_SIZE OFF) - set(MGE_ENABLE_RTTI ON) - if(WIN32) - message(FATAL_ERROR "do not support midout at WIN32") 
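# Annotative sketch (not part of the patch; generator and source path are assumptions for
# illustration): the Windows XP deployment options above only work together with a 32-bit,
# inference-only, CUDA-free configuration, otherwise the FATAL_ERROR checks in this block fire.
# An XP-SP2 configure would therefore look roughly like
#   cmake -G Ninja <src-dir> -DMGE_ARCH=i386 -DMGE_INFERENCE_ONLY=ON \
#         -DMGE_WITH_CUDA=OFF -DMGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2=ON
# MGE_STATIC_LINK_WITH_VC_RUNTIME is then forced ON by the block itself.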
- endif() + message( + STATUS + "build with MIDOUT PROFILE and force set MGE_WITH_MINIMUM_SIZE off and force rtti ON" + ) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMIDOUT_PROFILING") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DMIDOUT_PROFILING") + set(MGE_WITH_MINIMUM_SIZE OFF) + set(MGE_ENABLE_RTTI ON) + if(WIN32) + message(FATAL_ERROR "do not support midout at WIN32") + endif() endif() set(BIN_REDUCE ${PROJECT_SOURCE_DIR}/src/bin_reduce_cmake.h) if(MGE_WITH_MINIMUM_SIZE) - message(STATUS "build with MGE_WITH_MINIMUM_SIZE bin_reduce header is: ${BIN_REDUCE}") - set(MGE_ENABLE_RTTI OFF) - set(MGE_ENABLE_LOGGING OFF) - set(MGE_ENABLE_EXCEPTIONS OFF) - set(MGE_INFERENCE_ONLY ON) - # MGE_WITH_MINIMUM_SIZE will triger unused-parameter - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-parameter") + message(STATUS "build with MGE_WITH_MINIMUM_SIZE bin_reduce header is: ${BIN_REDUCE}") + set(MGE_ENABLE_RTTI OFF) + set(MGE_ENABLE_LOGGING OFF) + set(MGE_ENABLE_EXCEPTIONS OFF) + set(MGE_INFERENCE_ONLY ON) + # MGE_WITH_MINIMUM_SIZE will triger unused-parameter + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-parameter") endif() if(NOT MGE_WITH_MIDOUT_PROFILE AND NOT WIN32) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -include ${BIN_REDUCE}") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -include ${BIN_REDUCE}") -endif() - -if (NOT APPLE) - # check CXX_FUNCTION_DATA_GC_SECTIONS_SUPPORT on APPLE will leak cmake crash - CHECK_CXX_COMPILER_FLAG("-ffunction-sections -fdata-sections -Wl,--gc-sections" CXX_FUNCTION_DATA_GC_SECTIONS_SUPPORT) - if(CXX_FUNCTION_DATA_GC_SECTIONS_SUPPORT) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffunction-sections -fdata-sections") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffunction-sections -fdata-sections") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--gc-sections") - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--gc-sections") - endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -include ${BIN_REDUCE}") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -include ${BIN_REDUCE}") +endif() + +if(NOT APPLE) + # check CXX_FUNCTION_DATA_GC_SECTIONS_SUPPORT on APPLE will leak cmake crash + check_cxx_compiler_flag("-ffunction-sections -fdata-sections -Wl,--gc-sections" + CXX_FUNCTION_DATA_GC_SECTIONS_SUPPORT) + if(CXX_FUNCTION_DATA_GC_SECTIONS_SUPPORT) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffunction-sections -fdata-sections") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffunction-sections -fdata-sections") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--gc-sections") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--gc-sections") + endif() endif() check_ipo_supported(RESULT IS_LTO_SUPPORT OUTPUT output_info) # LLVM on Windows report support LTO, but do not support -flto=full at link stage if(IS_LTO_SUPPORT AND NOT WIN32) - message(STATUS "lto is supported in this compiler") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -flto=full") - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -flto=full") + message(STATUS "lto is supported in this compiler") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -flto=full") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -flto=full") else() - message(STATUS "lto is not supported in this compiler") + message(STATUS "lto is not supported in this compiler") endif() -if (APPLE) - set (BUILD_SHARED_LIBS OFF) - message(STATUS "build static for 
xcode framework require") +if(APPLE) + set(BUILD_SHARED_LIBS OFF) + message(STATUS "build static for xcode framework require") endif() -if (MGE_USE_SYSTEM_LIB) - set (MGE_CUDA_USE_STATIC OFF) +if(MGE_USE_SYSTEM_LIB) + set(MGE_CUDA_USE_STATIC OFF) endif() -if (MGB_WITH_FLATBUFFERS) - set(MGB_ENABLE_FBS_SERIALIZATION ON) +if(MGB_WITH_FLATBUFFERS) + set(MGB_ENABLE_FBS_SERIALIZATION ON) endif() if(CMAKE_TOOLCHAIN_FILE) - message(STATUS "We are cross compiling.") - message(STATUS "config FLATBUFFERS_FLATC_EXECUTABLE to: ${PROJECT_SOURCE_DIR}/build_dir/host_flatc/install/bin/flatc") - set(FLATBUFFERS_FLATC_EXECUTABLE "${PROJECT_SOURCE_DIR}/build_dir/host_flatc/install/bin/flatc") - if(ANDROID_TOOLCHAIN_ROOT) - if(NOT "${ANDROID_ARCH_NAME}" STREQUAL "") - set(ANDROID_ARCH ${ANDROID_ARCH_NAME}) - endif() - if(${ANDROID_ARCH} STREQUAL "arm") - set(MGE_ARCH "armv7") - elseif(${ANDROID_ARCH} STREQUAL "arm64") - set(MGE_ARCH "aarch64") - else() - message(FATAL_ERROR "DO NOT SUPPORT ANDROID ARCH NOW") - endif() - elseif(IOS_TOOLCHAIN_ROOT) - if(${IOS_ARCH} STREQUAL "armv7") - set(MGE_ARCH "armv7") - elseif(${IOS_ARCH} STREQUAL "arm64") - set(MGE_ARCH "aarch64") - elseif(${IOS_ARCH} STREQUAL "armv7k") - set(MGE_ARCH "armv7") - elseif(${IOS_ARCH} STREQUAL "arm64e") - set(MGE_ARCH "aarch64") - elseif(${IOS_ARCH} STREQUAL "armv7s") - set(MGE_ARCH "armv7") - else() - message(FATAL_ERROR "Unsupported IOS_ARCH.") - endif() - elseif(RISCV_TOOLCHAIN_ROOT) - set(MGE_ARCH "riscv64") - elseif(NOT "${ARM_CROSS_BUILD_ARCH}" STREQUAL "") - set(MGE_ARCH ${ARM_CROSS_BUILD_ARCH}) + message(STATUS "We are cross compiling.") + message( + STATUS + "config FLATBUFFERS_FLATC_EXECUTABLE to: ${PROJECT_SOURCE_DIR}/build_dir/host_flatc/install/bin/flatc" + ) + set(FLATBUFFERS_FLATC_EXECUTABLE + "${PROJECT_SOURCE_DIR}/build_dir/host_flatc/install/bin/flatc") + if(ANDROID_TOOLCHAIN_ROOT) + if(NOT "${ANDROID_ARCH_NAME}" STREQUAL "") + set(ANDROID_ARCH ${ANDROID_ARCH_NAME}) + endif() + if(${ANDROID_ARCH} STREQUAL "arm") + set(MGE_ARCH "armv7") + elseif(${ANDROID_ARCH} STREQUAL "arm64") + set(MGE_ARCH "aarch64") + else() + message(FATAL_ERROR "DO NOT SUPPORT ANDROID ARCH NOW") + endif() + elseif(IOS_TOOLCHAIN_ROOT) + if(${IOS_ARCH} STREQUAL "armv7") + set(MGE_ARCH "armv7") + elseif(${IOS_ARCH} STREQUAL "arm64") + set(MGE_ARCH "aarch64") + elseif(${IOS_ARCH} STREQUAL "armv7k") + set(MGE_ARCH "armv7") + elseif(${IOS_ARCH} STREQUAL "arm64e") + set(MGE_ARCH "aarch64") + elseif(${IOS_ARCH} STREQUAL "armv7s") + set(MGE_ARCH "armv7") else() - message(FATAL_ERROR "Unknown cross-compiling settings.") + message(FATAL_ERROR "Unsupported IOS_ARCH.") endif() - message(STATUS "CONFIG MGE_ARCH TO ${MGE_ARCH}") + elseif(RISCV_TOOLCHAIN_ROOT) + set(MGE_ARCH "riscv64") + elseif(NOT "${ARM_CROSS_BUILD_ARCH}" STREQUAL "") + set(MGE_ARCH ${ARM_CROSS_BUILD_ARCH}) + else() + message(FATAL_ERROR "Unknown cross-compiling settings.") + endif() + message(STATUS "CONFIG MGE_ARCH TO ${MGE_ARCH}") endif() if(${MGE_ARCH} STREQUAL "AUTO") - if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64" OR ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "AMD64") - set(MGE_ARCH "x86_64") - elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "i386" OR ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "i686") - set(MGE_ARCH "i386") - elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64" OR ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "arm64") - set(MGE_ARCH "aarch64") - elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^arm") - set(MGE_ARCH "armv7") - else() - message(FATAL_ERROR "Unknown machine architecture for MegEngine.") - endif() + 
if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64" OR ${CMAKE_SYSTEM_PROCESSOR} STREQUAL + "AMD64") + set(MGE_ARCH "x86_64") + elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "i386" OR ${CMAKE_SYSTEM_PROCESSOR} + STREQUAL "i686") + set(MGE_ARCH "i386") + elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64" OR ${CMAKE_SYSTEM_PROCESSOR} + STREQUAL "arm64") + set(MGE_ARCH "aarch64") + elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^arm") + set(MGE_ARCH "armv7") + else() + message(FATAL_ERROR "Unknown machine architecture for MegEngine.") + endif() endif() if(NOT CMAKE_CONFIGURATION_TYPES AND NOT CMAKE_BUILD_TYPE) - message(STATUS "Setting build type to 'RelWithDebInfo' as none was specified.") - set(CMAKE_BUILD_TYPE RelWithDebInfo) + message(STATUS "Setting build type to 'RelWithDebInfo' as none was specified.") + set(CMAKE_BUILD_TYPE RelWithDebInfo) endif() -if(${CMAKE_BUILD_TYPE} STREQUAL "Release" AND NOT MGE_WITH_TEST AND NOT ${MGE_ARCH} STREQUAL "x86_64" AND NOT MGE_WITH_MIDOUT_PROFILE) - set(MGE_ENABLE_RTTI OFF) - message(STATUS "disable MGE_ENABLE_RTTI when Release/NON-x86_64/NON-MGE_WITH_MIDOUT_PROFILE mode!!") +if(${CMAKE_BUILD_TYPE} STREQUAL "Release" + AND NOT MGE_WITH_TEST + AND NOT ${MGE_ARCH} STREQUAL "x86_64" + AND NOT MGE_WITH_MIDOUT_PROFILE) + set(MGE_ENABLE_RTTI OFF) + message( + STATUS + "disable MGE_ENABLE_RTTI when Release/NON-x86_64/NON-MGE_WITH_MIDOUT_PROFILE mode!!" + ) endif() if(MSVC OR WIN32) - # for cmake after 3.15.2 - cmake_policy(SET CMP0091 NEW) - set(CMAKE_OBJECT_PATH_MAX 300) - if(MGE_BUILD_WITH_ASAN) - set(MGE_STATIC_LINK_WITH_VC_RUNTIME ON) - message(STATUS "Force set MGE_STATIC_LINK_WITH_VC_RUNTIME ON when build for Windows MGE_BUILD_WITH_ASAN") - endif() - if(MGE_STATIC_LINK_WITH_VC_RUNTIME) - if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") - set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDebug") - else() - set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded") - endif() + # for cmake after 3.15.2 + cmake_policy(SET CMP0091 NEW) + set(CMAKE_OBJECT_PATH_MAX 300) + if(MGE_BUILD_WITH_ASAN) + set(MGE_STATIC_LINK_WITH_VC_RUNTIME ON) + message( + STATUS + "Force set MGE_STATIC_LINK_WITH_VC_RUNTIME ON when build for Windows MGE_BUILD_WITH_ASAN" + ) + endif() + if(MGE_STATIC_LINK_WITH_VC_RUNTIME) + if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") + set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDebug") else() - if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") - set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDebugDLL") - else() - set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDLL") - endif() + set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded") endif() - - add_compile_definitions(NOMINMAX=1 _USE_MATH_DEFINES=1 WIN32=1) - message(STATUS "into windows build CMAKE_C_COMPILER_ID: ${CMAKE_C_COMPILER_ID}") - if (NOT ${CMAKE_C_COMPILER_ID} STREQUAL "Clang" AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "Clang-cl") - message(FATAL_ERROR "only support clang-cl for windows build, pls check detail: scripts/cmake-build/BUILD_README.md") - endif() - # on windows need append VS_PATH/VC/Tools/Llvm/x64/lib/clang/${CMAKE_CXX_COMPILER_VERSION}/lib/windows - # and VS_PATH/VC/Tools/Llvm/lib/clang/${CMAKE_CXX_COMPILER_VERSION}/lib/windows to PATH env - if (MGE_BUILD_WITH_ASAN) - message(WARNING "please do (set)export ASAN_OPTIONS=windows_hook_rtl_allocators=true when run test after build finish, caused by we link asan dll!!") - if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") - message(WARNING "Windows AddressSanitizer doesn't support linking with debug runtime libraries yet, which means do not support CMAKE_BUILD_TYPE=Debug") - message(FATAL_ERROR "Please 
build with RelWithDebInfo or Release by : EXTRA_CMAKE_ARGS=\"-DMGE_BUILD_WITH_ASAN=ON -DCMAKE_BUILD_TYPE=RelWithDebInfo ...\"") - endif() - if("$ENV{VS_PATH}" STREQUAL "") - message(FATAL_ERROR "can not find VS_PATH, please export Visual Studio root dir to VS_PATH env") - endif() - if(${MGE_ARCH} STREQUAL "x86_64") - set(WINDOWS_ASAN_DLL_NAME "clang_rt.asan_dynamic-x86_64.lib") - set(WINDOWS_ASAN_RUNTIME_THUNK_NAME "clang_rt.asan_dynamic_runtime_thunk-x86_64") - set(WINDOWS_ASAN_PATH_SUFFIXES "VC/Tools/Llvm/x64/lib/clang/${CMAKE_CXX_COMPILER_VERSION}/lib/windows") - elseif(${MGE_ARCH} STREQUAL "i386") - set(WINDOWS_ASAN_DLL_NAME "clang_rt.asan_dynamic-i386.lib") - set(WINDOWS_ASAN_RUNTIME_THUNK_NAME "clang_rt.asan_dynamic_runtime_thunk-i386.lib") - set(WINDOWS_ASAN_PATH_SUFFIXES "VC/Tools/Llvm/lib/clang/${CMAKE_CXX_COMPILER_VERSION}/lib/windows") - else() - message(FATAL_ERROR "unsupport asan ARCH: ${MGE_ARCH} on Windows") - endif() - find_path(ASAN_DLL_PATH - NAMES ${WINDOWS_ASAN_DLL_NAME} - HINTS $ENV{VS_PATH} - PATH_SUFFIXES ${WINDOWS_ASAN_PATH_SUFFIXES} - DOC "Windows asan library path" ) - if(ASAN_DLL_PATH STREQUAL "ASAN_DLL_PATH-NOTFOUND") - message(FATAL_ERROR "can not find asan dll, please upgrade you LLVM") - endif() - - message(STATUS "Windows asan dll path: ${ASAN_DLL_PATH}") - link_directories(${ASAN_DLL_PATH}) - link_libraries(${WINDOWS_ASAN_DLL_NAME}) - link_libraries(${WINDOWS_ASAN_RUNTIME_THUNK_NAME}) - set(WIN_FLAGS "/Od -DNDEBUG -fsanitize=address") - # windows Llvm asan do not take effect when /O2 - # RELWITHDEBINFO default value is /O2, so override it - set(CMAKE_C_FLAGS_RELWITHDEBINFO "/Zi /Od /Ob1 /DNDEBUG") - set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "/Zi /Od /Ob1 /DNDEBUG") - set(CMAKE_C_FLAGS_RELEASE "/Zi /Od /Ob1 /DNDEBUG") - set(CMAKE_CXX_FLAGS_RELEASE "/Zi /Od /Ob1 /DNDEBUG") + else() + if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") + set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDebugDLL") else() - set(WIN_FLAGS "/O2") + set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDLL") endif() - # add flags for enable sse instruction optimize for X86, enable avx header to compile avx code - set(WIN_FLAGS "${WIN_FLAGS} -msse4.2 -D_AVX_ -D_AVX2_ -D__AVX__ -D__AVX2__ -D__FMA__") - # if u CPU is cascadelake series, u can enable for performance - # set(WIN_FLAGS "{WIN_FLAGS} -march=cascadelake -mtune=cascadelake") - # set(WIN_FLAGS "{WIN_FLAGS} -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi -mavx512vnni") - - # for windows build - set(WIN_FLAGS "${WIN_FLAGS} -Wno-error=implicit-int-conversion -Wno-error=double-promotion") - set(WIN_FLAGS "${WIN_FLAGS} -Wno-error=zero-as-null-pointer-constant -Wno-error=implicit-int-conversion") - set(WIN_FLAGS "${WIN_FLAGS} -Wno-error=float-conversion -Wno-error=shadow-field -Wno-error=covered-switch-default") - set(WIN_FLAGS "${WIN_FLAGS} -Wno-error=deprecated -Wno-error=documentation -Wno-error=unreachable-code-break") - set(WIN_FLAGS "${WIN_FLAGS} /DWIN32 -Wno-macro-redefined /wd4819") - set(WIN_FLAGS "${WIN_FLAGS} /D_CRT_SECURE_NO_DEPRECATE /D_CRT_SECURE_NO_WARNINGS /DNOGDI /D_USE_MATH_DEFINES /bigobj") - set(WIN_FLAGS "${WIN_FLAGS} /Zm500 /EHs /wd4351 /wd4291 /wd4250 /wd4996 /wd4819 -Wno-inconsistent-dllimport") - - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${WIN_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${WIN_FLAGS}") - - #FIXME: fix halide JIT on windows - message(STATUS "disable jit, halide and mlir on windows host build...") - set(MGE_WITH_HALIDE OFF) - set(MGE_WITH_JIT OFF) - set(MGE_WITH_JIT_MLIR OFF) - #FIXME: fix MegRay on windows - 
message(STATUS "Disable distributed build on windows host build...") - set(MGE_WITH_DISTRIBUTED OFF) -else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra") - - # NONE windows DEBUG general flags - if(MGE_BUILD_WITH_ASAN) - set(CMAKE_C_FLAGS_DEBUG "-O0 -g -fsanitize=address -fno-omit-frame-pointer") - set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -fsanitize=address -fno-omit-frame-pointer") - else() - set(CMAKE_C_FLAGS_DEBUG "-O0 -g") - set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g") + endif() + + add_compile_definitions(NOMINMAX=1 _USE_MATH_DEFINES=1 WIN32=1) + message(STATUS "into windows build CMAKE_C_COMPILER_ID: ${CMAKE_C_COMPILER_ID}") + if(NOT ${CMAKE_C_COMPILER_ID} STREQUAL "Clang" AND NOT ${CMAKE_C_COMPILER_ID} + STREQUAL "Clang-cl") + message( + FATAL_ERROR + "only support clang-cl for windows build, pls check detail: scripts/cmake-build/BUILD_README.md" + ) + endif() + # on windows need append + # VS_PATH/VC/Tools/Llvm/x64/lib/clang/${CMAKE_CXX_COMPILER_VERSION}/lib/windows and + # VS_PATH/VC/Tools/Llvm/lib/clang/${CMAKE_CXX_COMPILER_VERSION}/lib/windows to PATH + # env + if(MGE_BUILD_WITH_ASAN) + message( + WARNING + "please do (set)export ASAN_OPTIONS=windows_hook_rtl_allocators=true when run test after build finish, caused by we link asan dll!!" + ) + if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") + message( + WARNING + "Windows AddressSanitizer doesn't support linking with debug runtime libraries yet, which means do not support CMAKE_BUILD_TYPE=Debug" + ) + message( + FATAL_ERROR + "Please build with RelWithDebInfo or Release by : EXTRA_CMAKE_ARGS=\"-DMGE_BUILD_WITH_ASAN=ON -DCMAKE_BUILD_TYPE=RelWithDebInfo ...\"" + ) endif() - - # NONE windows opt general flags - if (MGE_BUILD_WITH_ASAN) - set(OPTIMIZE_LEVEL "-g -O0 -DNDEBUG -fsanitize=address -fno-omit-frame-pointer") - elseif(ANDROID) - set(OPTIMIZE_LEVEL "-g -Ofast -DNDEBUG") - else() - set(OPTIMIZE_LEVEL "-g -O3 -DNDEBUG") + if("$ENV{VS_PATH}" STREQUAL "") + message( + FATAL_ERROR + "can not find VS_PATH, please export Visual Studio root dir to VS_PATH env") endif() - #remove finite-math-only opt from Ofast, caused by clang have a different - #runtime finite math logic, this issue do not find at g++, but as a unity - #build flags, we force add -fno-finite-math-only when compiler support - CHECK_CXX_COMPILER_FLAG("-fno-finite-math-only" CXX_NO_FINITE_MATH_ONLY_SUPPORT) - if(CXX_NO_FINITE_MATH_ONLY_SUPPORT) - message(STATUS "force add -fno-finite-math-only for this compiler") - set(OPTIMIZE_LEVEL "${OPTIMIZE_LEVEL} -fno-finite-math-only") + if(${MGE_ARCH} STREQUAL "x86_64") + set(WINDOWS_ASAN_DLL_NAME "clang_rt.asan_dynamic-x86_64.lib") + set(WINDOWS_ASAN_RUNTIME_THUNK_NAME "clang_rt.asan_dynamic_runtime_thunk-x86_64") + set(WINDOWS_ASAN_PATH_SUFFIXES + "VC/Tools/Llvm/x64/lib/clang/${CMAKE_CXX_COMPILER_VERSION}/lib/windows") + elseif(${MGE_ARCH} STREQUAL "i386") + set(WINDOWS_ASAN_DLL_NAME "clang_rt.asan_dynamic-i386.lib") + set(WINDOWS_ASAN_RUNTIME_THUNK_NAME + "clang_rt.asan_dynamic_runtime_thunk-i386.lib") + set(WINDOWS_ASAN_PATH_SUFFIXES + "VC/Tools/Llvm/lib/clang/${CMAKE_CXX_COMPILER_VERSION}/lib/windows") + else() + message(FATAL_ERROR "unsupport asan ARCH: ${MGE_ARCH} on Windows") endif() - set(CMAKE_C_FLAGS_RELEASE "${OPTIMIZE_LEVEL}") - set(CMAKE_CXX_FLAGS_RELEASE "${OPTIMIZE_LEVEL}") - set(CMAKE_C_FLAGS_RELWITHDEBINFO "${OPTIMIZE_LEVEL}") - set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${OPTIMIZE_LEVEL}") - #some gnu(gcc) compiler use -static -libasan have runtime issue - #also, when target is big, clang ld will take a long long long - #time 
when use -static-libsan, so we use dynamic asan by default - #ANDROID asan.so depends on log, so broadcast log link_libraries - #for megengine depends target, for example flatc target - if (MGE_BUILD_WITH_ASAN AND ANDROID) - link_libraries(log) + find_path( + ASAN_DLL_PATH + NAMES ${WINDOWS_ASAN_DLL_NAME} + HINTS $ENV{VS_PATH} + PATH_SUFFIXES ${WINDOWS_ASAN_PATH_SUFFIXES} + DOC "Windows asan library path") + if(ASAN_DLL_PATH STREQUAL "ASAN_DLL_PATH-NOTFOUND") + message(FATAL_ERROR "can not find asan dll, please upgrade you LLVM") endif() + + message(STATUS "Windows asan dll path: ${ASAN_DLL_PATH}") + link_directories(${ASAN_DLL_PATH}) + link_libraries(${WINDOWS_ASAN_DLL_NAME}) + link_libraries(${WINDOWS_ASAN_RUNTIME_THUNK_NAME}) + set(WIN_FLAGS "/Od -DNDEBUG -fsanitize=address") + # windows Llvm asan do not take effect when /O2 RELWITHDEBINFO default value is /O2, + # so override it + set(CMAKE_C_FLAGS_RELWITHDEBINFO "/Zi /Od /Ob1 /DNDEBUG") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "/Zi /Od /Ob1 /DNDEBUG") + set(CMAKE_C_FLAGS_RELEASE "/Zi /Od /Ob1 /DNDEBUG") + set(CMAKE_CXX_FLAGS_RELEASE "/Zi /Od /Ob1 /DNDEBUG") + else() + set(WIN_FLAGS "/O2") + endif() + # add flags for enable sse instruction optimize for X86, enable avx header to compile + # avx code + set(WIN_FLAGS "${WIN_FLAGS} -msse4.2 -D_AVX_ -D_AVX2_ -D__AVX__ -D__AVX2__ -D__FMA__") + # if u CPU is cascadelake series, u can enable for performance set(WIN_FLAGS + # "{WIN_FLAGS} -march=cascadelake -mtune=cascadelake") set(WIN_FLAGS "{WIN_FLAGS} + # -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi -mavx512vnni") + + # for windows build + set(WIN_FLAGS + "${WIN_FLAGS} -Wno-error=implicit-int-conversion -Wno-error=double-promotion") + set(WIN_FLAGS + "${WIN_FLAGS} -Wno-error=zero-as-null-pointer-constant -Wno-error=implicit-int-conversion" + ) + set(WIN_FLAGS + "${WIN_FLAGS} -Wno-error=float-conversion -Wno-error=shadow-field -Wno-error=covered-switch-default" + ) + set(WIN_FLAGS + "${WIN_FLAGS} -Wno-error=deprecated -Wno-error=documentation -Wno-error=unreachable-code-break" + ) + set(WIN_FLAGS "${WIN_FLAGS} /DWIN32 -Wno-macro-redefined /wd4819") + set(WIN_FLAGS + "${WIN_FLAGS} /D_CRT_SECURE_NO_DEPRECATE /D_CRT_SECURE_NO_WARNINGS /DNOGDI /D_USE_MATH_DEFINES /bigobj" + ) + set(WIN_FLAGS + "${WIN_FLAGS} /Zm500 /EHs /wd4351 /wd4291 /wd4250 /wd4996 /wd4819 -Wno-inconsistent-dllimport" + ) + + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${WIN_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${WIN_FLAGS}") + + # FIXME: fix halide JIT on windows + message(STATUS "disable jit, halide and mlir on windows host build...") + set(MGE_WITH_HALIDE OFF) + set(MGE_WITH_JIT OFF) + set(MGE_WITH_JIT_MLIR OFF) + # FIXME: fix MegRay on windows + message(STATUS "Disable distributed build on windows host build...") + set(MGE_WITH_DISTRIBUTED OFF) +else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra") + + # NONE windows DEBUG general flags + if(MGE_BUILD_WITH_ASAN) + set(CMAKE_C_FLAGS_DEBUG "-O0 -g -fsanitize=address -fno-omit-frame-pointer") + set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -fsanitize=address -fno-omit-frame-pointer") + else() + set(CMAKE_C_FLAGS_DEBUG "-O0 -g") + set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g") + endif() + + # NONE windows opt general flags + if(MGE_BUILD_WITH_ASAN) + set(OPTIMIZE_LEVEL "-g -O0 -DNDEBUG -fsanitize=address -fno-omit-frame-pointer") + elseif(ANDROID) + set(OPTIMIZE_LEVEL "-g -Ofast -DNDEBUG") + else() + set(OPTIMIZE_LEVEL "-g -O3 -DNDEBUG") + endif() + # remove finite-math-only opt from Ofast, caused by clang have a different 
runtime + # finite math logic, this issue do not find at g++, but as a unity build flags, we + # force add -fno-finite-math-only when compiler support + check_cxx_compiler_flag("-fno-finite-math-only" CXX_NO_FINITE_MATH_ONLY_SUPPORT) + if(CXX_NO_FINITE_MATH_ONLY_SUPPORT) + message(STATUS "force add -fno-finite-math-only for this compiler") + set(OPTIMIZE_LEVEL "${OPTIMIZE_LEVEL} -fno-finite-math-only") + endif() + set(CMAKE_C_FLAGS_RELEASE "${OPTIMIZE_LEVEL}") + set(CMAKE_CXX_FLAGS_RELEASE "${OPTIMIZE_LEVEL}") + set(CMAKE_C_FLAGS_RELWITHDEBINFO "${OPTIMIZE_LEVEL}") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${OPTIMIZE_LEVEL}") + # some gnu(gcc) compiler use -static -libasan have runtime issue also, when target is + # big, clang ld will take a long long long time when use -static-libsan, so we use + # dynamic asan by default ANDROID asan.so depends on log, so broadcast log + # link_libraries for megengine depends target, for example flatc target + if(MGE_BUILD_WITH_ASAN AND ANDROID) + link_libraries(log) + endif() endif() if(MGE_WITH_CUDA) -include(cmake/cudnn.cmake) - if(MGE_CUDA_USE_STATIC AND ("${CUDNN_VERSION}" VERSION_GREATER "8.0.0" OR "${CUDNN_VERSION}" VERSION_EQUAL "8.0.0") AND (NOT MGE_WITH_CUDNN_SHARED)) - message(WARNING "Static link CUDNN8 will auto enable MGE_WITH_LARGE_ARCHIVE=ON") - set(MGE_WITH_LARGE_ARCHIVE ON) - endif() -endif() -CHECK_CXX_COMPILER_FLAG(-fuse-ld=gold CXX_SUPPORT_GOLD) + include(cmake/cudnn.cmake) + if(MGE_CUDA_USE_STATIC + AND ("${CUDNN_VERSION}" VERSION_GREATER "8.0.0" OR "${CUDNN_VERSION}" VERSION_EQUAL + "8.0.0") + AND (NOT MGE_WITH_CUDNN_SHARED)) + message(WARNING "Static link CUDNN8 will auto enable MGE_WITH_LARGE_ARCHIVE=ON") + set(MGE_WITH_LARGE_ARCHIVE ON) + endif() +endif() +check_cxx_compiler_flag(-fuse-ld=gold CXX_SUPPORT_GOLD) if(MGE_WITH_LARGE_ARCHIVE) - message(STATUS "Set -mcmodel=large and disable -fuse-ld=gold") - set(MGE_COMMON_LINKER_FLAGS "-mcmodel=large") -elseif(CXX_SUPPORT_GOLD AND NOT ANDROID AND NOT APPLE AND NOT MSVC AND NOT WIN32 AND NOT MGE_WITH_LARGE_ARCHIVE) - message(STATUS "Using GNU gold linker.") - set(MGE_COMMON_LINKER_FLAGS "-fuse-ld=gold") + message(STATUS "Set -mcmodel=large and disable -fuse-ld=gold") + set(MGE_COMMON_LINKER_FLAGS "-mcmodel=large") +elseif( + CXX_SUPPORT_GOLD + AND NOT ANDROID + AND NOT APPLE + AND NOT MSVC + AND NOT WIN32 + AND NOT MGE_WITH_LARGE_ARCHIVE) + message(STATUS "Using GNU gold linker.") + set(MGE_COMMON_LINKER_FLAGS "-fuse-ld=gold") endif() set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}") set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}") if(NOT MGE_WITH_JIT) - if(MGE_WITH_HALIDE) - message(WARNING "MGE_WITH_HALIDE is set to OFF with MGE_WITH_JIT disabled") - set(MGE_WITH_HALIDE OFF) - endif() - if(MGE_WITH_JIT_MLIR) - message(WARNING "MGE_WITH_JIT_MLIR is set to OFF with MGE_WITH_JIT disabled") - set(MGE_WITH_JIT_MLIR OFF) - endif() + if(MGE_WITH_HALIDE) + message(WARNING "MGE_WITH_HALIDE is set to OFF with MGE_WITH_JIT disabled") + set(MGE_WITH_HALIDE OFF) + endif() + if(MGE_WITH_JIT_MLIR) + message(WARNING "MGE_WITH_JIT_MLIR is set to OFF with MGE_WITH_JIT disabled") + set(MGE_WITH_JIT_MLIR OFF) + endif() endif() -# FIXME At present, there are some conflicts between the LLVM that halide -# depends on and the LLVM that MLIR depends on. Should be fixed in subsequent -# versions. 
+# FIXME At present, there are some conflicts between the LLVM that halide depends on and +# the LLVM that MLIR depends on. Should be fixed in subsequent versions. if(MGE_BUILD_IMPERATIVE_RT AND MGE_WITH_HALIDE) - message(FATAL_ERROR "cannot use HALIDE when building IMPERATIVE_RT") + message(FATAL_ERROR "cannot use HALIDE when building IMPERATIVE_RT") endif() if(MGE_WITH_JIT_MLIR AND MGE_WITH_HALIDE) - message(FATAL_ERROR "cannot use HALIDE with MGE_WITH_JIT_MLIR enabled") + message(FATAL_ERROR "cannot use HALIDE with MGE_WITH_JIT_MLIR enabled") endif() if(MGE_WITH_CUDA) - # FIXME: check_language(CUDA) failed when sbsa mode! - # detail: https://gitlab.kitware.com/cmake/cmake/-/issues/20676 - if(CMAKE_TOOLCHAIN_FILE) - set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) - message(WARNING "force set CMAKE_CUDA_HOST_COMPILER to CMAKE_CXX_COMPILER when nvcc sbsa mode!!") - endif() + # FIXME: check_language(CUDA) failed when sbsa mode! detail: + # https://gitlab.kitware.com/cmake/cmake/-/issues/20676 + if(CMAKE_TOOLCHAIN_FILE) + set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) + message( + WARNING + "force set CMAKE_CUDA_HOST_COMPILER to CMAKE_CXX_COMPILER when nvcc sbsa mode!!" + ) + endif() - include(CheckLanguage) - check_language(CUDA) - if(NOT CMAKE_CUDA_COMPILER AND NOT CMAKE_TOOLCHAIN_FILE) - message(FATAL_ERROR "CUDA compiler not found in PATH") - endif() + include(CheckLanguage) + check_language(CUDA) + if(NOT CMAKE_CUDA_COMPILER AND NOT CMAKE_TOOLCHAIN_FILE) + message(FATAL_ERROR "CUDA compiler not found in PATH") + endif() - # remove this after CMAKE fix nvcc sbsa - if(NOT CMAKE_CUDA_COMPILER AND CMAKE_TOOLCHAIN_FILE) - set(CMAKE_CUDA_COMPILER "nvcc") - message(WARNING "force set CMAKE_CUDA_COMPILER to nvcc when nvcc sbsa mode!!") - endif() + # remove this after CMAKE fix nvcc sbsa + if(NOT CMAKE_CUDA_COMPILER AND CMAKE_TOOLCHAIN_FILE) + set(CMAKE_CUDA_COMPILER "nvcc") + message(WARNING "force set CMAKE_CUDA_COMPILER to nvcc when nvcc sbsa mode!!") + endif() - enable_language(CUDA) - set(CMAKE_CUDA_STANDARD 14) - set(CMAKE_CUDA_STANDARD_REQUIRED ON) + enable_language(CUDA) + set(CMAKE_CUDA_STANDARD 14) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) endif() if(NOT MGE_WITH_CUDA) - if(NOT MGE_ARCH STREQUAL "x86_64" AND NOT MGE_ARCH STREQUAL "i386") - message(STATUS "Disable JIT support, as the MGE_ARCH is not X86 and CUDA is not enabled.") - set(MGE_WITH_JIT OFF) - set(MGE_WITH_JIT_MLIR OFF) - endif() - set(MGE_WITH_HALIDE OFF) - message(STATUS "Disable TensorRT support, as CUDA is not enabled.") - set(MGE_WITH_TRT OFF) + if(NOT MGE_ARCH STREQUAL "x86_64" AND NOT MGE_ARCH STREQUAL "i386") + message( + STATUS "Disable JIT support, as the MGE_ARCH is not X86 and CUDA is not enabled.") + set(MGE_WITH_JIT OFF) + set(MGE_WITH_JIT_MLIR OFF) + endif() + set(MGE_WITH_HALIDE OFF) + message(STATUS "Disable TensorRT support, as CUDA is not enabled.") + set(MGE_WITH_TRT OFF) endif() find_package(PythonInterp 3 REQUIRED) -# NOTICE: just use for target, which do not depend on python api -# PURPOSE: reuse target obj when switch python3 version -# will fallback to PYTHON_EXECUTABLE if can not find in PATH env +# NOTICE: just use for target, which do not depend on python api PURPOSE: reuse target +# obj when switch python3 version will fallback to PYTHON_EXECUTABLE if can not find in +# PATH env set(PYTHON3_IN_ENV "python3") find_program(PYTHON3_EXECUTABLE_WITHOUT_VERSION ${PYTHON3_IN_ENV}) -if (PYTHON3_EXECUTABLE_WITHOUT_VERSION) - message(STATUS "use ${PYTHON3_IN_ENV} as 
PYTHON3_EXECUTABLE_WITHOUT_VERSION") - set(PYTHON3_EXECUTABLE_WITHOUT_VERSION ${PYTHON3_IN_ENV}) +if(PYTHON3_EXECUTABLE_WITHOUT_VERSION) + message(STATUS "use ${PYTHON3_IN_ENV} as PYTHON3_EXECUTABLE_WITHOUT_VERSION") + set(PYTHON3_EXECUTABLE_WITHOUT_VERSION ${PYTHON3_IN_ENV}) else() - message(STATUS "fallback ${PYTHON_EXECUTABLE} as PYTHON3_EXECUTABLE_WITHOUT_VERSION,\ - target which depend on PYTHON3_EXECUTABLE_WITHOUT_VERSION will be rebuild when switch python3") - set(PYTHON3_EXECUTABLE_WITHOUT_VERSION ${PYTHON_EXECUTABLE}) + message( + STATUS + "fallback ${PYTHON_EXECUTABLE} as PYTHON3_EXECUTABLE_WITHOUT_VERSION,\ + target which depend on PYTHON3_EXECUTABLE_WITHOUT_VERSION will be rebuild when switch python3" + ) + set(PYTHON3_EXECUTABLE_WITHOUT_VERSION ${PYTHON_EXECUTABLE}) endif() set(THREADS_PREFER_PTHREAD_FLAG ON) find_package(Threads) if(NOT "${CMAKE_THREAD_LIBS_INIT}" STREQUAL "") - if(${CMAKE_THREAD_LIBS_INIT} STREQUAL "-pthread" AND MGE_WITH_CUDA) - set_property(TARGET Threads::Threads - PROPERTY INTERFACE_COMPILE_OPTIONS "$<$:-Xcompiler=-pthread>" - "$<$>:-pthread>") - endif() -endif() - -set(MGE_BLAS MKL CACHE STRING "BLAS implementaion used by MegEngine.") + if(${CMAKE_THREAD_LIBS_INIT} STREQUAL "-pthread" AND MGE_WITH_CUDA) + set_property( + TARGET Threads::Threads + PROPERTY INTERFACE_COMPILE_OPTIONS + "$<$:-Xcompiler=-pthread>" + "$<$>:-pthread>") + endif() +endif() + +set(MGE_BLAS + MKL + CACHE STRING "BLAS implementaion used by MegEngine.") set_property(CACHE MGE_BLAS PROPERTY STRINGS MKL OpenBLAS) -set(MGE_CUDA_GENCODE "" CACHE STRING "Overwrite -gencode specifications for CUDA") +set(MGE_CUDA_GENCODE + "" + CACHE STRING "Overwrite -gencode specifications for CUDA") if(NOT CMAKE_CUDA_HOST_COMPILER) - set(CMAKE_CUDA_HOST_COMPILER $(CMAKE_CXX_COMPILER)) + set(CMAKE_CUDA_HOST_COMPILER $(CMAKE_CXX_COMPILER)) endif() if(NOT MGE_ENABLE_RTTI) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti") endif() if(NOT MGE_ENABLE_EXCEPTIONS) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions") endif() if(MGE_WITH_TEST) - include(cmake/gtest.cmake) + include(cmake/gtest.cmake) endif() include(cmake/gflags.cmake) if(MGE_BUILD_IMPERATIVE_RT) - set(CMAKE_CXX_STANDARD 17) + set(CMAKE_CXX_STANDARD 17) endif() if(NOT ${MGE_WITH_CUDA} AND NOT ${MGE_WITH_ROCM}) - message(STATUS "Disable distributed support, as both CUDA and ROCm are disabled.") - set(MGE_WITH_DISTRIBUTED OFF) + message(STATUS "Disable distributed support, as both CUDA and ROCm are disabled.") + set(MGE_WITH_DISTRIBUTED OFF) endif() if(MGE_INFERENCE_ONLY) - message(STATUS "Disable distributed support for inference only build.") - set(MGE_WITH_DISTRIBUTED OFF) - message(STATUS "Disable imperative_rt python module for inference only build.") - set(MGE_BUILD_IMPERATIVE_RT OFF) + message(STATUS "Disable distributed support for inference only build.") + set(MGE_WITH_DISTRIBUTED OFF) + message(STATUS "Disable imperative_rt python module for inference only build.") + set(MGE_BUILD_IMPERATIVE_RT OFF) endif() if(MGE_WITH_JIT_MLIR OR MGE_BUILD_IMPERATIVE_RT) - include(cmake/llvm-project.cmake) + include(cmake/llvm-project.cmake) endif() if(MGE_WITH_DISTRIBUTED) - include(cmake/protobuf.cmake) - include(cmake/zmq.cmake) + include(cmake/protobuf.cmake) + include(cmake/zmq.cmake) endif() if(MGB_WITH_FLATBUFFERS) - include(cmake/flatbuffers.cmake) + include(cmake/flatbuffers.cmake) endif() if(MGE_WITH_CUDA) - 
include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) - foreach(path ${CMAKE_CUDA_HOST_IMPLICIT_LINK_DIRECTORIES}) - get_filename_component(_NAME ${path} NAME) - if(NOT ${_NAME} STREQUAL "stubs") - list(APPEND CUDA_LINK_DIRECTORIES ${path}) - endif() - endforeach() - link_directories(${CUDA_LINK_DIRECTORIES}) - - set(CMAKE_CUDA_FLAGS_DEBUG "-O0 -g") - set(CMAKE_CUDA_FLAGS_RELEASE "-O3") - set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O3 -g") - set(CMAKE_CUDA_FLAGS_MINSIZEREL "-Os") - if(MSVC OR WIN32) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xfatbin -compress-all") - set(CCBIN_FLAG "${CCBIN_FLAG} /wd4819 /wd4334 /wd4267 /wd4002 /wd4244 /wd4068 /std:c++14 /bigobj") - if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") - set(CCBIN_FLAG "${CCBIN_FLAG} -D_ITERATOR_DEBUG_LEVEL=2 -MTd") - endif() - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-options \" ${CCBIN_FLAG} \" ") - else() - set(CMAKE_CUDA_FLAGS "-Xcompiler -Wall,-Wextra -Xfatbin -compress-all") - endif() - - if(NOT MGE_ENABLE_RTTI) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fno-rtti") + include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + foreach(path ${CMAKE_CUDA_HOST_IMPLICIT_LINK_DIRECTORIES}) + get_filename_component(_NAME ${path} NAME) + if(NOT ${_NAME} STREQUAL "stubs") + list(APPEND CUDA_LINK_DIRECTORIES ${path}) endif() - if(NOT MGE_ENABLE_EXCEPTIONS) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fno-exceptions") + endforeach() + link_directories(${CUDA_LINK_DIRECTORIES}) + + set(CMAKE_CUDA_FLAGS_DEBUG "-O0 -g") + set(CMAKE_CUDA_FLAGS_RELEASE "-O3") + set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O3 -g") + set(CMAKE_CUDA_FLAGS_MINSIZEREL "-Os") + if(MSVC OR WIN32) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xfatbin -compress-all") + set(CCBIN_FLAG + "${CCBIN_FLAG} /wd4819 /wd4334 /wd4267 /wd4002 /wd4244 /wd4068 /std:c++14 /bigobj" + ) + if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") + set(CCBIN_FLAG "${CCBIN_FLAG} -D_ITERATOR_DEBUG_LEVEL=2 -MTd") endif() - if(NOT MGE_CUDA_GENCODE) - if(${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386" OR ${MGE_ARCH} STREQUAL "aarch64") - set(MEGDNN_THREADS_512 0) - if(MGE_WITH_CUDA AND MGE_CUDA_USE_STATIC AND ("${CUDNN_VERSION}" VERSION_GREATER "8.0.0" OR "${CUDNN_VERSION}" VERSION_EQUAL "8.0.0") AND (NOT MGE_WITH_CUDNN_SHARED)) - message(WARNING "Static link CUDNN8 with many sm is unworkable, we only enable sm61 sm70 sm75 by default, and enable MGE_WITH_LARGE_ARCHIVE=ON") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75") - elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "11.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "11.1.0") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_80,code=sm_80") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_86,code=sm_86") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_86,code=compute_86") - elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "11.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "11.0.0") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") - set(MGE_CUDA_GENCODE 
"${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_80,code=sm_80") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_80,code=compute_80") - elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_52,code=sm_52") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_60,code=sm_60") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=compute_75") - elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "9.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "9.0.0") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_52,code=sm_52") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_60,code=sm_60") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=compute_70") - else() - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_35,code=sm_35") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_52,code=sm_52") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_60,code=sm_60") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=compute_61") - endif() - else() - message(FATAL_ERROR "Unsupported CUDA host arch.") - endif() + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-options \" ${CCBIN_FLAG} \" ") + else() + set(CMAKE_CUDA_FLAGS "-Xcompiler -Wall,-Wextra -Xfatbin -compress-all") + endif() + + if(NOT MGE_ENABLE_RTTI) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fno-rtti") + endif() + if(NOT MGE_ENABLE_EXCEPTIONS) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fno-exceptions") + endif() + if(NOT MGE_CUDA_GENCODE) + if(${MGE_ARCH} STREQUAL "x86_64" + OR ${MGE_ARCH} STREQUAL "i386" + OR ${MGE_ARCH} STREQUAL "aarch64") + set(MEGDNN_THREADS_512 0) + if(MGE_WITH_CUDA + AND MGE_CUDA_USE_STATIC + AND ("${CUDNN_VERSION}" VERSION_GREATER "8.0.0" OR "${CUDNN_VERSION}" + VERSION_EQUAL "8.0.0") + AND (NOT MGE_WITH_CUDNN_SHARED)) + message( + WARNING + "Static link CUDNN8 with many sm is unworkable, we only enable sm61 sm70 sm75 by default, and enable MGE_WITH_LARGE_ARCHIVE=ON" + ) + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75") + elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "11.1.0" + OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "11.1.0") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode 
arch=compute_80,code=sm_80") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_86,code=sm_86") + set(MGE_CUDA_GENCODE + "${MGE_CUDA_GENCODE} -gencode arch=compute_86,code=compute_86") + elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "11.0.0" + OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "11.0.0") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_80,code=sm_80") + set(MGE_CUDA_GENCODE + "${MGE_CUDA_GENCODE} -gencode arch=compute_80,code=compute_80") + elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" + OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_52,code=sm_52") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_60,code=sm_60") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75") + set(MGE_CUDA_GENCODE + "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=compute_75") + elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "9.0.0" + OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "9.0.0") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_52,code=sm_52") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_60,code=sm_60") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70") + set(MGE_CUDA_GENCODE + "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=compute_70") + else() + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_35,code=sm_35") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_52,code=sm_52") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_60,code=sm_60") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") + set(MGE_CUDA_GENCODE + "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=compute_61") + endif() else() - set(MEGDNN_THREADS_512 1) + message(FATAL_ERROR "Unsupported CUDA host arch.") endif() - - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${MGE_CUDA_GENCODE}") + else() + set(MEGDNN_THREADS_512 1) + endif() + + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${MGE_CUDA_GENCODE}") + if(MGE_WITH_TRT) + include(cmake/tensorrt.cmake) + endif() + if(MGE_CUDA_USE_STATIC) if(MGE_WITH_TRT) - include(cmake/tensorrt.cmake) + if(MSVC OR WIN32) + message(STATUS "windows TRT_LIBRARY: ${TRT_LIBRARY}") + list(APPEND MGE_CUDA_LIBS ${TRT_LIBRARY} ${TRT_PLUGIN_LIBRARY}) + else() + list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libnvinfer libnvinfer_plugin + -Wl,--no-whole-archive) + endif() + if(TensorRT_VERSION_MAJOR GREATER_EQUAL 7) + message(STATUS "handle trt myelin lib after trt7") + list(APPEND MGE_CUDA_LIBS libmyelin_compiler libmyelin_executor + libmyelin_pattern_runtime libmyelin_pattern_library) + endif() endif() - if(MGE_CUDA_USE_STATIC) - if(MGE_WITH_TRT) - if(MSVC OR WIN32) - message(STATUS "windows TRT_LIBRARY: ${TRT_LIBRARY}") - list(APPEND MGE_CUDA_LIBS ${TRT_LIBRARY} ${TRT_PLUGIN_LIBRARY}) - else() - list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libnvinfer libnvinfer_plugin -Wl,--no-whole-archive) 
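# Annotative sketch (not part of the patch): MGE_CUDA_GENCODE, declared above as a cache STRING,
# lets the caller bypass the CUDA-version-based -gencode defaults chosen in the
# if(NOT MGE_CUDA_GENCODE) branch. For example, a build that only targets sm_75 hardware
# (an assumption for illustration) could configure with
#   -DMGE_CUDA_GENCODE="-gencode arch=compute_75,code=sm_75"
# and the value is appended to CMAKE_CUDA_FLAGS as-is.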
- endif() - if(TensorRT_VERSION_MAJOR GREATER_EQUAL 7) - message(STATUS "handle trt myelin lib after trt7") - list(APPEND MGE_CUDA_LIBS libmyelin_compiler libmyelin_executor libmyelin_pattern_runtime libmyelin_pattern_library) - endif() - endif() - - if("${CUDNN_VERSION}" STREQUAL "7.5.0") - if(MSVC OR WIN32) - message(STATUS "windows CUDNN_LIBRARY: ${CUDNN_LIBRARY}") - list(APPEND MGE_CUDA_LIBS ${CUDNN_LIBRARY}) - else() - message(STATUS "cudnn 7.5.0 has bug in cudnnConvolutionBiasActivationForward, need --whole-archive to workaround, ref https://docs.nvidia.com/deeplearning/cudnn/release-notes/rel_7xx.html") - list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libcudnn -Wl,--no-whole-archive) - endif() - else() - if(MSVC OR WIN32) - message(STATUS "windows CUDNN_LIBRARY: ${CUDNN_LIBRARY}") - list(APPEND MGE_CUDA_LIBS ${CUDNN_LIBRARY}) - else() - list(APPEND MGE_CUDA_LIBS libcudnn) - endif() - endif() - if(MSVC OR WIN32) - list(APPEND MGE_CUDA_LIBS cusolver.lib curand.lib cudart_static.lib cusparse.lib) - else() - list(APPEND MGE_CUDA_LIBS cusolver_static curand_static culibos cudart_static cusparse_static) - endif() - if(MSVC OR WIN32) - list(APPEND MGE_CUDA_LIBS cublas.lib) - else() - if(MGE_WITH_CUBLAS_SHARED) - list(APPEND MGE_CUDA_LIBS cublas) - else() - list(APPEND MGE_CUDA_LIBS cublas_static) - endif() - endif() - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0") - if(MSVC OR WIN32) - list(APPEND MGE_CUDA_LIBS cublasLt.lib) - else() - if(MGE_WITH_CUBLAS_SHARED) - list(APPEND MGE_CUDA_LIBS cublasLt) - else() - list(APPEND MGE_CUDA_LIBS cublasLt_static culibos) - endif() - endif() - endif() - if((${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") AND NOT MSVC AND NOT WIN32) - # mark all symbols from liblapack_static.a as weak to avoid - # duplicated definition with mkl - find_library( - LAPACK_STATIC_PATH lapack_static - HINTS ${CMAKE_CUDA_HOST_IMPLICIT_LINK_DIRECTORIES}) - if(NOT LAPACK_STATIC_PATH) - message(FATAL_ERROR "liblapack_static.a not found") - endif() - set(LAPACK_STATIC_COPY_PATH ${CMAKE_CURRENT_BINARY_DIR}/liblapack_static_copy.a) - - # add a target that run objcopy - add_custom_command( - OUTPUT ${LAPACK_STATIC_COPY_PATH} - COMMAND ${CMAKE_OBJCOPY} -w -W* ${LAPACK_STATIC_PATH} ${LAPACK_STATIC_COPY_PATH} - VERBATIM) - add_custom_target(lapack_static_weak_target DEPENDS ${LAPACK_STATIC_COPY_PATH}) - - # create a library named "lapack_static_weak" - add_library(lapack_static_weak STATIC IMPORTED GLOBAL) - add_dependencies(lapack_static_weak lapack_static_weak_target) - set_target_properties( - lapack_static_weak PROPERTIES - IMPORTED_LOCATION ${LAPACK_STATIC_COPY_PATH}) - list(APPEND MGE_CUDA_LIBS lapack_static_weak ${LAPACK_STATIC_COPY_PATH}) - endif() + + if("${CUDNN_VERSION}" STREQUAL "7.5.0") + if(MSVC OR WIN32) + message(STATUS "windows CUDNN_LIBRARY: ${CUDNN_LIBRARY}") + list(APPEND MGE_CUDA_LIBS ${CUDNN_LIBRARY}) + else() + message( + STATUS + "cudnn 7.5.0 has bug in cudnnConvolutionBiasActivationForward, need --whole-archive to workaround, ref https://docs.nvidia.com/deeplearning/cudnn/release-notes/rel_7xx.html" + ) + list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libcudnn -Wl,--no-whole-archive) + endif() else() - if(MGE_WITH_TRT) - list(APPEND MGE_CUDA_LIBS libnvinfer libnvinfer_plugin) - if(TensorRT_VERSION_MAJOR GREATER_EQUAL 7) - message(STATUS "handle trt myelin lib after trt7") - list(APPEND MGE_CUDA_LIBS libmyelin) - endif() - 
endif() + if(MSVC OR WIN32) + message(STATUS "windows CUDNN_LIBRARY: ${CUDNN_LIBRARY}") + list(APPEND MGE_CUDA_LIBS ${CUDNN_LIBRARY}) + else() list(APPEND MGE_CUDA_LIBS libcudnn) - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0") - list(APPEND MGE_CUDA_LIBS cublasLt cusolver cublas curand) - endif() - list(APPEND MGE_CUDA_LIBS cudart) + endif() endif() - - if(NOT MGE_WITH_CUDA_STUB) - if(MSVC OR WIN32) - list(APPEND MGE_CUDA_LIBS cuda.lib) - else() - list(APPEND MGE_CUDA_LIBS cuda) - endif() + if(MSVC OR WIN32) + list(APPEND MGE_CUDA_LIBS cusolver.lib curand.lib cudart_static.lib cusparse.lib) + else() + list( + APPEND + MGE_CUDA_LIBS + cusolver_static + curand_static + culibos + cudart_static + cusparse_static) endif() - - if(NOT MGE_WITH_NVRTC_STUB) - if(MSVC OR WIN32) - list(APPEND MGE_CUDA_LIBS nvrtc.lib) + if(MSVC OR WIN32) + list(APPEND MGE_CUDA_LIBS cublas.lib) + else() + if(MGE_WITH_CUBLAS_SHARED) + list(APPEND MGE_CUDA_LIBS cublas) + else() + list(APPEND MGE_CUDA_LIBS cublas_static) + endif() + endif() + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" + OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0") + if(MSVC OR WIN32) + list(APPEND MGE_CUDA_LIBS cublasLt.lib) + else() + if(MGE_WITH_CUBLAS_SHARED) + list(APPEND MGE_CUDA_LIBS cublasLt) else() - list(APPEND MGE_CUDA_LIBS nvrtc) + list(APPEND MGE_CUDA_LIBS cublasLt_static culibos) endif() + endif() endif() - - if(MGE_WITH_ANY_CUDA_STUB) - add_subdirectory(dnn/cuda-stub) - list(APPEND MGE_CUDA_LIBS cuda-stub) + if((${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" + OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") + AND NOT MSVC + AND NOT WIN32) + # mark all symbols from liblapack_static.a as weak to avoid duplicated definition + # with mkl + find_library(LAPACK_STATIC_PATH lapack_static + HINTS ${CMAKE_CUDA_HOST_IMPLICIT_LINK_DIRECTORIES}) + if(NOT LAPACK_STATIC_PATH) + message(FATAL_ERROR "liblapack_static.a not found") + endif() + set(LAPACK_STATIC_COPY_PATH ${CMAKE_CURRENT_BINARY_DIR}/liblapack_static_copy.a) + + # add a target that run objcopy + add_custom_command( + OUTPUT ${LAPACK_STATIC_COPY_PATH} + COMMAND ${CMAKE_OBJCOPY} -w -W* ${LAPACK_STATIC_PATH} ${LAPACK_STATIC_COPY_PATH} + VERBATIM) + add_custom_target(lapack_static_weak_target DEPENDS ${LAPACK_STATIC_COPY_PATH}) + + # create a library named "lapack_static_weak" + add_library(lapack_static_weak STATIC IMPORTED GLOBAL) + add_dependencies(lapack_static_weak lapack_static_weak_target) + set_target_properties(lapack_static_weak PROPERTIES IMPORTED_LOCATION + ${LAPACK_STATIC_COPY_PATH}) + list(APPEND MGE_CUDA_LIBS lapack_static_weak ${LAPACK_STATIC_COPY_PATH}) endif() + else() + if(MGE_WITH_TRT) + list(APPEND MGE_CUDA_LIBS libnvinfer libnvinfer_plugin) + if(TensorRT_VERSION_MAJOR GREATER_EQUAL 7) + message(STATUS "handle trt myelin lib after trt7") + list(APPEND MGE_CUDA_LIBS libmyelin) + endif() + endif() + list(APPEND MGE_CUDA_LIBS libcudnn) + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" + OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0") + list(APPEND MGE_CUDA_LIBS cublasLt cusolver cublas curand) + endif() + list(APPEND MGE_CUDA_LIBS cudart) + endif() + if(NOT MGE_WITH_CUDA_STUB) if(MSVC OR WIN32) - list(APPEND MGE_CUDA_LIBS nvrtc.lib) + list(APPEND MGE_CUDA_LIBS cuda.lib) else() - list(APPEND MGE_CUDA_LIBS nvToolsExt) - endif() - - set(MGE_CUDA_LIBS "${MGE_CUDA_LIBS} -lrt") - if(UNIX) - set(MGE_CUDA_LIBS "${MGE_CUDA_LIBS} -ldl") + list(APPEND 
MGE_CUDA_LIBS cuda) endif() + endif() -endif() - -###########please add_subdirectory from here############### -if((${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386" OR ${MGE_ARCH} STREQUAL "armv7" OR ${MGE_ARCH} STREQUAL "aarch64") AND NOT APPLE AND NOT MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2) - option(MGE_ENABLE_CPUINFO "Build cpuinfo library for check runtime." ON) - if(MGE_ENABLE_CPUINFO) - message(STATUS "Enable cpuinfo runtime check and little kernel optimize.") - add_definitions(-DMGB_ENABLE_CPUINFO_CHECK) - include(cmake/cpuinfo.cmake) + if(NOT MGE_WITH_NVRTC_STUB) + if(MSVC OR WIN32) + list(APPEND MGE_CUDA_LIBS nvrtc.lib) + else() + list(APPEND MGE_CUDA_LIBS nvrtc) endif() + endif() + + if(MGE_WITH_ANY_CUDA_STUB) + add_subdirectory(dnn/cuda-stub) + list(APPEND MGE_CUDA_LIBS cuda-stub) + endif() + + if(MSVC OR WIN32) + list(APPEND MGE_CUDA_LIBS nvrtc.lib) + else() + list(APPEND MGE_CUDA_LIBS nvToolsExt) + endif() + + set(MGE_CUDA_LIBS "${MGE_CUDA_LIBS} -lrt") + if(UNIX) + set(MGE_CUDA_LIBS "${MGE_CUDA_LIBS} -ldl") + endif() + +endif() + +# ##########please add_subdirectory from here############### +if((${MGE_ARCH} STREQUAL "x86_64" + OR ${MGE_ARCH} STREQUAL "i386" + OR ${MGE_ARCH} STREQUAL "armv7" + OR ${MGE_ARCH} STREQUAL "aarch64" + ) + AND NOT APPLE + AND NOT MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2) + option(MGE_ENABLE_CPUINFO "Build cpuinfo library for check runtime." ON) + if(MGE_ENABLE_CPUINFO) + message(STATUS "Enable cpuinfo runtime check and little kernel optimize.") + add_definitions(-DMGB_ENABLE_CPUINFO_CHECK) + include(cmake/cpuinfo.cmake) + endif() endif() if(MGE_WITH_CAMBRICON) - include_directories("$ENV{NEUWARE_HOME}/include") - link_directories("$ENV{NEUWARE_HOME}/lib64") - list(APPEND MGE_CAMBRICON_LIBS libcnrt libcndev) - if (CNRT_VERSION_STRING VERSION_GREATER "5.0.0") - include(cmake/cnnl.cmake) - include(cmake/cnlight.cmake) - include(cmake/magicmind.cmake) - list(APPEND MGE_CAMBRICON_LIBS libcnnl libcnnl_extra libcnlight libmagicmind libmagicmind_runtime) - else() - include(cmake/cnml.cmake) - list(APPEND MGE_CAMBRICON_LIBS libcnml) - endif() - set(MGE_CAMBRICON_LIBS "${MGE_CAMBRICON_LIBS}") + include_directories("$ENV{NEUWARE_HOME}/include") + link_directories("$ENV{NEUWARE_HOME}/lib64") + list(APPEND MGE_CAMBRICON_LIBS libcnrt libcndev) + if(CNRT_VERSION_STRING VERSION_GREATER "5.0.0") + include(cmake/cnnl.cmake) + include(cmake/cnlight.cmake) + include(cmake/magicmind.cmake) + list( + APPEND + MGE_CAMBRICON_LIBS + libcnnl + libcnnl_extra + libcnlight + libmagicmind + libmagicmind_runtime) + else() + include(cmake/cnml.cmake) + list(APPEND MGE_CAMBRICON_LIBS libcnml) + endif() + set(MGE_CAMBRICON_LIBS "${MGE_CAMBRICON_LIBS}") +endif() + +if(MGE_WITH_ROCM) + include(cmake/rocm.cmake) endif() -if (MGE_WITH_ROCM) - include(cmake/rocm.cmake) -endif () - if(MGE_WITH_ATLAS) - add_subdirectory(dnn/atlas-stub) - list(APPEND MGE_ATLAS_LIBS atlas-stub) - set(MGE_ATLAS_LIBS "${MGE_ATLAS_LIBS}") - set(MGB_ATLAS ${MGE_WITH_ATLAS}) + add_subdirectory(dnn/atlas-stub) + list(APPEND MGE_ATLAS_LIBS atlas-stub) + set(MGE_ATLAS_LIBS "${MGE_ATLAS_LIBS}") + set(MGB_ATLAS ${MGE_WITH_ATLAS}) endif() find_program(CCACHE_BIN ccache) if(CCACHE_BIN) - set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_BIN}) - if(MGE_WITH_CUDA AND NOT ${CMAKE_VERSION} VERSION_LESS "3.10.0") - message(STATUS "Using ccache as CMAKE_CUDA_COMPILER_LAUNCHER") - set(CMAKE_CUDA_COMPILER_LAUNCHER ${CCACHE_BIN}) - endif() + set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_BIN}) + if(MGE_WITH_CUDA AND NOT ${CMAKE_VERSION} 
VERSION_LESS "3.10.0") + message(STATUS "Using ccache as CMAKE_CUDA_COMPILER_LAUNCHER") + set(CMAKE_CUDA_COMPILER_LAUNCHER ${CCACHE_BIN}) + endif() endif() if(${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386") - if(${MGE_BLAS} STREQUAL "MKL") - include(cmake/mkl.cmake) - set(MGE_BLAS_LIBS libmkl) - elseif(${MGE_BLAS} STREQUAL "OpenBLAS") - include(cmake/OpenBLAS.cmake) - set(MGE_BLAS_LIBS libopenblas) - else() - message(FATAL_ERROR "Unknown BLAS implementation ${MGE_BLAS}") - endif() + if(${MGE_BLAS} STREQUAL "MKL") + include(cmake/mkl.cmake) + set(MGE_BLAS_LIBS libmkl) + elseif(${MGE_BLAS} STREQUAL "OpenBLAS") + include(cmake/OpenBLAS.cmake) + set(MGE_BLAS_LIBS libopenblas) + else() + message(FATAL_ERROR "Unknown BLAS implementation ${MGE_BLAS}") + endif() endif() # MKLDNN build if(MGE_WITH_MKLDNN AND ${MGE_ARCH} STREQUAL "x86_64") - include(cmake/MKL_DNN.cmake) - set(MEGDNN_X86_WITH_MKL_DNN 1) + include(cmake/MKL_DNN.cmake) + set(MEGDNN_X86_WITH_MKL_DNN 1) endif() # RTTI if(MGE_ENABLE_RTTI) - set(MEGDNN_ENABLE_MANGLING 0) - set(MEGDNN_ENABLE_RTTI 1) + set(MEGDNN_ENABLE_MANGLING 0) + set(MEGDNN_ENABLE_RTTI 1) else() - set(MEGDNN_ENABLE_MANGLING 1) - set(MEGDNN_ENABLE_RTTI 0) + set(MEGDNN_ENABLE_MANGLING 1) + set(MEGDNN_ENABLE_RTTI 0) endif() set(MGB_VERBOSE_TYPEINFO_NAME ${MGE_ENABLE_RTTI}) @@ -866,72 +1006,79 @@ set(MGB_ENABLE_JSON ${MGE_ENABLE_LOGGING}) # Exception if(NOT MGE_ENABLE_EXCEPTIONS) - message(STATUS "Exceptions disabled; MegEngine would kill itself when it is supposed to throw an exception.") + message( + STATUS + "Exceptions disabled; MegEngine would kill itself when it is supposed to throw an exception." + ) endif() set(MGB_ENABLE_EXCEPTION ${MGE_ENABLE_EXCEPTIONS}) set(MEGDNN_ENABLE_EXCEPTIONS ${MGE_ENABLE_EXCEPTIONS}) # JIT if(MGE_WITH_JIT AND MGE_WITH_HALIDE) - set(HALIDE_SHARED_LIBRARY OFF CACHE BOOL "Build as a shared library") - include(cmake/Halide.cmake) + set(HALIDE_SHARED_LIBRARY + OFF + CACHE BOOL "Build as a shared library") + include(cmake/Halide.cmake) endif() include(cmake/cpp_redis.cmake) # Thread -IF(APPLE) - set(CMAKE_THREAD_LIBS_INIT "-lpthread") - set(CMAKE_HAVE_THREADS_LIBRARY 1) - set(CMAKE_USE_WIN32_THREADS_INIT 0) - set(CMAKE_USE_PTHREADS_INIT 1) - set(THREADS_PREFER_PTHREAD_FLAG ON) - message(STATUS "disable jit, halide and mlir on macos host build...") - set(MGE_WITH_HALIDE OFF) - set(MGE_WITH_JIT OFF) - set(MGE_WITH_JIT_MLIR OFF) -ENDIF() +if(APPLE) + set(CMAKE_THREAD_LIBS_INIT "-lpthread") + set(CMAKE_HAVE_THREADS_LIBRARY 1) + set(CMAKE_USE_WIN32_THREADS_INIT 0) + set(CMAKE_USE_PTHREADS_INIT 1) + set(THREADS_PREFER_PTHREAD_FLAG ON) + message(STATUS "disable jit, halide and mlir on macos host build...") + set(MGE_WITH_HALIDE OFF) + set(MGE_WITH_JIT OFF) + set(MGE_WITH_JIT_MLIR OFF) +endif() set(MGB_JIT ${MGE_WITH_JIT}) set(MGB_JIT_MLIR ${MGE_WITH_JIT_MLIR}) set(MGB_JIT_HALIDE ${MGE_WITH_HALIDE}) # for consumer override MGB_C_OPR_INIT_FUNC symbol interface if(NOT "${CUSTOM_C_OPR_INIT_FUNC}" STREQUAL "") - add_compile_definitions(MGB_C_OPR_INIT_FUNC=${CUSTOM_C_OPR_INIT_FUNC}) - message(STATUS "override MGB_C_OPR_INIT_FUNC to ${CUSTOM_C_OPR_INIT_FUNC}") + add_compile_definitions(MGB_C_OPR_INIT_FUNC=${CUSTOM_C_OPR_INIT_FUNC}) + message(STATUS "override MGB_C_OPR_INIT_FUNC to ${CUSTOM_C_OPR_INIT_FUNC}") endif() set(MGB_CUSTOM_OP ${MGE_WITH_CUSTOM_OP}) if(MSVC OR WIN32) - set(CMAKE_HAVE_THREADS_LIBRARY 1) - set(CMAKE_USE_WIN32_THREADS_INIT 1) - set(CMAKE_USE_PTHREADS_INIT 1) - set(THREADS_PREFER_PTHREAD_FLAG ON) + 
set(CMAKE_HAVE_THREADS_LIBRARY 1) + set(CMAKE_USE_WIN32_THREADS_INIT 1) + set(CMAKE_USE_PTHREADS_INIT 1) + set(THREADS_PREFER_PTHREAD_FLAG ON) endif() -if(CMAKE_THREAD_LIBS_INIT OR CMAKE_USE_WIN32_THREADS_INIT OR ANDROID) - set(MGB_HAVE_THREAD 1) +if(CMAKE_THREAD_LIBS_INIT + OR CMAKE_USE_WIN32_THREADS_INIT + OR ANDROID) + set(MGB_HAVE_THREAD 1) endif() if(MSVC OR WIN32) - if(MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2) - message(STATUS "disable MGB_HAVE_THREAD/MGB_ENABLE_JSON when DEPLOY ON XP SP2") - set(MGB_HAVE_THREAD 0) - set(MGB_ENABLE_JSON 0) - endif() + if(MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2) + message(STATUS "disable MGB_HAVE_THREAD/MGB_ENABLE_JSON when DEPLOY ON XP SP2") + set(MGB_HAVE_THREAD 0) + set(MGB_ENABLE_JSON 0) + endif() endif() if(MGE_WITH_TEST) - # use intra-op multi threads - set(MEGDNN_ENABLE_MULTI_THREADS 1) + # use intra-op multi threads + set(MEGDNN_ENABLE_MULTI_THREADS 1) endif() # CUDA set(MGB_CUDA ${MGE_WITH_CUDA}) set(MEGDNN_WITH_CUDA ${MGE_WITH_CUDA}) -#ROCM +# ROCM set(MGB_ROCM ${MGE_WITH_ROCM}) set(MEGDNN_WITH_ROCM ${MGE_WITH_ROCM}) @@ -943,19 +1090,20 @@ set(MGB_ENFLAME ${MGE_WITH_ENFLAME}) set(MEGDNN_WITH_ENFLAME ${MGE_WITH_ENFLAME}) # Debug info -if(${CMAKE_BUILD_TYPE} STREQUAL "Debug" OR ${CMAKE_BUILD_TYPE} STREQUAL "RelWithDebInfo") - set(MGB_ASSERT_LOC 1) - set(MGB_ENABLE_DEBUG_UTIL 1) +if(${CMAKE_BUILD_TYPE} STREQUAL "Debug" OR ${CMAKE_BUILD_TYPE} STREQUAL + "RelWithDebInfo") + set(MGB_ASSERT_LOC 1) + set(MGB_ENABLE_DEBUG_UTIL 1) else() - set(MGB_ASSERT_LOC 0) - set(MGB_ENABLE_DEBUG_UTIL 0) + set(MGB_ASSERT_LOC 0) + set(MGB_ENABLE_DEBUG_UTIL 0) endif() if(MSVC OR WIN32) - if(${MGE_ARCH} STREQUAL "i386") - set(MGB_ENABLE_DEBUG_UTIL 0) - message(STATUS "disable MGB_ENABLE_DEBUG_UTIL at Windows i386 build") - endif() + if(${MGE_ARCH} STREQUAL "i386") + set(MGB_ENABLE_DEBUG_UTIL 0) + message(STATUS "disable MGB_ENABLE_DEBUG_UTIL at Windows i386 build") + endif() endif() # TensorRT @@ -963,11 +1111,11 @@ set(MGB_ENABLE_TENSOR_RT ${MGE_WITH_TRT}) # Inference only if(MGE_INFERENCE_ONLY AND NOT MGE_WITH_TEST) - set(MGB_ENABLE_GRAD 0) - set(MGB_BUILD_SLIM_SERVING 1) + set(MGB_ENABLE_GRAD 0) + set(MGB_BUILD_SLIM_SERVING 1) else() - set(MGB_ENABLE_GRAD 1) - set(MGB_BUILD_SLIM_SERVING 0) + set(MGB_ENABLE_GRAD 1) + set(MGB_BUILD_SLIM_SERVING 0) endif() # Distributed communication @@ -975,227 +1123,264 @@ set(MGB_ENABLE_OPR_MM ${MGE_WITH_DISTRIBUTED}) # MGE_ARCH related flags if(MGE_ARCH STREQUAL "x86_64" OR MGE_ARCH STREQUAL "i386") - if(MGE_BLAS STREQUAL "MKL") - set(MEGDNN_X86_WITH_MKL 1) - elseif(MGE_BLAS STREQUAL "OpenBLAS") - set(MEGDNN_X86_WITH_OPENBLAS 1) - endif() + if(MGE_BLAS STREQUAL "MKL") + set(MEGDNN_X86_WITH_MKL 1) + elseif(MGE_BLAS STREQUAL "OpenBLAS") + set(MEGDNN_X86_WITH_OPENBLAS 1) + endif() endif() # Enable Naive if(MGE_ARCH STREQUAL "naive") - set(MEGDNN_NAIVE 1) - message(STATUS "MEGDNN_NAIVE is enabled; MegDNN performance is degraded.") + set(MEGDNN_NAIVE 1) + message(STATUS "MEGDNN_NAIVE is enabled; MegDNN performance is degraded.") endif() if(MGE_ARCH STREQUAL "x86_64" OR MGE_ARCH STREQUAL "i386") - set(MEGDNN_X86 1) - if(MGE_ARCH STREQUAL "x86_64") - set(MEGDNN_X86_64 1) - set(MEGDNN_64_BIT 1) - if(NOT MSVC) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64") - endif() - else() - set(MEGDNN_X86_32 1) - if(NOT MSVC) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32") - endif() + set(MEGDNN_X86 1) + if(MGE_ARCH STREQUAL "x86_64") + set(MEGDNN_X86_64 1) + set(MEGDNN_64_BIT 1) + if(NOT MSVC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64") endif() + 
else() + set(MEGDNN_X86_32 1) if(NOT MSVC) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -mfpmath=sse") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32") endif() + endif() + if(NOT MSVC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -mfpmath=sse") + endif() endif() # dotprod is not enable by default on APPLE, cpuinfo has some problem on APPLE if(NOT APPLE AND ${CMAKE_C_COMPILER_ID} STREQUAL "Clang") - CHECK_CXX_COMPILER_FLAG("-march=armv8.2-a+dotprod" CXX_COMPILER_SUPPORT_DOT) - if(CXX_COMPILER_SUPPORT_DOT) - message(STATUS "Enable dotprod feature in armv8.2-a using MGB_ENABLE_DOT") - set(MGB_ENABLE_DOT 1) - endif() + check_cxx_compiler_flag("-march=armv8.2-a+dotprod" CXX_COMPILER_SUPPORT_DOT) + if(CXX_COMPILER_SUPPORT_DOT) + message(STATUS "Enable dotprod feature in armv8.2-a using MGB_ENABLE_DOT") + set(MGB_ENABLE_DOT 1) + endif() endif() if(MGE_ARCH STREQUAL "armv7") - # -funsafe-math-optimizations to enable neon auto-vectorization (since neon is not fully IEEE 754 compatible, GCC does not turn on neon auto-vectorization by default. - if(ANDROID) - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=softfp -mfpu=neon") - endif() - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -funsafe-math-optimizations") - set (MARCH "-march=armv7-a") - set (MEGDNN_ARMV7 1) + # -funsafe-math-optimizations to enable neon auto-vectorization (since neon is not + # fully IEEE 754 compatible, GCC does not turn on neon auto-vectorization by default. + if(ANDROID) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=softfp -mfpu=neon") + endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -funsafe-math-optimizations") + set(MARCH "-march=armv7-a") + set(MEGDNN_ARMV7 1) endif() if(MGE_ARCH STREQUAL "aarch64") - set(MEGDNN_AARCH64 1) - set(MEGDNN_64_BIT 1) - set(MARCH "-march=armv8-a") - set(MGB_AARCH64 1) - if(MGE_ARMV8_2_FEATURE_FP16) - message(STATUS "Enable fp16 feature support in armv8.2") - if(NOT ${MGE_DISABLE_FLOAT16}) - set(MEGDNN_ENABLE_FP16_NEON 1) - endif() - set(MARCH "-march=armv8.2-a+fp16") + set(MEGDNN_AARCH64 1) + set(MEGDNN_64_BIT 1) + set(MARCH "-march=armv8-a") + set(MGB_AARCH64 1) + if(MGE_ARMV8_2_FEATURE_FP16) + message(STATUS "Enable fp16 feature support in armv8.2") + if(NOT ${MGE_DISABLE_FLOAT16}) + set(MEGDNN_ENABLE_FP16_NEON 1) endif() + set(MARCH "-march=armv8.2-a+fp16") + endif() - if(MGE_WITH_CUDA) - message(WARNING "aarch64 ld will add -mfix-cortex-a53-843419 and -mfix-cortex-a53-835769,\ + if(MGE_WITH_CUDA) + message( + WARNING + "aarch64 ld will add -mfix-cortex-a53-843419 and -mfix-cortex-a53-835769,\ when cuda enable and CMAKE with DEBUG build type,ld will take about 14min+,\ for save link time(14min->1min), you may open below flags if not deploy on\ arm a53 platform, or just build release type!") - #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mno-fix-cortex-a53-843419 -mno-fix-cortex-a53-835769") - endif() + # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mno-fix-cortex-a53-843419 + # -mno-fix-cortex-a53-835769") + endif() endif() if(MGE_ARCH STREQUAL "riscv64") - set(MEGDNN_RISCV64 1) - set(MEGDNN_64_BIT 1) + set(MEGDNN_RISCV64 1) + set(MEGDNN_64_BIT 1) endif() -set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MARCH}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MARCH}") -set(MGE_VERSION_SCRIPT ${PROJECT_SOURCE_DIR}/src/version.ld CACHE INTERNAL "Path to linker version script") +set(MGE_VERSION_SCRIPT + ${PROJECT_SOURCE_DIR}/src/version.ld + CACHE INTERNAL "Path to linker version script") -# Write out megbrain_build_config.h -# It defines macros needed by both megbrain and dnn 
-configure_file(src/megbrain_build_config.h.in ${CMAKE_CURRENT_BINARY_DIR}/genfiles/megbrain_build_config.h)
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/genfiles/megbrain_build_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+# Write out megbrain_build_config.h It defines macros needed by both megbrain and dnn
+configure_file(src/megbrain_build_config.h.in
+               ${CMAKE_CURRENT_BINARY_DIR}/genfiles/megbrain_build_config.h)
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/genfiles/megbrain_build_config.h
+        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
 add_subdirectory(dnn)
-list(APPEND MGB_OPR_PARAM_DEFS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/tools/param_defs/mgb_opr_param_defs.py)
+list(APPEND MGB_OPR_PARAM_DEFS_SRCS
+     ${CMAKE_CURRENT_SOURCE_DIR}/tools/param_defs/mgb_opr_param_defs.py)
 set(MGB_OPR_PARAM_DEFS_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/dnn/scripts/gen_param_defs.py)
 set(MGB_OPR_PARAM_DEFS_OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/src/opr/include/)
 file(MAKE_DIRECTORY ${MGB_OPR_PARAM_DEFS_OUT_DIR}/megbrain/opr)
 add_custom_command(
-    OUTPUT ${MGB_OPR_PARAM_DEFS_OUT_DIR}/megbrain/opr/param_defs.h
-    COMMAND ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${MGB_OPR_PARAM_DEFS_SCRIPT} ${MGB_OPR_PARAM_DEFS_SRCS} ${MGB_OPR_PARAM_DEFS_OUT_DIR}/megbrain/opr/param_defs.h
-    DEPENDS ${MGB_OPR_PARAM_DEFS_SRCS} ${MGB_OPR_PARAM_DEFS_SCRIPT}
-    VERBATIM
-)
+  OUTPUT ${MGB_OPR_PARAM_DEFS_OUT_DIR}/megbrain/opr/param_defs.h
+  COMMAND
+    ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${MGB_OPR_PARAM_DEFS_SCRIPT}
+    ${MGB_OPR_PARAM_DEFS_SRCS} ${MGB_OPR_PARAM_DEFS_OUT_DIR}/megbrain/opr/param_defs.h
+  DEPENDS ${MGB_OPR_PARAM_DEFS_SRCS} ${MGB_OPR_PARAM_DEFS_SCRIPT}
+  VERBATIM)
 list(APPEND MGB_OPR_PARAM_DEFS_OUTS
-    ${MGB_OPR_PARAM_DEFS_OUT_DIR}/megbrain/opr/param_defs.h
-)
+     ${MGB_OPR_PARAM_DEFS_OUT_DIR}/megbrain/opr/param_defs.h)
-install(FILES ${MGB_OPR_PARAM_DEFS_OUTS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/megbrain/opr/)
+install(FILES ${MGB_OPR_PARAM_DEFS_OUTS}
+        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/megbrain/opr/)
 list(APPEND MGB_OPR_PARAM_DEFS_INC ${MGB_OPR_PARAM_DEFS_OUT_DIR})
 add_custom_target(_mgb_opr_param_defs DEPENDS ${MGB_OPR_PARAM_DEFS_OUTS})
 add_library(mgb_opr_param_defs INTERFACE)
-target_include_directories(mgb_opr_param_defs
-    INTERFACE
-    $<BUILD_INTERFACE:${MGB_OPR_PARAM_DEFS_INC}>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
-)
+target_include_directories(
+  mgb_opr_param_defs INTERFACE $<BUILD_INTERFACE:${MGB_OPR_PARAM_DEFS_INC}>
+                               $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
 add_dependencies(mgb_opr_param_defs _mgb_opr_param_defs)
 install(TARGETS mgb_opr_param_defs EXPORT ${MGE_EXPORT_TARGETS})
 if(MGE_WITH_JIT_MLIR OR MGE_BUILD_IMPERATIVE_RT)
-    # generate param_defs.td
-    set(MGE_GENFILE_DIR ${PROJECT_BINARY_DIR}/src/genfiles)
-    set(MGE_GEN_IR_DIR ${PROJECT_BINARY_DIR}/src/core/include/megbrain/ir)
-    set(OPR_PARAM_DEFS_SRCS ${MGE_GENFILE_DIR}/opr_param_defs.py)
-    set(OPR_PARAM_DEFS_SCRIPT ${PROJECT_SOURCE_DIR}/dnn/scripts/gen_tablegen.py)
-    set(OPR_PARAM_DEFS_OUT ${MGE_GEN_IR_DIR}/param_defs.td)
-    file(COPY ${PROJECT_SOURCE_DIR}/dnn/scripts/opr_param_defs.py DESTINATION ${MGE_GENFILE_DIR})
-    file(READ ${PROJECT_SOURCE_DIR}/tools/param_defs/mgb_opr_param_defs.py CONTENTS)
-    file(APPEND ${OPR_PARAM_DEFS_SRCS} ${CONTENTS})
-    file(MAKE_DIRECTORY ${MGE_GEN_IR_DIR})
-    add_custom_command(
-        OUTPUT ${OPR_PARAM_DEFS_OUT}
-        COMMAND ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${OPR_PARAM_DEFS_SCRIPT} ${OPR_PARAM_DEFS_SRCS} ${OPR_PARAM_DEFS_OUT}
-        DEPENDS ${PROJECT_SOURCE_DIR}/dnn/scripts/opr_param_defs.py ${PROJECT_SOURCE_DIR}/tools/param_defs/mgb_opr_param_defs.py ${OPR_PARAM_DEFS_SCRIPT}
-        VERBATIM
-    )
-    # mlir tblgen sources
-    set(MGE_IR_DIR ${PROJECT_SOURCE_DIR}/src/core/include/megbrain/ir)
-    set(MGE_IR_INCLUDE_DIRS ${MLIR_LLVM_INCLUDE_DIR} ${MGE_IR_DIR} ${MGE_GEN_IR_DIR})
-    list(TRANSFORM MGE_IR_INCLUDE_DIRS PREPEND "-I")
-    file(GLOB_RECURSE MGE_IR_TDS ${MGE_IR_DIR}/*.td)
-    add_custom_target(param_defs_tblgen DEPENDS ${OPR_PARAM_DEFS_OUT})
+  # generate param_defs.td
+  set(MGE_GENFILE_DIR ${PROJECT_BINARY_DIR}/src/genfiles)
+  set(MGE_GEN_IR_DIR ${PROJECT_BINARY_DIR}/src/core/include/megbrain/ir)
+  set(OPR_PARAM_DEFS_SRCS ${MGE_GENFILE_DIR}/opr_param_defs.py)
+  set(OPR_PARAM_DEFS_SCRIPT ${PROJECT_SOURCE_DIR}/dnn/scripts/gen_tablegen.py)
+  set(OPR_PARAM_DEFS_OUT ${MGE_GEN_IR_DIR}/param_defs.td)
+  file(COPY ${PROJECT_SOURCE_DIR}/dnn/scripts/opr_param_defs.py
+       DESTINATION ${MGE_GENFILE_DIR})
+  file(READ ${PROJECT_SOURCE_DIR}/tools/param_defs/mgb_opr_param_defs.py CONTENTS)
+  file(APPEND ${OPR_PARAM_DEFS_SRCS} ${CONTENTS})
+  file(MAKE_DIRECTORY ${MGE_GEN_IR_DIR})
+  add_custom_command(
+    OUTPUT ${OPR_PARAM_DEFS_OUT}
+    COMMAND ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${OPR_PARAM_DEFS_SCRIPT}
+            ${OPR_PARAM_DEFS_SRCS} ${OPR_PARAM_DEFS_OUT}
+    DEPENDS ${PROJECT_SOURCE_DIR}/dnn/scripts/opr_param_defs.py
+            ${PROJECT_SOURCE_DIR}/tools/param_defs/mgb_opr_param_defs.py
+            ${OPR_PARAM_DEFS_SCRIPT}
+    VERBATIM)
+  # mlir tblgen sources
+  set(MGE_IR_DIR ${PROJECT_SOURCE_DIR}/src/core/include/megbrain/ir)
+  set(MGE_IR_INCLUDE_DIRS ${MLIR_LLVM_INCLUDE_DIR} ${MGE_IR_DIR} ${MGE_GEN_IR_DIR})
+  list(TRANSFORM MGE_IR_INCLUDE_DIRS PREPEND "-I")
+  file(GLOB_RECURSE MGE_IR_TDS ${MGE_IR_DIR}/*.td)
+  add_custom_target(param_defs_tblgen DEPENDS ${OPR_PARAM_DEFS_OUT})
 endif()
 if(MGE_WITH_DISTRIBUTED)
-    set(MEGRAY_WITH_NCCL ${MGE_WITH_CUDA} CACHE BOOL "Override MegRay option" FORCE)
-    set(MEGRAY_WITH_SHM ${MGE_WITH_CUDA} CACHE BOOL "Override MegRay option" FORCE)
-    set(MEGRAY_WITH_RCCL ${MGE_WITH_ROCM} CACHE BOOL "Override MegRay option" FORCE)
-    set(MEGRAY_CUDA_GENCODE ${MGE_CUDA_GENCODE} CACHE STRING "Overwrite MegRay CUDA -gencode specifications" FORCE)
-    add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/MegRay)
+  set(MEGRAY_WITH_NCCL
+      ${MGE_WITH_CUDA}
+      CACHE BOOL "Override MegRay option" FORCE)
+  set(MEGRAY_WITH_SHM
+      ${MGE_WITH_CUDA}
+      CACHE BOOL "Override MegRay option" FORCE)
+  set(MEGRAY_WITH_RCCL
+      ${MGE_WITH_ROCM}
+      CACHE BOOL "Override MegRay option" FORCE)
+  set(MEGRAY_CUDA_GENCODE
+      ${MGE_CUDA_GENCODE}
+      CACHE STRING "Overwrite MegRay CUDA -gencode specifications" FORCE)
+  add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/MegRay)
 endif()
 add_subdirectory(src)
 if(MGE_BUILD_IMPERATIVE_RT)
-    add_subdirectory(imperative)
-    message(STATUS "Enable imperative python wrapper runtime")
+  add_subdirectory(imperative)
+  message(STATUS "Enable imperative python wrapper runtime")
 endif()
 if(MGE_WITH_TEST AND MGE_ENABLE_RTTI)
-    add_subdirectory(test)
+  add_subdirectory(test)
 endif()
 if(TARGET _imperative_rt)
-    add_custom_target(
-        develop
-        COMMAND ${CMAKE_COMMAND} -E create_symlink
-            ${CMAKE_CURRENT_BINARY_DIR}/imperative/python/${PACKAGE_NAME}/core/$<TARGET_FILE_NAME:_imperative_rt>
-            ${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/core/$<TARGET_FILE_NAME:_imperative_rt>
-        COMMAND ${CMAKE_COMMAND} -E create_symlink
-            ${CMAKE_CURRENT_BINARY_DIR}/imperative/python/${PACKAGE_NAME}/version.py
-            ${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/version.py
-        DEPENDS _imperative_rt
-        VERBATIM
-    )
-endif()
-
-# Configure and install pkg-config.
-# Note that unlike the Config.cmake modules, this is not relocatable (and not
-# really portable) because we have two dependencies without pkg-config
-# descriptions: FlatBuffers and MKL-DNN
-if (MGE_USE_SYSTEM_MKLDNN)
-    set (MGE_PKGCONFIG_LIBS_PRIVATE "-ldnnl")
-endif()
-if (MGE_USE_SYSTEM_OPENBLAS)
-    set (MGE_PKGCONFIG_LIBS_PRIVATE "${MGE_PKGCONFIG_LIBS_PRIVATE} -lopenblas")
-endif()
-configure_file(cmake/megengine.pc.in
-    ${CMAKE_CURRENT_BINARY_DIR}/megengine.pc
-    @ONLY)
+  add_custom_target(
+    develop
+    COMMAND
+      ${CMAKE_COMMAND} -E create_symlink
+      ${CMAKE_CURRENT_BINARY_DIR}/imperative/python/${PACKAGE_NAME}/core/$<TARGET_FILE_NAME:_imperative_rt>
+      ${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/core/$<TARGET_FILE_NAME:_imperative_rt>
+    COMMAND
+      ${CMAKE_COMMAND} -E create_symlink
+      ${CMAKE_CURRENT_BINARY_DIR}/imperative/python/${PACKAGE_NAME}/version.py
+      ${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/version.py
+    COMMAND
+      ${CMAKE_COMMAND} -E create_symlink ${CMAKE_CURRENT_SOURCE_DIR}/src/custom/include
+      ${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/core/include
+    COMMAND ${CMAKE_COMMAND} -E make_directory
+            ${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/core/lib
+    COMMAND
+      ${CMAKE_COMMAND} -E create_symlink
+      ${CMAKE_CURRENT_BINARY_DIR}/src/$<TARGET_FILE_NAME:megengine_shared>
+      ${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/core/lib/$<TARGET_FILE_NAME:megengine_shared>
+    DEPENDS _imperative_rt
+    VERBATIM)
+endif()
+
+# Configure and install pkg-config. Note that unlike the Config.cmake modules, this is
+# not relocatable (and not really portable) because we have two dependencies without
+# pkg-config descriptions: FlatBuffers and MKL-DNN
+if(MGE_USE_SYSTEM_MKLDNN)
+  set(MGE_PKGCONFIG_LIBS_PRIVATE "-ldnnl")
+endif()
+if(MGE_USE_SYSTEM_OPENBLAS)
+  set(MGE_PKGCONFIG_LIBS_PRIVATE "${MGE_PKGCONFIG_LIBS_PRIVATE} -lopenblas")
+endif()
+configure_file(cmake/megengine.pc.in ${CMAKE_CURRENT_BINARY_DIR}/megengine.pc @ONLY)
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/megengine.pc
         DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
 # Do not export targets if MGE_WITH_DISTRIBUTED is on. MegRay is not ready.
-if (NOT MGE_WITH_DISTRIBUTED) - include(CMakePackageConfigHelpers) - set (MGE_INSTALL_CMAKEDIR ${CMAKE_INSTALL_LIBDIR}/cmake/MegEngine) - configure_package_config_file(cmake/MegEngineConfig.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/MegEngineConfig.cmake - INSTALL_DESTINATION ${MGE_INSTALL_CMAKEDIR} - ) - write_basic_package_version_file( - ${CMAKE_CURRENT_BINARY_DIR}/MegEngineConfigVersion.cmake - VERSION ${MGB_VER_STRING} - COMPATIBILITY SameMajorVersion) - - install(EXPORT ${MGE_EXPORT_TARGETS} DESTINATION ${MGE_INSTALL_CMAKEDIR}) - install(FILES ${CMAKE_CURRENT_BINARY_DIR}/MegEngineConfig.cmake +if(NOT MGE_WITH_DISTRIBUTED) + include(CMakePackageConfigHelpers) + set(MGE_INSTALL_CMAKEDIR ${CMAKE_INSTALL_LIBDIR}/cmake/MegEngine) + configure_package_config_file( + cmake/MegEngineConfig.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/MegEngineConfig.cmake + INSTALL_DESTINATION ${MGE_INSTALL_CMAKEDIR}) + write_basic_package_version_file( + ${CMAKE_CURRENT_BINARY_DIR}/MegEngineConfigVersion.cmake + VERSION ${MGB_VER_STRING} + COMPATIBILITY SameMajorVersion) + + install(EXPORT ${MGE_EXPORT_TARGETS} DESTINATION ${MGE_INSTALL_CMAKEDIR}) + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/MegEngineConfig.cmake ${CMAKE_CURRENT_BINARY_DIR}/MegEngineConfigVersion.cmake - DESTINATION ${MGE_INSTALL_CMAKEDIR}) + DESTINATION ${MGE_INSTALL_CMAKEDIR}) endif() if(MGE_WITH_JIT_MLIR) - add_subdirectory(tools/mlir/mgb-opt) - add_subdirectory(tools/mlir/mgb-file-check) -endif() - -if(MGE_WITH_CUDA AND MGE_CUDA_USE_STATIC AND("${CUDNN_VERSION}" VERSION_GREATER "8.0.0" OR "${CUDNN_VERSION}" VERSION_EQUAL "8.0.0") AND (NOT MGE_WITH_CUDNN_SHARED)) - message(WARNING "Static link CUDNN8 with many sm is unworkable, please use -DMGE_WITH_CUDNN_SHARED=ON or -DMGE_WITH_LARGE_ARCHIVE=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_70,code=sm_70 arch=compute_75,code=sm_75\" ") - message(WARNING "Static link CUDNN8 with many sm is unworkable, please use -DMGE_WITH_CUDNN_SHARED=ON or -DMGE_WITH_LARGE_ARCHIVE=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_70,code=sm_70 arch=compute_75,code=sm_75\" ") - message(WARNING "Static link CUDNN8 with many sm is unworkable, please use -DMGE_WITH_CUDNN_SHARED=ON or -DMGE_WITH_LARGE_ARCHIVE=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_70,code=sm_70 arch=compute_75,code=sm_75\" ") + add_subdirectory(tools/mlir/mgb-opt) + add_subdirectory(tools/mlir/mgb-file-check) +endif() + +if(MGE_WITH_CUDA + AND MGE_CUDA_USE_STATIC + AND ("${CUDNN_VERSION}" VERSION_GREATER "8.0.0" OR "${CUDNN_VERSION}" VERSION_EQUAL + "8.0.0") + AND (NOT MGE_WITH_CUDNN_SHARED)) + message( + WARNING + "Static link CUDNN8 with many sm is unworkable, please use -DMGE_WITH_CUDNN_SHARED=ON or -DMGE_WITH_LARGE_ARCHIVE=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_70,code=sm_70 arch=compute_75,code=sm_75\" " + ) + message( + WARNING + "Static link CUDNN8 with many sm is unworkable, please use -DMGE_WITH_CUDNN_SHARED=ON or -DMGE_WITH_LARGE_ARCHIVE=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_70,code=sm_70 arch=compute_75,code=sm_75\" " + ) + message( + WARNING + "Static link CUDNN8 with many sm is unworkable, please use -DMGE_WITH_CUDNN_SHARED=ON or -DMGE_WITH_LARGE_ARCHIVE=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_70,code=sm_70 arch=compute_75,code=sm_75\" " + ) endif() if(MGE_WITH_LITE) - add_subdirectory(lite) + add_subdirectory(lite) endif() diff --git a/ci/cmake.sh b/ci/cmake.sh index 8d8c55bf..4808e63e 100755 --- a/ci/cmake.sh +++ b/ci/cmake.sh @@ -27,7 +27,8 @@ function build() { -DMGE_WITH_DISTRIBUTED=${DMGE_WITH_DISTRIBUTED} \ 
-DMGE_WITH_CUDA=${DMGE_WITH_CUDA} \ -DMGE_WITH_TEST=ON \ - -DCMAKE_BUILD_TYPE=RelWithDebInfo + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DMGE_WITH_CUSTOM_OP=ON make -j$(($(nproc) * 2)) -I ${build_dir} make develop popd >/dev/null diff --git a/cmake/BuildFlatBuffers.cmake b/cmake/BuildFlatBuffers.cmake index 91cf5f97..ea85ae99 100644 --- a/cmake/BuildFlatBuffers.cmake +++ b/cmake/BuildFlatBuffers.cmake @@ -1,59 +1,56 @@ # Copyright 2015 Google Inc. All rights reserved. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this +# file except in compliance with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. -# General function to create FlatBuffer build rules for the given list of -# schemas. +# General function to create FlatBuffer build rules for the given list of schemas. # # flatbuffers_schemas: A list of flatbuffer schema files to process. # -# schema_include_dirs: A list of schema file include directories, which will be -# passed to flatc via the -I parameter. +# schema_include_dirs: A list of schema file include directories, which will be passed +# to flatc via the -I parameter. # -# custom_target_name: The generated files will be added as dependencies for a -# new custom target with this name. You should add that target as a dependency -# for your main target to ensure these files are built. You can also retrieve -# various properties from this target, such as GENERATED_INCLUDES_DIR, -# BINARY_SCHEMAS_DIR, and COPY_TEXT_SCHEMAS_DIR. +# custom_target_name: The generated files will be added as dependencies for a new custom +# target with this name. You should add that target as a dependency for your main target +# to ensure these files are built. You can also retrieve various properties from this +# target, such as GENERATED_INCLUDES_DIR, BINARY_SCHEMAS_DIR, and COPY_TEXT_SCHEMAS_DIR. # -# additional_dependencies: A list of additional dependencies that you'd like -# all generated files to depend on. Pass in a blank string if you have none. +# additional_dependencies: A list of additional dependencies that you'd like all +# generated files to depend on. Pass in a blank string if you have none. # -# generated_includes_dir: Where to generate the C++ header files for these -# schemas. The generated includes directory will automatically be added to -# CMake's include directories, and will be where generated header files are -# placed. This parameter is optional; pass in empty string if you don't want to -# generate include files for these schemas. +# generated_includes_dir: Where to generate the C++ header files for these schemas. 
The +# generated includes directory will automatically be added to CMake's include +# directories, and will be where generated header files are placed. This parameter is +# optional; pass in empty string if you don't want to generate include files for these +# schemas. # -# binary_schemas_dir: If you specify an optional binary schema directory, binary -# schemas will be generated for these schemas as well, and placed into the given -# directory. +# binary_schemas_dir: If you specify an optional binary schema directory, binary schemas +# will be generated for these schemas as well, and placed into the given directory. # -# copy_text_schemas_dir: If you want all text schemas (including schemas from -# all schema include directories) copied into a directory (for example, if you -# need them within your project to build JSON files), you can specify that -# folder here. All text schemas will be copied to that folder. +# copy_text_schemas_dir: If you want all text schemas (including schemas from all schema +# include directories) copied into a directory (for example, if you need them within +# your project to build JSON files), you can specify that folder here. All text schemas +# will be copied to that folder. # -# IMPORTANT: Make sure you quote all list arguments you pass to this function! -# Otherwise CMake will only pass in the first element. -# Example: build_flatbuffers("${fb_files}" "${include_dirs}" target_name ...) -function(build_flatbuffers flatbuffers_schemas - schema_include_dirs - custom_target_name - additional_dependencies - generated_includes_dir - binary_schemas_dir - copy_text_schemas_dir) +# IMPORTANT: Make sure you quote all list arguments you pass to this function! Otherwise +# CMake will only pass in the first element. Example: build_flatbuffers("${fb_files}" +# "${include_dirs}" target_name ...) +function( + build_flatbuffers + flatbuffers_schemas + schema_include_dirs + custom_target_name + additional_dependencies + generated_includes_dir + binary_schemas_dir + copy_text_schemas_dir) # Test if including from FindFlatBuffers if(FLATBUFFERS_FLATC_EXECUTABLE) @@ -65,10 +62,7 @@ function(build_flatbuffers flatbuffers_schemas endif() set(FLATC_SCHEMA_ARGS --gen-mutable) if(FLATBUFFERS_FLATC_SCHEMA_EXTRA_ARGS) - set(FLATC_SCHEMA_ARGS - ${FLATBUFFERS_FLATC_SCHEMA_EXTRA_ARGS} - ${FLATC_SCHEMA_ARGS} - ) + set(FLATC_SCHEMA_ARGS ${FLATBUFFERS_FLATC_SCHEMA_EXTRA_ARGS} ${FLATC_SCHEMA_ARGS}) endif() set(working_dir "${CMAKE_CURRENT_SOURCE_DIR}") @@ -77,12 +71,12 @@ function(build_flatbuffers flatbuffers_schemas # Generate the include files parameters. set(include_params "") set(all_generated_files "") - foreach (include_dir ${schema_include_dirs}) + foreach(include_dir ${schema_include_dirs}) set(include_params -I ${include_dir} ${include_params}) - if (NOT ${copy_text_schemas_dir} STREQUAL "") + if(NOT ${copy_text_schemas_dir} STREQUAL "") # Copy text schemas from dependent folders. file(GLOB_RECURSE dependent_schemas ${include_dir}/${schema_glob}) - foreach (dependent_schema ${dependent_schemas}) + foreach(dependent_schema ${dependent_schemas}) file(COPY ${dependent_schema} DESTINATION ${copy_text_schemas_dir}) endforeach() endif() @@ -91,62 +85,54 @@ function(build_flatbuffers flatbuffers_schemas foreach(schema ${flatbuffers_schemas}) get_filename_component(filename ${schema} NAME_WE) # For each schema, do the things we requested. 
- if (NOT ${generated_includes_dir} STREQUAL "") + if(NOT ${generated_includes_dir} STREQUAL "") set(generated_include ${generated_includes_dir}/${filename}_generated.h) add_custom_command( OUTPUT ${generated_include} - COMMAND ${FLATC} ${FLATC_SCHEMA_ARGS} - -o ${generated_includes_dir} - ${include_params} - -c ${schema} + COMMAND ${FLATC} ${FLATC_SCHEMA_ARGS} -o ${generated_includes_dir} + ${include_params} -c ${schema} DEPENDS ${FLATC_TARGET} ${schema} ${additional_dependencies} WORKING_DIRECTORY "${working_dir}") list(APPEND all_generated_files ${generated_include}) endif() - if (NOT ${binary_schemas_dir} STREQUAL "") + if(NOT ${binary_schemas_dir} STREQUAL "") set(binary_schema ${binary_schemas_dir}/${filename}.bfbs) add_custom_command( OUTPUT ${binary_schema} - COMMAND ${FLATC} -b --schema - -o ${binary_schemas_dir} - ${include_params} - ${schema} + COMMAND ${FLATC} -b --schema -o ${binary_schemas_dir} ${include_params} + ${schema} DEPENDS ${FLATC_TARGET} ${schema} ${additional_dependencies} WORKING_DIRECTORY "${working_dir}") list(APPEND all_generated_files ${binary_schema}) endif() - if (NOT ${copy_text_schemas_dir} STREQUAL "") + if(NOT ${copy_text_schemas_dir} STREQUAL "") file(COPY ${schema} DESTINATION ${copy_text_schemas_dir}) endif() endforeach() - # Create a custom target that depends on all the generated files. - # This is the target that you can depend on to trigger all these - # to be built. - add_custom_target(${custom_target_name} - DEPENDS ${all_generated_files} ${additional_dependencies}) + # Create a custom target that depends on all the generated files. This is the target + # that you can depend on to trigger all these to be built. + add_custom_target(${custom_target_name} DEPENDS ${all_generated_files} + ${additional_dependencies}) # Register the include directory we are using. - if (NOT ${generated_includes_dir} STREQUAL "") + if(NOT ${generated_includes_dir} STREQUAL "") include_directories(${generated_includes_dir}) - set_property(TARGET ${custom_target_name} - PROPERTY GENERATED_INCLUDES_DIR - ${generated_includes_dir}) + set_property(TARGET ${custom_target_name} PROPERTY GENERATED_INCLUDES_DIR + ${generated_includes_dir}) endif() # Register the binary schemas dir we are using. - if (NOT ${binary_schemas_dir} STREQUAL "") - set_property(TARGET ${custom_target_name} - PROPERTY BINARY_SCHEMAS_DIR - ${binary_schemas_dir}) + if(NOT ${binary_schemas_dir} STREQUAL "") + set_property(TARGET ${custom_target_name} PROPERTY BINARY_SCHEMAS_DIR + ${binary_schemas_dir}) endif() # Register the text schema copy dir we are using. 
- if (NOT ${copy_text_schemas_dir} STREQUAL "") - set_property(TARGET ${custom_target_name} - PROPERTY COPY_TEXT_SCHEMAS_DIR - ${copy_text_schemas_dir}) + if(NOT ${copy_text_schemas_dir} STREQUAL "") + set_property(TARGET ${custom_target_name} PROPERTY COPY_TEXT_SCHEMAS_DIR + ${copy_text_schemas_dir}) endif() endfunction() diff --git a/cmake/FetchMegBrainVersion.cmake b/cmake/FetchMegBrainVersion.cmake index 0de834ce..80d3f27a 100644 --- a/cmake/FetchMegBrainVersion.cmake +++ b/cmake/FetchMegBrainVersion.cmake @@ -1,49 +1,45 @@ -# Parses the version set in src/core/include/megbrain/version.h -# Exports the following variables: -# MGB_VER_MAJOR: Major version -# MGB_VER_MINOR: Minor version -# MGB_VER_PATCH: Patch version -# MGB_IS_DEV: Is development version -# MGB_VER_STRING: Version string +# Parses the version set in src/core/include/megbrain/version.h Exports the following +# variables: MGB_VER_MAJOR: Major version MGB_VER_MINOR: Minor version MGB_VER_PATCH: +# Patch version MGB_IS_DEV: Is development version MGB_VER_STRING: Version string option(MGB_FORCE_DEV_VERSION "Force -dev tag in version stamp" OFF) -file (READ "${CMAKE_CURRENT_SOURCE_DIR}/src/core/include/megbrain/version.h" content) +file(READ "${CMAKE_CURRENT_SOURCE_DIR}/src/core/include/megbrain/version.h" content) -string (REGEX MATCH "MGB_MAJOR +([0-9]+)" _ ${content}) -set (MGB_VER_MAJOR ${CMAKE_MATCH_1}) +string(REGEX MATCH "MGB_MAJOR +([0-9]+)" _ ${content}) +set(MGB_VER_MAJOR ${CMAKE_MATCH_1}) -string (REGEX MATCH "MGB_MINOR +([0-9]+)" _ ${content}) -set (MGB_VER_MINOR ${CMAKE_MATCH_1}) +string(REGEX MATCH "MGB_MINOR +([0-9]+)" _ ${content}) +set(MGB_VER_MINOR ${CMAKE_MATCH_1}) -string (REGEX MATCH "MGB_PATCH *([0-9]+)" _ ${content}) -set (MGB_VER_PATCH ${CMAKE_MATCH_1}) +string(REGEX MATCH "MGB_PATCH *([0-9]+)" _ ${content}) +set(MGB_VER_PATCH ${CMAKE_MATCH_1}) -string (REGEX MATCH "MGE_MAJOR +([0-9]+)" _ ${content}) -set (MGE_VER_MAJOR ${CMAKE_MATCH_1}) +string(REGEX MATCH "MGE_MAJOR +([0-9]+)" _ ${content}) +set(MGE_VER_MAJOR ${CMAKE_MATCH_1}) -string (REGEX MATCH "MGE_MINOR +([0-9]+)" _ ${content}) -set (MGE_VER_MINOR ${CMAKE_MATCH_1}) +string(REGEX MATCH "MGE_MINOR +([0-9]+)" _ ${content}) +set(MGE_VER_MINOR ${CMAKE_MATCH_1}) -string (REGEX MATCH "MGE_PATCH *([0-9]+)" _ ${content}) -set (MGE_VER_PATCH ${CMAKE_MATCH_1}) +string(REGEX MATCH "MGE_PATCH *([0-9]+)" _ ${content}) +set(MGE_VER_PATCH ${CMAKE_MATCH_1}) -string (REGEX MATCH "MGE_EXTRA_NAME *\"(.*)\"" _ ${content}) -set (MGE_EXTRA_NAME ${CMAKE_MATCH_1}) +string(REGEX MATCH "MGE_EXTRA_NAME *\"(.*)\"" _ ${content}) +set(MGE_EXTRA_NAME ${CMAKE_MATCH_1}) -if (MGB_FORCE_DEV_VERSION) - set (MGB_IS_DEV 1) +if(MGB_FORCE_DEV_VERSION) + set(MGB_IS_DEV 1) else() - string (REGEX MATCH "MGB_IS_DEV +([01])" _ ${content}) - set (MGB_IS_DEV ${CMAKE_MATCH_1}) + string(REGEX MATCH "MGB_IS_DEV +([01])" _ ${content}) + set(MGB_IS_DEV ${CMAKE_MATCH_1}) endif() -if (DEFINED MGB_VER_MAJOR) - set (MGB_VER_STRING "${MGB_VER_MAJOR}.${MGB_VER_MINOR}.${MGB_VER_PATCH}") +if(DEFINED MGB_VER_MAJOR) + set(MGB_VER_STRING "${MGB_VER_MAJOR}.${MGB_VER_MINOR}.${MGB_VER_PATCH}") else() - set (MGB_VER_STRING "${MGE_VER_MAJOR}.${MGE_VER_MINOR}.${MGE_VER_PATCH}") + set(MGB_VER_STRING "${MGE_VER_MAJOR}.${MGE_VER_MINOR}.${MGE_VER_PATCH}") endif(DEFINED MGB_VER_MAJOR) -if (MGB_IS_DEV) - set (MGB_VER_STRING "${MGB_VER_STRING}-dev") +if(MGB_IS_DEV) + set(MGB_VER_STRING "${MGB_VER_STRING}-dev") endif() message(STATUS "Building MegBrain ${MGB_VER_STRING}") diff --git a/cmake/Halide.cmake 
b/cmake/Halide.cmake index 2dc8ecab..760f2384 100644 --- a/cmake/Halide.cmake +++ b/cmake/Halide.cmake @@ -2,31 +2,40 @@ include(ExternalProject) find_package(LLVM 6.0 REQUIRED CONFIG) -STRING(REPLACE "." ";" LLVM_VERSION_LIST ${LLVM_PACKAGE_VERSION}) +string(REPLACE "." ";" LLVM_VERSION_LIST ${LLVM_PACKAGE_VERSION}) list(GET LLVM_VERSION_LIST 0 LLVM_VERSION_MAJOR) list(GET LLVM_VERSION_LIST 1 LLVM_VERSION_MINOR) -set(HALIDE_DIR "${PROJECT_SOURCE_DIR}/third_party/Halide" CACHE STRING "halide directory") +set(HALIDE_DIR + "${PROJECT_SOURCE_DIR}/third_party/Halide" + CACHE STRING "halide directory") set(HALIDE_BUILD_DIR ${PROJECT_BINARY_DIR}/third_party/Halide) set(HALIDE_LIB ${HALIDE_BUILD_DIR}/lib/libHalide.a) -ExternalProject_add( - halide - SOURCE_DIR ${HALIDE_DIR} - PREFIX ${HALIDE_BUILD_DIR} - CMAKE_ARGS -DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER} -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} -DCMAKE_INSTALL_PREFIX=${HALIDE_BUILD_DIR} -DWITH_APPS=OFF -DWITH_TESTS=OFF -DWITH_TUTORIALS=OFF -DHALIDE_SHARED_LIBRARY=OFF -DHALIDE_REQUIRE_LLVM_VERSION=${LLVM_VERSION_MAJOR}${LLVM_VERSION_MINOR} -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DTARGET_MIPS=OFF -DTARGET_POWERPC=OFF - BUILD_BYPRODUCTS ${HALIDE_LIB} -) +ExternalProject_Add( + halide + SOURCE_DIR ${HALIDE_DIR} + PREFIX ${HALIDE_BUILD_DIR} + CMAKE_ARGS -DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER} + -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER} + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} + -DCMAKE_INSTALL_PREFIX=${HALIDE_BUILD_DIR} + -DWITH_APPS=OFF + -DWITH_TESTS=OFF + -DWITH_TUTORIALS=OFF + -DHALIDE_SHARED_LIBRARY=OFF + -DHALIDE_REQUIRE_LLVM_VERSION=${LLVM_VERSION_MAJOR}${LLVM_VERSION_MINOR} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DTARGET_MIPS=OFF + -DTARGET_POWERPC=OFF + BUILD_BYPRODUCTS ${HALIDE_LIB}) set(HALIDE_INC ${HALIDE_BUILD_DIR}/include) file(MAKE_DIRECTORY ${HALIDE_INC}) add_library(libhalide STATIC IMPORTED GLOBAL) add_dependencies(libhalide halide) -set_target_properties( - libhalide PROPERTIES - IMPORTED_LOCATION ${HALIDE_LIB} - INTERFACE_INCLUDE_DIRECTORIES ${HALIDE_INC} -) +set_target_properties(libhalide PROPERTIES IMPORTED_LOCATION ${HALIDE_LIB} + INTERFACE_INCLUDE_DIRECTORIES ${HALIDE_INC}) set(LLVM_COMPONENTS mcjit;bitwriter;linker;passes;X86;ARM;AArch64;Hexagon;NVPTX;AMDGPU) llvm_map_components_to_libnames(HALIDE_LLVM_LIBS ${LLVM_COMPONENTS}) - diff --git a/cmake/MKL_DNN.cmake b/cmake/MKL_DNN.cmake index bb85dc3a..ea2407a2 100644 --- a/cmake/MKL_DNN.cmake +++ b/cmake/MKL_DNN.cmake @@ -1,25 +1,31 @@ -if (MGE_USE_SYSTEM_LIB) - find_package(dnnl) - if (dnnl_FOUND) - message(STATUS "Using system provided MKL-DNN.") - set (MGE_USE_SYSTEM_MKLDNN ON) - return() - endif() +if(MGE_USE_SYSTEM_LIB) + find_package(dnnl) + if(dnnl_FOUND) + message(STATUS "Using system provided MKL-DNN.") + set(MGE_USE_SYSTEM_MKLDNN ON) + return() + endif() endif() option(DNNL_BUILD_TESTS "" OFF) option(DNNL_BUILD_EXAMPLES "" OFF) -# we do not want to use OMP now, so config to CPU mode -# if set to OMP, some dnnl algo will be more fast -set(DNNL_CPU_RUNTIME "SEQ" CACHE STRING "config dnnl to DNNL_RUNTIME_SEQ") +# we do not want to use OMP now, so config to CPU mode if set to OMP, some dnnl algo +# will be more fast +set(DNNL_CPU_RUNTIME + "SEQ" + CACHE STRING "config dnnl to DNNL_RUNTIME_SEQ") if(MGE_BLAS STREQUAL "MKL") - option(_DNNL_USE_MKL "" ON) - set(MKLROOT 
${MKL_ROOT_DIR} CACHE STRING "MKL ROOT FOR DNNL") - set(MKLLIB libmkl) + option(_DNNL_USE_MKL "" ON) + set(MKLROOT + ${MKL_ROOT_DIR} + CACHE STRING "MKL ROOT FOR DNNL") + set(MKLLIB libmkl) else() - option(_DNNL_USE_MKL "" OFF) + option(_DNNL_USE_MKL "" OFF) endif() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-parameter -Wno-extra") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter -Wno-extra") -set(DNNL_LIBRARY_TYPE STATIC CACHE STRING "config dnnl to STATIC") +set(DNNL_LIBRARY_TYPE + STATIC + CACHE STRING "config dnnl to STATIC") add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/intel-mkl-dnn) diff --git a/cmake/Modules/FindNumPy.cmake b/cmake/Modules/FindNumPy.cmake index 248f8c21..25c767f7 100644 --- a/cmake/Modules/FindNumPy.cmake +++ b/cmake/Modules/FindNumPy.cmake @@ -1,30 +1,28 @@ -# - Find the NumPy libraries -# This module finds if NumPy is installed, and sets the following variables -# indicating where it is. +# * Find the NumPy libraries This module finds if NumPy is installed, and sets the +# following variables indicating where it is. # # TODO: Update to provide the libraries and paths for linking npymath lib. # -# NUMPY_FOUND - was NumPy found -# NUMPY_VERSION - the version of NumPy found as a string -# NUMPY_VERSION_MAJOR - the major version number of NumPy -# NUMPY_VERSION_MINOR - the minor version number of NumPy -# NUMPY_VERSION_PATCH - the patch version number of NumPy -# NUMPY_VERSION_DECIMAL - e.g. version 1.6.1 is 10601 -# NUMPY_INCLUDE_DIR - path to the NumPy include files +# NUMPY_FOUND - was NumPy found NUMPY_VERSION - the version of +# NumPy found as a string NUMPY_VERSION_MAJOR - the major version number of NumPy +# NUMPY_VERSION_MINOR - the minor version number of NumPy NUMPY_VERSION_PATCH - +# the patch version number of NumPy NUMPY_VERSION_DECIMAL - e.g. 
version 1.6.1 is +# 10601 NUMPY_INCLUDE_DIR - path to the NumPy include files unset(NUMPY_VERSION) unset(NUMPY_INCLUDE_DIR) if(PYTHONINTERP_FOUND) - execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" - "import numpy as n; print(n.__version__); print(n.get_include());" + execute_process( + COMMAND "${PYTHON_EXECUTABLE}" "-c" + "import numpy as n; print(n.__version__); print(n.get_include());" RESULT_VARIABLE __result OUTPUT_VARIABLE __output OUTPUT_STRIP_TRAILING_WHITESPACE) if(__result MATCHES 0) string(REGEX REPLACE ";" "\\\\;" __values ${__output}) - string(REGEX REPLACE "\r?\n" ";" __values ${__values}) + string(REGEX REPLACE "\r?\n" ";" __values ${__values}) list(GET __values 0 NUMPY_VERSION) list(GET __values 1 NUMPY_INCLUDE_DIR) @@ -33,13 +31,18 @@ if(PYTHONINTERP_FOUND) set(NUMPY_VERSION_MAJOR ${CMAKE_MATCH_1}) set(NUMPY_VERSION_MINOR ${CMAKE_MATCH_2}) set(NUMPY_VERSION_PATCH ${CMAKE_MATCH_3}) - math(EXPR NUMPY_VERSION_DECIMAL - "(${NUMPY_VERSION_MAJOR} * 10000) + (${NUMPY_VERSION_MINOR} * 100) + ${NUMPY_VERSION_PATCH}") - string(REGEX REPLACE "\\\\" "/" NUMPY_INCLUDE_DIR ${NUMPY_INCLUDE_DIR}) + math( + EXPR + NUMPY_VERSION_DECIMAL + "(${NUMPY_VERSION_MAJOR} * 10000) + (${NUMPY_VERSION_MINOR} * 100) + ${NUMPY_VERSION_PATCH}" + ) + string(REGEX REPLACE "\\\\" "/" NUMPY_INCLUDE_DIR ${NUMPY_INCLUDE_DIR}) else() - unset(NUMPY_VERSION) - unset(NUMPY_INCLUDE_DIR) - message(STATUS "Requested NumPy version and include path, but got instead:\n${__output}\n") + unset(NUMPY_VERSION) + unset(NUMPY_INCLUDE_DIR) + message( + STATUS + "Requested NumPy version and include path, but got instead:\n${__output}\n") endif() endif() else() @@ -47,8 +50,10 @@ else() endif() include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(NumPy REQUIRED_VARS NUMPY_INCLUDE_DIR NUMPY_VERSION - VERSION_VAR NUMPY_VERSION) +find_package_handle_standard_args( + NumPy + REQUIRED_VARS NUMPY_INCLUDE_DIR NUMPY_VERSION + VERSION_VAR NUMPY_VERSION) if(NUMPY_FOUND) message(STATUS "NumPy ver. 
${NUMPY_VERSION} found (include: ${NUMPY_INCLUDE_DIR})") diff --git a/cmake/OpenBLAS.cmake b/cmake/OpenBLAS.cmake index 3216522f..1562f88a 100644 --- a/cmake/OpenBLAS.cmake +++ b/cmake/OpenBLAS.cmake @@ -1,48 +1,50 @@ -if (MGE_USE_SYSTEM_LIB) - find_package(OpenBLAS) - set (MGE_USE_SYSTEM_OPENBLAS ON) - - message(STATUS "Using system provided OpenBLAS ${OpenBLAS_VERSION}") - add_library(libopenblas IMPORTED GLOBAL) - set_target_properties( - libopenblas PROPERTIES - IMPORTED_LOCATION ${OpenBLAS_LIBRARIES} - INTERFACE_INCLUDE_DIRECTORIES ${OpenBLAS_INCLUDE_DIRS} - ) - return() +if(MGE_USE_SYSTEM_LIB) + find_package(OpenBLAS) + set(MGE_USE_SYSTEM_OPENBLAS ON) + + message(STATUS "Using system provided OpenBLAS ${OpenBLAS_VERSION}") + add_library(libopenblas IMPORTED GLOBAL) + set_target_properties( + libopenblas PROPERTIES IMPORTED_LOCATION ${OpenBLAS_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${OpenBLAS_INCLUDE_DIRS}) + return() endif() include(ExternalProject) include(GNUInstallDirs) -set(OPENBLAS_DIR "${PROJECT_SOURCE_DIR}/third_party/OpenBLAS" CACHE STRING "OpenBLAS directory") +set(OPENBLAS_DIR + "${PROJECT_SOURCE_DIR}/third_party/OpenBLAS" + CACHE STRING "OpenBLAS directory") set(OPENBLAS_BUILD_DIR ${PROJECT_BINARY_DIR}/third_party/OpenBLAS) set(OPENBLAS_INC ${OPENBLAS_BUILD_DIR}/include) set(OPENBLAS_LIB ${OPENBLAS_BUILD_DIR}/${CMAKE_INSTALL_LIBDIR}/libopenblas.a) if(${CMAKE_GENERATOR} STREQUAL "Ninja") - set(MAKE_COMMAND make) + set(MAKE_COMMAND make) else() - set(MAKE_COMMAND "$(MAKE)") + set(MAKE_COMMAND "$(MAKE)") endif() -ExternalProject_add( - openblas - SOURCE_DIR ${OPENBLAS_DIR} - PREFIX ${OPENBLAS_BUILD_DIR} - CMAKE_GENERATOR "Unix Makefiles" - CMAKE_ARGS -DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER} -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${OPENBLAS_BUILD_DIR} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} -DCMAKE_POSITION_INDEPENDENT_CODE=ON - BUILD_COMMAND ${MAKE_COMMAND} - BUILD_BYPRODUCTS ${OPENBLAS_LIB} ${OPENBLAS_PROTOC_EXECUTABLE} -) +ExternalProject_Add( + openblas + SOURCE_DIR ${OPENBLAS_DIR} + PREFIX ${OPENBLAS_BUILD_DIR} + CMAKE_GENERATOR "Unix Makefiles" + CMAKE_ARGS -DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER} + -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER} + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DCMAKE_INSTALL_PREFIX=${OPENBLAS_BUILD_DIR} + -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + BUILD_COMMAND ${MAKE_COMMAND} + BUILD_BYPRODUCTS ${OPENBLAS_LIB} ${OPENBLAS_PROTOC_EXECUTABLE}) file(MAKE_DIRECTORY ${OPENBLAS_INC}) add_library(libopenblas STATIC IMPORTED GLOBAL) add_dependencies(libopenblas openblas) set_target_properties( - libopenblas PROPERTIES - IMPORTED_LOCATION ${OPENBLAS_LIB} - INTERFACE_INCLUDE_DIRECTORIES ${OPENBLAS_BUILD_DIR}/include -) + libopenblas PROPERTIES IMPORTED_LOCATION ${OPENBLAS_LIB} + INTERFACE_INCLUDE_DIRECTORIES ${OPENBLAS_BUILD_DIR}/include) diff --git a/cmake/aclrt.cmake b/cmake/aclrt.cmake index d9d3d190..ab2201e5 100644 --- a/cmake/aclrt.cmake +++ b/cmake/aclrt.cmake @@ -1,31 +1,31 @@ -find_library(ACLRT_LIBRARY - NAMES libascendcl.so - PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{ACLRT_HOME}/lib64/stub" ${CMAKE_INSTALL_PREFIX} - HINTS ${ALTER_LIBRARY_PATHS} - PATH_SUFFIXES stub - DOC "ACL library." 
) +find_library( + ACLRT_LIBRARY + NAMES libascendcl.so + PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{ACLRT_HOME}/lib64/stub" ${CMAKE_INSTALL_PREFIX} + HINTS ${ALTER_LIBRARY_PATHS} + PATH_SUFFIXES stub + DOC "ACL library.") if(ACLRT_LIBRARY STREQUAL "ACLRT_LIBRARY-NOTFOUND") - message(FATAL_ERROR "Can not find ACLRT Library") + message(FATAL_ERROR "Can not find ACLRT Library") endif() get_filename_component(__found_aclrt_root "${ACLRT_LIBRARY}/../../../" REALPATH) -find_path(ACLRT_INCLUDE_DIR - NAMES acl/acl.h - HINTS "$ENV{ACLRT_HOME}/include" ${__found_aclrt_root} - PATH_SUFFIXES include - DOC "Path to ACLRT include directory." ) +find_path( + ACLRT_INCLUDE_DIR + NAMES acl/acl.h + HINTS "$ENV{ACLRT_HOME}/include" ${__found_aclrt_root} + PATH_SUFFIXES include + DOC "Path to ACLRT include directory.") if(ACLRT_INCLUDE_DIR STREQUAL "ACLRT_INCLUDE_DIR-NOTFOUND") - message(FATAL_ERROR "Can not find ACLRT Library") + message(FATAL_ERROR "Can not find ACLRT Library") endif() add_library(libascendcl SHARED IMPORTED) -set_target_properties(libascendcl PROPERTIES - IMPORTED_LOCATION ${ACLRT_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES ${ACLRT_INCLUDE_DIR} -) +set_target_properties( + libascendcl PROPERTIES IMPORTED_LOCATION ${ACLRT_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES ${ACLRT_INCLUDE_DIR}) message(STATUS "Found ACLRT: ${__found_aclrt_root}") - diff --git a/cmake/cndev.cmake b/cmake/cndev.cmake index 0b85297f..e9bde247 100644 --- a/cmake/cndev.cmake +++ b/cmake/cndev.cmake @@ -1,44 +1,57 @@ -find_library(CNDEV_LIBRARY - NAMES libcndev.so - PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX} - HINTS ${ALTER_LIBRARY_PATHS} - PATH_SUFFIXES lib lib64 - DOC "CNDEV library." ) +find_library( + CNDEV_LIBRARY + NAMES libcndev.so + PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX} + HINTS ${ALTER_LIBRARY_PATHS} + PATH_SUFFIXES lib lib64 + DOC "CNDEV library.") if(CNDEV_LIBRARY STREQUAL "CNDEV_LIBRARY-NOTFOUND") - message(FATAL_ERROR "Can not find CNDEV Library") + message(FATAL_ERROR "Can not find CNDEV Library") endif() get_filename_component(__found_cndev_root ${CNDEV_LIBRARY}/../.. REALPATH) -find_path(CNDEV_INCLUDE_DIR - NAMES cndev.h - HINTS "$ENV{NEUWARE_HOME}/include" ${__found_cndev_root} - PATH_SUFFIXES include - DOC "Path to CNDEV include directory." 
) +find_path( + CNDEV_INCLUDE_DIR + NAMES cndev.h + HINTS "$ENV{NEUWARE_HOME}/include" ${__found_cndev_root} + PATH_SUFFIXES include + DOC "Path to CNDEV include directory.") if(CNDEV_INCLUDE_DIR STREQUAL "CNDEV_INCLUDE_DIR-NOTFOUND") - message(FATAL_ERROR "Can not find CNDEV Library") + message(FATAL_ERROR "Can not find CNDEV Library") endif() -file(STRINGS "${CNDEV_INCLUDE_DIR}/cndev.h" CNDEV_1 REGEX "^#define CNDEV_VERSION_1 [0-9]+.*$") -file(STRINGS "${CNDEV_INCLUDE_DIR}/cndev.h" CNDEV_2 REGEX "^#define CNDEV_VERSION_2 [0-9]+.*$") -file(STRINGS "${CNDEV_INCLUDE_DIR}/cndev.h" CNDEV_3 REGEX "^#define CNDEV_VERSION_3 [0-9]+.*$") -file(STRINGS "${CNDEV_INCLUDE_DIR}/cndev.h" CNDEV_4 REGEX "^#define CNDEV_VERSION_4 [0-9]+.*$") -file(STRINGS "${CNDEV_INCLUDE_DIR}/cndev.h" CNDEV_5 REGEX "^#define CNDEV_VERSION_5 [0-9]+.*$") - -string(REGEX REPLACE "^#define CNDEV_VERSION_1 ([0-9]+).*$" "\\1" CNDEV_VERSION_1 "${CNDEV_1}") -string(REGEX REPLACE "^#define CNDEV_VERSION_2 ([0-9]+).*$" "\\1" CNDEV_VERSION_2 "${CNDEV_2}") -string(REGEX REPLACE "^#define CNDEV_VERSION_3 ([0-9]+).*$" "\\1" CNDEV_VERSION_3 "${CNDEV_3}") -string(REGEX REPLACE "^#define CNDEV_VERSION_4 ([0-9]+).*$" "\\1" CNDEV_VERSION_4 "${CNDEV_4}") -string(REGEX REPLACE "^#define CNDEV_VERSION_5 ([0-9]+).*$" "\\1" CNDEV_VERSION_5 "${CNDEV_5}") -set(CNDEV_VERSION_STRING "${CNDEV_VERSION_1}.${CNDEV_VERSION_2}.${CNDEV_VERSION_3}.${CNDEV_VERSION_4}.${CNDEV_VERSION_5}") +file(STRINGS "${CNDEV_INCLUDE_DIR}/cndev.h" CNDEV_1 + REGEX "^#define CNDEV_VERSION_1 [0-9]+.*$") +file(STRINGS "${CNDEV_INCLUDE_DIR}/cndev.h" CNDEV_2 + REGEX "^#define CNDEV_VERSION_2 [0-9]+.*$") +file(STRINGS "${CNDEV_INCLUDE_DIR}/cndev.h" CNDEV_3 + REGEX "^#define CNDEV_VERSION_3 [0-9]+.*$") +file(STRINGS "${CNDEV_INCLUDE_DIR}/cndev.h" CNDEV_4 + REGEX "^#define CNDEV_VERSION_4 [0-9]+.*$") +file(STRINGS "${CNDEV_INCLUDE_DIR}/cndev.h" CNDEV_5 + REGEX "^#define CNDEV_VERSION_5 [0-9]+.*$") + +string(REGEX REPLACE "^#define CNDEV_VERSION_1 ([0-9]+).*$" "\\1" CNDEV_VERSION_1 + "${CNDEV_1}") +string(REGEX REPLACE "^#define CNDEV_VERSION_2 ([0-9]+).*$" "\\1" CNDEV_VERSION_2 + "${CNDEV_2}") +string(REGEX REPLACE "^#define CNDEV_VERSION_3 ([0-9]+).*$" "\\1" CNDEV_VERSION_3 + "${CNDEV_3}") +string(REGEX REPLACE "^#define CNDEV_VERSION_4 ([0-9]+).*$" "\\1" CNDEV_VERSION_4 + "${CNDEV_4}") +string(REGEX REPLACE "^#define CNDEV_VERSION_5 ([0-9]+).*$" "\\1" CNDEV_VERSION_5 + "${CNDEV_5}") +set(CNDEV_VERSION_STRING + "${CNDEV_VERSION_1}.${CNDEV_VERSION_2}.${CNDEV_VERSION_3}.${CNDEV_VERSION_4}.${CNDEV_VERSION_5}" +) add_library(libcndev SHARED IMPORTED) -set_target_properties(libcndev PROPERTIES - IMPORTED_LOCATION ${CNDEV_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES ${CNDEV_INCLUDE_DIR} -) - -message(STATUS "Found CNDEV: ${__found_cndev_root} (found version: ${CNDEV_VERSION_STRING})") +set_target_properties( + libcndev PROPERTIES IMPORTED_LOCATION ${CNDEV_LIBRARY} INTERFACE_INCLUDE_DIRECTORIES + ${CNDEV_INCLUDE_DIR}) +message( + STATUS "Found CNDEV: ${__found_cndev_root} (found version: ${CNDEV_VERSION_STRING})") diff --git a/cmake/cnlight.cmake b/cmake/cnlight.cmake index 4c18c1d8..725d913d 100644 --- a/cmake/cnlight.cmake +++ b/cmake/cnlight.cmake @@ -1,40 +1,49 @@ -find_library(CNLIGHT_LIBRARY - NAMES libcnlight.so - PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX} - HINTS ${ALTER_LIBRARY_PATHS} - PATH_SUFFIXES lib lib64 - DOC "CNLIGHT library." 
) +find_library( + CNLIGHT_LIBRARY + NAMES libcnlight.so + PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX} + HINTS ${ALTER_LIBRARY_PATHS} + PATH_SUFFIXES lib lib64 + DOC "CNLIGHT library.") if(CNLIGHT_LIBRARY STREQUAL "CNLIGHT_LIBRARY-NOTFOUND") - message(FATAL_ERROR "Can not find CNLIGHT Library") + message(FATAL_ERROR "Can not find CNLIGHT Library") endif() get_filename_component(__found_cnlight_root "${CNLIGHT_LIBRARY}/../.." REALPATH) -find_path(CNLIGHT_INCLUDE_DIR - NAMES cnlight.h - HINTS "$ENV{NEUWARE_HOME}/include" ${__found_cnlight_root} - PATH_SUFFIXES include - DOC "Path to CNLIGHT include directory." ) +find_path( + CNLIGHT_INCLUDE_DIR + NAMES cnlight.h + HINTS "$ENV{NEUWARE_HOME}/include" ${__found_cnlight_root} + PATH_SUFFIXES include + DOC "Path to CNLIGHT include directory.") if(CNLIGHT_INCLUDE_DIR STREQUAL "CNLIGHT_INCLUDE_DIR-NOTFOUND") - message(FATAL_ERROR "Can not find CNLIGHT Library") + message(FATAL_ERROR "Can not find CNLIGHT Library") endif() -file(STRINGS "${CNLIGHT_INCLUDE_DIR}/cnlight.h" CNLIGHT_MAJOR REGEX "^#define CNLIGHT_MAJOR_VERSION [0-9]+.*$") -file(STRINGS "${CNLIGHT_INCLUDE_DIR}/cnlight.h" CNLIGHT_MINOR REGEX "^#define CNLIGHT_MINOR_VERSION [0-9]+.*$") -file(STRINGS "${CNLIGHT_INCLUDE_DIR}/cnlight.h" CNLIGHT_PATCH REGEX "^#define CNLIGHT_PATCH_VERSION [0-9]+.*$") - -string(REGEX REPLACE "^#define CNLIGHT_MAJOR_VERSION ([0-9]+).*$" "\\1" CNLIGHT_VERSION_MAJOR "${CNLIGHT_MAJOR}") -string(REGEX REPLACE "^#define CNLIGHT_MINOR_VERSION ([0-9]+).*$" "\\1" CNLIGHT_VERSION_MINOR "${CNLIGHT_MINOR}") -string(REGEX REPLACE "^#define CNLIGHT_PATCH_VERSION ([0-9]+).*$" "\\1" CNLIGHT_VERSION_PATCH "${CNLIGHT_PATCH}") -set(CNLIGHT_VERSION_STRING "${CNLIGHT_VERSION_MAJOR}.${CNLIGHT_VERSION_MINOR}.${CNLIGHT_VERSION_PATCH}") +file(STRINGS "${CNLIGHT_INCLUDE_DIR}/cnlight.h" CNLIGHT_MAJOR + REGEX "^#define CNLIGHT_MAJOR_VERSION [0-9]+.*$") +file(STRINGS "${CNLIGHT_INCLUDE_DIR}/cnlight.h" CNLIGHT_MINOR + REGEX "^#define CNLIGHT_MINOR_VERSION [0-9]+.*$") +file(STRINGS "${CNLIGHT_INCLUDE_DIR}/cnlight.h" CNLIGHT_PATCH + REGEX "^#define CNLIGHT_PATCH_VERSION [0-9]+.*$") + +string(REGEX REPLACE "^#define CNLIGHT_MAJOR_VERSION ([0-9]+).*$" "\\1" + CNLIGHT_VERSION_MAJOR "${CNLIGHT_MAJOR}") +string(REGEX REPLACE "^#define CNLIGHT_MINOR_VERSION ([0-9]+).*$" "\\1" + CNLIGHT_VERSION_MINOR "${CNLIGHT_MINOR}") +string(REGEX REPLACE "^#define CNLIGHT_PATCH_VERSION ([0-9]+).*$" "\\1" + CNLIGHT_VERSION_PATCH "${CNLIGHT_PATCH}") +set(CNLIGHT_VERSION_STRING + "${CNLIGHT_VERSION_MAJOR}.${CNLIGHT_VERSION_MINOR}.${CNLIGHT_VERSION_PATCH}") add_library(libcnlight SHARED IMPORTED) -set_target_properties(libcnlight PROPERTIES - IMPORTED_LOCATION ${CNLIGHT_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES ${CNLIGHT_INCLUDE_DIR} -) - -message(STATUS "Found CNLIGHT: ${__found_cnlight_root} (found version: ${CNLIGHT_VERSION_STRING})") +set_target_properties( + libcnlight PROPERTIES IMPORTED_LOCATION ${CNLIGHT_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES ${CNLIGHT_INCLUDE_DIR}) +message( + STATUS + "Found CNLIGHT: ${__found_cnlight_root} (found version: ${CNLIGHT_VERSION_STRING})") diff --git a/cmake/cnml.cmake b/cmake/cnml.cmake index 067572d1..7b0ed901 100644 --- a/cmake/cnml.cmake +++ b/cmake/cnml.cmake @@ -1,40 +1,48 @@ -find_library(CNML_LIBRARY - NAMES libcnml.so - PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX} - HINTS ${ALTER_LIBRARY_PATHS} - PATH_SUFFIXES lib lib64 - DOC "CNML library." 
) +find_library( + CNML_LIBRARY + NAMES libcnml.so + PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX} + HINTS ${ALTER_LIBRARY_PATHS} + PATH_SUFFIXES lib lib64 + DOC "CNML library.") if(CNML_LIBRARY STREQUAL "CNML_LIBRARY-NOTFOUND") - message(FATAL_ERROR "Can not find CNML Library") + message(FATAL_ERROR "Can not find CNML Library") endif() get_filename_component(__found_cnml_root "${CNML_LIBRARY}/../.." REALPATH) -find_path(CNML_INCLUDE_DIR - NAMES cnml.h - HINTS "$ENV{NEUWARE_HOME}/include" ${__found_cnml_root} - PATH_SUFFIXES include - DOC "Path to CNML include directory." ) +find_path( + CNML_INCLUDE_DIR + NAMES cnml.h + HINTS "$ENV{NEUWARE_HOME}/include" ${__found_cnml_root} + PATH_SUFFIXES include + DOC "Path to CNML include directory.") if(CNML_INCLUDE_DIR STREQUAL "CNML_INCLUDE_DIR-NOTFOUND") - message(FATAL_ERROR "Can not find CNML Library") + message(FATAL_ERROR "Can not find CNML Library") endif() -file(STRINGS "${CNML_INCLUDE_DIR}/cnml.h" CNML_MAJOR REGEX "^#define CNML_MAJOR_VERSION [0-9]+.*$") -file(STRINGS "${CNML_INCLUDE_DIR}/cnml.h" CNML_MINOR REGEX "^#define CNML_MINOR_VERSION [0-9]+.*$") -file(STRINGS "${CNML_INCLUDE_DIR}/cnml.h" CNML_PATCH REGEX "^#define CNML_PATCH_VERSION [0-9]+.*$") - -string(REGEX REPLACE "^#define CNML_MAJOR_VERSION ([0-9]+).*$" "\\1" CNML_VERSION_MAJOR "${CNML_MAJOR}") -string(REGEX REPLACE "^#define CNML_MINOR_VERSION ([0-9]+).*$" "\\1" CNML_VERSION_MINOR "${CNML_MINOR}") -string(REGEX REPLACE "^#define CNML_PATCH_VERSION ([0-9]+).*$" "\\1" CNML_VERSION_PATCH "${CNML_PATCH}") -set(CNML_VERSION_STRING "${CNML_VERSION_MAJOR}.${CNML_VERSION_MINOR}.${CNML_VERSION_PATCH}") +file(STRINGS "${CNML_INCLUDE_DIR}/cnml.h" CNML_MAJOR + REGEX "^#define CNML_MAJOR_VERSION [0-9]+.*$") +file(STRINGS "${CNML_INCLUDE_DIR}/cnml.h" CNML_MINOR + REGEX "^#define CNML_MINOR_VERSION [0-9]+.*$") +file(STRINGS "${CNML_INCLUDE_DIR}/cnml.h" CNML_PATCH + REGEX "^#define CNML_PATCH_VERSION [0-9]+.*$") + +string(REGEX REPLACE "^#define CNML_MAJOR_VERSION ([0-9]+).*$" "\\1" CNML_VERSION_MAJOR + "${CNML_MAJOR}") +string(REGEX REPLACE "^#define CNML_MINOR_VERSION ([0-9]+).*$" "\\1" CNML_VERSION_MINOR + "${CNML_MINOR}") +string(REGEX REPLACE "^#define CNML_PATCH_VERSION ([0-9]+).*$" "\\1" CNML_VERSION_PATCH + "${CNML_PATCH}") +set(CNML_VERSION_STRING + "${CNML_VERSION_MAJOR}.${CNML_VERSION_MINOR}.${CNML_VERSION_PATCH}") add_library(libcnml SHARED IMPORTED) -set_target_properties(libcnml PROPERTIES - IMPORTED_LOCATION ${CNML_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES ${CNML_INCLUDE_DIR} -) - -message(STATUS "Found CNML: ${__found_cnml_root} (found version: ${CNML_VERSION_STRING})") +set_target_properties( + libcnml PROPERTIES IMPORTED_LOCATION ${CNML_LIBRARY} INTERFACE_INCLUDE_DIRECTORIES + ${CNML_INCLUDE_DIR}) +message( + STATUS "Found CNML: ${__found_cnml_root} (found version: ${CNML_VERSION_STRING})") diff --git a/cmake/cnnl.cmake b/cmake/cnnl.cmake index 4d6cf973..6822adf8 100644 --- a/cmake/cnnl.cmake +++ b/cmake/cnnl.cmake @@ -1,80 +1,100 @@ -find_library(CNNL_LIBRARY - NAMES libcnnl.so - PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX} - HINTS ${ALTER_LIBRARY_PATHS} - PATH_SUFFIXES lib lib64 - DOC "CNNL library." 
) +find_library( + CNNL_LIBRARY + NAMES libcnnl.so + PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX} + HINTS ${ALTER_LIBRARY_PATHS} + PATH_SUFFIXES lib lib64 + DOC "CNNL library.") if(CNNL_LIBRARY STREQUAL "CNNL_LIBRARY-NOTFOUND") - message(FATAL_ERROR "Can not find CNNL Library") + message(FATAL_ERROR "Can not find CNNL Library") endif() get_filename_component(__found_cnnl_root "${CNNL_LIBRARY}/../.." REALPATH) -find_path(CNNL_INCLUDE_DIR - NAMES cnnl.h - HINTS "$ENV{NEUWARE_HOME}/include" ${__found_cnnl_root} - PATH_SUFFIXES include - DOC "Path to CNNL include directory." ) +find_path( + CNNL_INCLUDE_DIR + NAMES cnnl.h + HINTS "$ENV{NEUWARE_HOME}/include" ${__found_cnnl_root} + PATH_SUFFIXES include + DOC "Path to CNNL include directory.") if(CNNL_INCLUDE_DIR STREQUAL "CNNL_INCLUDE_DIR-NOTFOUND") - message(FATAL_ERROR "Can not find CNNL Library") + message(FATAL_ERROR "Can not find CNNL Library") endif() -file(STRINGS "${CNNL_INCLUDE_DIR}/cnnl.h" CNNL_MAJOR REGEX "^#define CNNL_MAJOR [0-9]+.*$") -file(STRINGS "${CNNL_INCLUDE_DIR}/cnnl.h" CNNL_MINOR REGEX "^#define CNNL_MINOR [0-9]+.*$") -file(STRINGS "${CNNL_INCLUDE_DIR}/cnnl.h" CNNL_PATCH REGEX "^#define CNNL_PATCHLEVEL [0-9]+.*$") - -string(REGEX REPLACE "^#define CNNL_MAJOR ([0-9]+).*$" "\\1" CNNL_VERSION_MAJOR "${CNNL_MAJOR}") -string(REGEX REPLACE "^#define CNNL_MINOR ([0-9]+).*$" "\\1" CNNL_VERSION_MINOR "${CNNL_MINOR}") -string(REGEX REPLACE "^#define CNNL_PATCHLEVEL ([0-9]+).*$" "\\1" CNNL_VERSION_PATCH "${CNNL_PATCH}") -set(CNNL_VERSION_STRING "${CNNL_VERSION_MAJOR}.${CNNL_VERSION_MINOR}.${CNNL_VERSION_PATCH}") +file(STRINGS "${CNNL_INCLUDE_DIR}/cnnl.h" CNNL_MAJOR + REGEX "^#define CNNL_MAJOR [0-9]+.*$") +file(STRINGS "${CNNL_INCLUDE_DIR}/cnnl.h" CNNL_MINOR + REGEX "^#define CNNL_MINOR [0-9]+.*$") +file(STRINGS "${CNNL_INCLUDE_DIR}/cnnl.h" CNNL_PATCH + REGEX "^#define CNNL_PATCHLEVEL [0-9]+.*$") + +string(REGEX REPLACE "^#define CNNL_MAJOR ([0-9]+).*$" "\\1" CNNL_VERSION_MAJOR + "${CNNL_MAJOR}") +string(REGEX REPLACE "^#define CNNL_MINOR ([0-9]+).*$" "\\1" CNNL_VERSION_MINOR + "${CNNL_MINOR}") +string(REGEX REPLACE "^#define CNNL_PATCHLEVEL ([0-9]+).*$" "\\1" CNNL_VERSION_PATCH + "${CNNL_PATCH}") +set(CNNL_VERSION_STRING + "${CNNL_VERSION_MAJOR}.${CNNL_VERSION_MINOR}.${CNNL_VERSION_PATCH}") add_library(libcnnl SHARED IMPORTED) -set_target_properties(libcnnl PROPERTIES - IMPORTED_LOCATION ${CNNL_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES ${CNNL_INCLUDE_DIR} -) +set_target_properties( + libcnnl PROPERTIES IMPORTED_LOCATION ${CNNL_LIBRARY} INTERFACE_INCLUDE_DIRECTORIES + ${CNNL_INCLUDE_DIR}) -message(STATUS "Found CNNL: ${__found_cnnl_root} (found version: ${CNNL_VERSION_STRING})") +message( + STATUS "Found CNNL: ${__found_cnnl_root} (found version: ${CNNL_VERSION_STRING})") -find_library(CNNL_EXTRA_LIBRARY - NAMES libcnnl_extra.so - PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX} - HINTS ${ALTER_LIBRARY_PATHS} - PATH_SUFFIXES lib lib64 - DOC "CNNL_EXTRA library." ) +find_library( + CNNL_EXTRA_LIBRARY + NAMES libcnnl_extra.so + PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX} + HINTS ${ALTER_LIBRARY_PATHS} + PATH_SUFFIXES lib lib64 + DOC "CNNL_EXTRA library.") if(CNNL_EXTRA_LIBRARY STREQUAL "CNNL_EXTRA_LIBRARY-NOTFOUND") - message(FATAL_ERROR "Can not find CNNL_EXTRA Library") + message(FATAL_ERROR "Can not find CNNL_EXTRA Library") endif() get_filename_component(__found_cnnl_extra_root "${CNNL_EXTRA_LIBRARY}/../.." 
REALPATH) -find_path(CNNL_EXTRA_INCLUDE_DIR - NAMES cnnl_extra.h - HINTS "$ENV{NEUWARE_HOME}/include" ${__found_cnnl_extra_root} - PATH_SUFFIXES include - DOC "Path to CNNL_EXTRA include directory." ) +find_path( + CNNL_EXTRA_INCLUDE_DIR + NAMES cnnl_extra.h + HINTS "$ENV{NEUWARE_HOME}/include" ${__found_cnnl_extra_root} + PATH_SUFFIXES include + DOC "Path to CNNL_EXTRA include directory.") if(CNNL_EXTRA_INCLUDE_DIR STREQUAL "CNNL_EXTRA_INCLUDE_DIR-NOTFOUND") - message(FATAL_ERROR "Can not find CNNL_EXTRA Library") + message(FATAL_ERROR "Can not find CNNL_EXTRA Library") endif() -file(STRINGS "${CNNL_EXTRA_INCLUDE_DIR}/cnnl_extra.h" CNNL_EXTRA_MAJOR REGEX "^#define CNNL_EXTRA_MAJOR [0-9]+.*$") -file(STRINGS "${CNNL_EXTRA_INCLUDE_DIR}/cnnl_extra.h" CNNL_EXTRA_MINOR REGEX "^#define CNNL_EXTRA_MINOR [0-9]+.*$") -file(STRINGS "${CNNL_EXTRA_INCLUDE_DIR}/cnnl_extra.h" CNNL_EXTRA_PATCH REGEX "^#define CNNL_EXTRA_PATCHLEVEL [0-9]+.*$") - -string(REGEX REPLACE "^#define CNNL_EXTRA_MAJOR ([0-9]+).*$" "\\1" CNNL_EXTRA_VERSION_MAJOR "${CNNL_EXTRA_MAJOR}") -string(REGEX REPLACE "^#define CNNL_EXTRA_MINOR ([0-9]+).*$" "\\1" CNNL_EXTRA_VERSION_MINOR "${CNNL_EXTRA_MINOR}") -string(REGEX REPLACE "^#define CNNL_EXTRA_PATCHLEVEL ([0-9]+).*$" "\\1" CNNL_EXTRA_VERSION_PATCH "${CNNL_EXTRA_PATCH}") -set(CNNL_EXTRA_VERSION_STRING "${CNNL_EXTRA_VERSION_MAJOR}.${CNNL_EXTRA_VERSION_MINOR}.${CNNL_EXTRA_VERSION_PATCH}") +file(STRINGS "${CNNL_EXTRA_INCLUDE_DIR}/cnnl_extra.h" CNNL_EXTRA_MAJOR + REGEX "^#define CNNL_EXTRA_MAJOR [0-9]+.*$") +file(STRINGS "${CNNL_EXTRA_INCLUDE_DIR}/cnnl_extra.h" CNNL_EXTRA_MINOR + REGEX "^#define CNNL_EXTRA_MINOR [0-9]+.*$") +file(STRINGS "${CNNL_EXTRA_INCLUDE_DIR}/cnnl_extra.h" CNNL_EXTRA_PATCH + REGEX "^#define CNNL_EXTRA_PATCHLEVEL [0-9]+.*$") + +string(REGEX REPLACE "^#define CNNL_EXTRA_MAJOR ([0-9]+).*$" "\\1" + CNNL_EXTRA_VERSION_MAJOR "${CNNL_EXTRA_MAJOR}") +string(REGEX REPLACE "^#define CNNL_EXTRA_MINOR ([0-9]+).*$" "\\1" + CNNL_EXTRA_VERSION_MINOR "${CNNL_EXTRA_MINOR}") +string(REGEX REPLACE "^#define CNNL_EXTRA_PATCHLEVEL ([0-9]+).*$" "\\1" + CNNL_EXTRA_VERSION_PATCH "${CNNL_EXTRA_PATCH}") +set(CNNL_EXTRA_VERSION_STRING + "${CNNL_EXTRA_VERSION_MAJOR}.${CNNL_EXTRA_VERSION_MINOR}.${CNNL_EXTRA_VERSION_PATCH}" +) add_library(libcnnl_extra SHARED IMPORTED) -set_target_properties(libcnnl_extra PROPERTIES - IMPORTED_LOCATION ${CNNL_EXTRA_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES ${CNNL_EXTRA_INCLUDE_DIR} -) - -message(STATUS "Found CNNL_EXTRA: ${__found_cnnl_extra_root} (found version: ${CNNL_EXTRA_VERSION_STRING})") +set_target_properties( + libcnnl_extra PROPERTIES IMPORTED_LOCATION ${CNNL_EXTRA_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES ${CNNL_EXTRA_INCLUDE_DIR}) +message( + STATUS + "Found CNNL_EXTRA: ${__found_cnnl_extra_root} (found version: ${CNNL_EXTRA_VERSION_STRING})" +) diff --git a/cmake/cnrt.cmake b/cmake/cnrt.cmake index fec07cce..c4b98756 100644 --- a/cmake/cnrt.cmake +++ b/cmake/cnrt.cmake @@ -1,40 +1,48 @@ -find_library(CNRT_LIBRARY - NAMES libcnrt.so - PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX} - HINTS ${ALTER_LIBRARY_PATHS} - PATH_SUFFIXES lib lib64 - DOC "CNRT library." 
) +find_library( + CNRT_LIBRARY + NAMES libcnrt.so + PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX} + HINTS ${ALTER_LIBRARY_PATHS} + PATH_SUFFIXES lib lib64 + DOC "CNRT library.") if(CNRT_LIBRARY STREQUAL "CNRT_LIBRARY-NOTFOUND") - message(FATAL_ERROR "Can not find CNRT Library") + message(FATAL_ERROR "Can not find CNRT Library") endif() get_filename_component(__found_cnrt_root ${CNRT_LIBRARY}/../../ REALPATH) -find_path(CNRT_INCLUDE_DIR - NAMES cnrt.h - HINTS "$ENV{NEUWARE_HOME}/include" ${__found_cnrt_root} - PATH_SUFFIXES include - DOC "Path to CNRT include directory." ) +find_path( + CNRT_INCLUDE_DIR + NAMES cnrt.h + HINTS "$ENV{NEUWARE_HOME}/include" ${__found_cnrt_root} + PATH_SUFFIXES include + DOC "Path to CNRT include directory.") if(CNRT_INCLUDE_DIR STREQUAL "CNRT_INCLUDE_DIR-NOTFOUND") - message(FATAL_ERROR "Can not find CNRT Library") + message(FATAL_ERROR "Can not find CNRT Library") endif() -file(STRINGS "${CNRT_INCLUDE_DIR}/cnrt.h" CNRT_MAJOR REGEX "^#define CNRT_MAJOR_VERSION [0-9]+.*$") -file(STRINGS "${CNRT_INCLUDE_DIR}/cnrt.h" CNRT_MINOR REGEX "^#define CNRT_MINOR_VERSION [0-9]+.*$") -file(STRINGS "${CNRT_INCLUDE_DIR}/cnrt.h" CNRT_PATCH REGEX "^#define CNRT_PATCH_VERSION [0-9]+.*$") - -string(REGEX REPLACE "^#define CNRT_MAJOR_VERSION ([0-9]+).*$" "\\1" CNRT_VERSION_MAJOR "${CNRT_MAJOR}") -string(REGEX REPLACE "^#define CNRT_MINOR_VERSION ([0-9]+).*$" "\\1" CNRT_VERSION_MINOR "${CNRT_MINOR}") -string(REGEX REPLACE "^#define CNRT_PATCH_VERSION ([0-9]+).*$" "\\1" CNRT_VERSION_PATCH "${CNRT_PATCH}") -set(CNRT_VERSION_STRING "${CNRT_VERSION_MAJOR}.${CNRT_VERSION_MINOR}.${CNRT_VERSION_PATCH}") +file(STRINGS "${CNRT_INCLUDE_DIR}/cnrt.h" CNRT_MAJOR + REGEX "^#define CNRT_MAJOR_VERSION [0-9]+.*$") +file(STRINGS "${CNRT_INCLUDE_DIR}/cnrt.h" CNRT_MINOR + REGEX "^#define CNRT_MINOR_VERSION [0-9]+.*$") +file(STRINGS "${CNRT_INCLUDE_DIR}/cnrt.h" CNRT_PATCH + REGEX "^#define CNRT_PATCH_VERSION [0-9]+.*$") + +string(REGEX REPLACE "^#define CNRT_MAJOR_VERSION ([0-9]+).*$" "\\1" CNRT_VERSION_MAJOR + "${CNRT_MAJOR}") +string(REGEX REPLACE "^#define CNRT_MINOR_VERSION ([0-9]+).*$" "\\1" CNRT_VERSION_MINOR + "${CNRT_MINOR}") +string(REGEX REPLACE "^#define CNRT_PATCH_VERSION ([0-9]+).*$" "\\1" CNRT_VERSION_PATCH + "${CNRT_PATCH}") +set(CNRT_VERSION_STRING + "${CNRT_VERSION_MAJOR}.${CNRT_VERSION_MINOR}.${CNRT_VERSION_PATCH}") add_library(libcnrt SHARED IMPORTED) -set_target_properties(libcnrt PROPERTIES - IMPORTED_LOCATION ${CNRT_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES ${CNRT_INCLUDE_DIR} -) - -message(STATUS "Found CNRT: ${__found_cnrt_root} (found version: ${CNRT_VERSION_STRING})") +set_target_properties( + libcnrt PROPERTIES IMPORTED_LOCATION ${CNRT_LIBRARY} INTERFACE_INCLUDE_DIRECTORIES + ${CNRT_INCLUDE_DIR}) +message( + STATUS "Found CNRT: ${__found_cnrt_root} (found version: ${CNRT_VERSION_STRING})") diff --git a/cmake/cpp_redis.cmake b/cmake/cpp_redis.cmake index d7b642e4..f4a88758 100644 --- a/cmake/cpp_redis.cmake +++ b/cmake/cpp_redis.cmake @@ -1,2 +1,5 @@ -file(GLOB_RECURSE CPP_REDIS_SRCS ${PROJECT_SOURCE_DIR}/third_party/cpp_redis/sources/*.cpp ${PROJECT_SOURCE_DIR}/third_party/tacopie/sources/*.cpp) -set(CPP_REDIS_INCLUDES ${PROJECT_SOURCE_DIR}/third_party/cpp_redis/includes ${PROJECT_SOURCE_DIR}/third_party/tacopie/includes) \ No newline at end of file +file(GLOB_RECURSE CPP_REDIS_SRCS + ${PROJECT_SOURCE_DIR}/third_party/cpp_redis/sources/*.cpp + ${PROJECT_SOURCE_DIR}/third_party/tacopie/sources/*.cpp) +set(CPP_REDIS_INCLUDES 
${PROJECT_SOURCE_DIR}/third_party/cpp_redis/includes + ${PROJECT_SOURCE_DIR}/third_party/tacopie/includes) diff --git a/cmake/cpuinfo.cmake b/cmake/cpuinfo.cmake index 97647e38..cf220e06 100644 --- a/cmake/cpuinfo.cmake +++ b/cmake/cpuinfo.cmake @@ -1,20 +1,20 @@ -if (MGE_USE_SYSTEM_LIB) - find_package(Cpuinfo) - message(STATUS "Using system provided cpuinfo ${cpuinfo_VERSION}") - add_library(libcpuinfo IMPORTED GLOBAL) - set_target_properties( - libcpuinfo PROPERTIES - IMPORTED_LOCATION ${cpuinfo_LIBRARIES} - INTERFACE_INCLUDE_DIRECTORIES ${cpuinfo_INCLUDE_DIRS} - ) - return() +if(MGE_USE_SYSTEM_LIB) + find_package(Cpuinfo) + message(STATUS "Using system provided cpuinfo ${cpuinfo_VERSION}") + add_library(libcpuinfo IMPORTED GLOBAL) + set_target_properties( + libcpuinfo PROPERTIES IMPORTED_LOCATION ${cpuinfo_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${cpuinfo_INCLUDE_DIRS}) + return() endif() -SET(CPUINFO_LIBRARY_TYPE "static" CACHE STRING "Type of cpuinfo library (shared, static, or default) to build") -OPTION(CPUINFO_BUILD_TOOLS "Build command-line tools" OFF) -OPTION(CPUINFO_BUILD_UNIT_TESTS "Build cpuinfo unit tests" OFF) -OPTION(CPUINFO_BUILD_MOCK_TESTS "Build cpuinfo mock tests" OFF) -OPTION(CPUINFO_BUILD_BENCHMARKS "Build cpuinfo micro-benchmarks" OFF) +set(CPUINFO_LIBRARY_TYPE + "static" + CACHE STRING "Type of cpuinfo library (shared, static, or default) to build") +option(CPUINFO_BUILD_TOOLS "Build command-line tools" OFF) +option(CPUINFO_BUILD_UNIT_TESTS "Build cpuinfo unit tests" OFF) +option(CPUINFO_BUILD_MOCK_TESTS "Build cpuinfo mock tests" OFF) +option(CPUINFO_BUILD_BENCHMARKS "Build cpuinfo micro-benchmarks" OFF) include_directories("${PROJECT_SOURCE_DIR}/third_party/cpuinfo/include") -add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/cpuinfo ${CMAKE_CURRENT_BINARY_DIR}/cpuinfo EXCLUDE_FROM_ALL) - +add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/cpuinfo + ${CMAKE_CURRENT_BINARY_DIR}/cpuinfo EXCLUDE_FROM_ALL) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index 9f262c50..d256b8fe 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -1,73 +1,83 @@ find_package(PkgConfig) if(${PkgConfig_FOUND}) - pkg_check_modules(PC_CUDNN QUIET CUDNN) + pkg_check_modules(PC_CUDNN QUIET CUDNN) endif() -if("${CUDNN_ROOT_DIR}" STREQUAL "" AND NOT "$ENV{CUDNN_ROOT_DIR}" STREQUAL "") - set(CUDNN_ROOT_DIR $ENV{CUDNN_ROOT_DIR}) +if("${CUDNN_ROOT_DIR}" STREQUAL "" AND NOT "$ENV{CUDNN_ROOT_DIR}" STREQUAL "") + set(CUDNN_ROOT_DIR $ENV{CUDNN_ROOT_DIR}) endif() if(MGE_CUDA_USE_STATIC AND NOT MGE_WITH_CUDNN_SHARED) - find_library(CUDNN_LIBRARY - NAMES libcudnn_static.a cudnn.lib - PATHS ${ALTER_LD_LIBRARY_PATHS} ${CUDNN_ROOT_DIR} ${PC_CUDNN_LIBRARY_DIRS} ${CMAKE_INSTALL_PREFIX} - HINTS ${ALTER_LIBRARY_PATHS} - PATH_SUFFIXES lib lib64 - DOC "CUDNN library." ) + find_library( + CUDNN_LIBRARY + NAMES libcudnn_static.a cudnn.lib + PATHS ${ALTER_LD_LIBRARY_PATHS} ${CUDNN_ROOT_DIR} ${PC_CUDNN_LIBRARY_DIRS} + ${CMAKE_INSTALL_PREFIX} + HINTS ${ALTER_LIBRARY_PATHS} + PATH_SUFFIXES lib lib64 + DOC "CUDNN library.") else() - find_library(CUDNN_LIBRARY - NAMES libcudnn.so libcudnn.dylib cudnn64.dll - PATHS ${ALTER_LD_LIBRARY_PATHS} ${CUDNN_ROOT_DIR} ${PC_CUDNN_LIBRARY_DIRS} ${CMAKE_INSTALL_PREFIX} - HINTS ${ALTER_LIBRARY_PATHS} - PATH_SUFFIXES lib lib64 - DOC "CUDNN library." 
) + find_library( + CUDNN_LIBRARY + NAMES libcudnn.so libcudnn.dylib cudnn64.dll + PATHS ${ALTER_LD_LIBRARY_PATHS} ${CUDNN_ROOT_DIR} ${PC_CUDNN_LIBRARY_DIRS} + ${CMAKE_INSTALL_PREFIX} + HINTS ${ALTER_LIBRARY_PATHS} + PATH_SUFFIXES lib lib64 + DOC "CUDNN library.") endif() if(CUDNN_LIBRARY STREQUAL "CUDNN_LIBRARY-NOTFOUND") - message(FATAL_ERROR "Can not find CuDNN Library, please refer to scripts/cmake-build/BUILD_README.md to init CUDNN env") + message( + FATAL_ERROR + "Can not find CuDNN Library, please refer to scripts/cmake-build/BUILD_README.md to init CUDNN env" + ) endif() get_filename_component(__found_cudnn_root ${CUDNN_LIBRARY}/../.. REALPATH) -find_path(CUDNN_INCLUDE_DIR - NAMES cudnn.h - HINTS $ENV{PC_CUDNN_INCLUDE_DIRS} ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_INCLUDE} ${__found_cudnn_root} - PATH_SUFFIXES include - DOC "Path to CUDNN include directory." ) +find_path( + CUDNN_INCLUDE_DIR + NAMES cudnn.h + HINTS $ENV{PC_CUDNN_INCLUDE_DIRS} ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_INCLUDE} + ${__found_cudnn_root} + PATH_SUFFIXES include + DOC "Path to CUDNN include directory.") if(CUDNN_INCLUDE_DIR STREQUAL "CUDNN_INCLUDE_DIR-NOTFOUND") - message(FATAL_ERROR "Can not find CuDNN INCLUDE, please refer to scripts/cmake-build/BUILD_README.md to init CUDNN env") + message( + FATAL_ERROR + "Can not find CuDNN INCLUDE, please refer to scripts/cmake-build/BUILD_README.md to init CUDNN env" + ) endif() if(EXISTS ${CUDNN_INCLUDE_DIR}/cudnn_version.h) - file(READ ${CUDNN_INCLUDE_DIR}/cudnn_version.h CUDNN_VERSION_FILE_CONTENTS) + file(READ ${CUDNN_INCLUDE_DIR}/cudnn_version.h CUDNN_VERSION_FILE_CONTENTS) else() - file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS) + file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS) endif() -string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)" - CUDNN_MAJOR_VERSION "${CUDNN_VERSION_FILE_CONTENTS}") -string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1" - CUDNN_MAJOR_VERSION "${CUDNN_MAJOR_VERSION}") -string(REGEX MATCH "define CUDNN_MINOR * +([0-9]+)" - CUDNN_MINOR_VERSION "${CUDNN_VERSION_FILE_CONTENTS}") -string(REGEX REPLACE "define CUDNN_MINOR * +([0-9]+)" "\\1" - CUDNN_MINOR_VERSION "${CUDNN_MINOR_VERSION}") -string(REGEX MATCH "define CUDNN_PATCHLEVEL * +([0-9]+)" - CUDNN_PATCH_VERSION "${CUDNN_VERSION_FILE_CONTENTS}") -string(REGEX REPLACE "define CUDNN_PATCHLEVEL * +([0-9]+)" "\\1" - CUDNN_PATCH_VERSION "${CUDNN_PATCH_VERSION}") +string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)" CUDNN_MAJOR_VERSION + "${CUDNN_VERSION_FILE_CONTENTS}") +string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1" CUDNN_MAJOR_VERSION + "${CUDNN_MAJOR_VERSION}") +string(REGEX MATCH "define CUDNN_MINOR * +([0-9]+)" CUDNN_MINOR_VERSION + "${CUDNN_VERSION_FILE_CONTENTS}") +string(REGEX REPLACE "define CUDNN_MINOR * +([0-9]+)" "\\1" CUDNN_MINOR_VERSION + "${CUDNN_MINOR_VERSION}") +string(REGEX MATCH "define CUDNN_PATCHLEVEL * +([0-9]+)" CUDNN_PATCH_VERSION + "${CUDNN_VERSION_FILE_CONTENTS}") +string(REGEX REPLACE "define CUDNN_PATCHLEVEL * +([0-9]+)" "\\1" CUDNN_PATCH_VERSION + "${CUDNN_PATCH_VERSION}") set(CUDNN_VERSION ${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}.${CUDNN_PATCH_VERSION}) - - if(MGE_CUDA_USE_STATIC) - add_library(libcudnn STATIC IMPORTED) + add_library(libcudnn STATIC IMPORTED) else() - add_library(libcudnn SHARED IMPORTED) + add_library(libcudnn SHARED IMPORTED) endif() -set_target_properties(libcudnn PROPERTIES - IMPORTED_LOCATION ${CUDNN_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES ${CUDNN_INCLUDE_DIR}) +set_target_properties( + 
libcudnn PROPERTIES IMPORTED_LOCATION ${CUDNN_LIBRARY} INTERFACE_INCLUDE_DIRECTORIES + ${CUDNN_INCLUDE_DIR}) message(STATUS "Found CuDNN: ${__found_cudnn_root} (found version: ${CUDNN_VERSION})") diff --git a/cmake/flatbuffers.cmake b/cmake/flatbuffers.cmake index 0f895930..61cb0c0f 100644 --- a/cmake/flatbuffers.cmake +++ b/cmake/flatbuffers.cmake @@ -1,27 +1,47 @@ -if (MGE_USE_SYSTEM_LIB) - find_package(Flatbuffers REQUIRED) - message(STATUS "Using system provided Flatbuffers ${Flatbuffers_VERSION}") - include(cmake/BuildFlatBuffers.cmake) - return() +if(MGE_USE_SYSTEM_LIB) + find_package(Flatbuffers REQUIRED) + message(STATUS "Using system provided Flatbuffers ${Flatbuffers_VERSION}") + include(cmake/BuildFlatBuffers.cmake) + return() endif() if(MSVC OR WIN32) - message(DEBUG "add flags flatc for clang-cl build") - set(FLATC_FLAGS "") - set(FLATC_FLAGS "${FLATC_FLAGS} -Wno-error=unknown-argument -Wno-error=c++98-compat -Wno-error=reserved-id-macro") - set(FLATC_FLAGS "${FLATC_FLAGS} -Wno-error=sign-conversion -Wno-error=exceptions -Wno-error=argument-outside-range") - set(FLATC_FLAGS "${FLATC_FLAGS} -Wno-error=delete-non-virtual-dtor -Wno-error=ignored-attributes -Wno-error=format") - set(FLATC_FLAGS "${FLATC_FLAGS} -Wno-error=sign-compare -Wno-error=unused-private-field -Wno-error=braced-scalar-init") - set(FLATC_FLAGS "${FLATC_FLAGS} -Wno-error=return-type-c-linkage -Wno-error=invalid-noreturn -Wno-error=c++98-compat-pedantic") - set(FLATC_FLAGS "${FLATC_FLAGS} -Wno-error=extra-semi-stmt -Wno-error=missing-prototypes -Wno-error=documentation-unknown-command") - set(FLATC_FLAGS "${FLATC_FLAGS} -Wno-error=missing-variable-declarations -Wno-error=nonportable-system-include-path") - set(FLATC_FLAGS "${FLATC_FLAGS} -Wno-error=exit-time-destructors -Wno-error=unused-macros -Wno-error=global-constructors") - set(FLATC_FLAGS "${FLATC_FLAGS} -Wno-error=switch-enum -Wno-error=missing-noreturn -Wno-error=float-equal") - if (${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL "11.0.0") - set(FLATC_FLAGS "${FLATC_FLAGS} -Wno-error=suggest-override -Wno-error=suggest-destructor-override") - endif() + message(DEBUG "add flags flatc for clang-cl build") + set(FLATC_FLAGS "") + set(FLATC_FLAGS + "${FLATC_FLAGS} -Wno-error=unknown-argument -Wno-error=c++98-compat -Wno-error=reserved-id-macro" + ) + set(FLATC_FLAGS + "${FLATC_FLAGS} -Wno-error=sign-conversion -Wno-error=exceptions -Wno-error=argument-outside-range" + ) + set(FLATC_FLAGS + "${FLATC_FLAGS} -Wno-error=delete-non-virtual-dtor -Wno-error=ignored-attributes -Wno-error=format" + ) + set(FLATC_FLAGS + "${FLATC_FLAGS} -Wno-error=sign-compare -Wno-error=unused-private-field -Wno-error=braced-scalar-init" + ) + set(FLATC_FLAGS + "${FLATC_FLAGS} -Wno-error=return-type-c-linkage -Wno-error=invalid-noreturn -Wno-error=c++98-compat-pedantic" + ) + set(FLATC_FLAGS + "${FLATC_FLAGS} -Wno-error=extra-semi-stmt -Wno-error=missing-prototypes -Wno-error=documentation-unknown-command" + ) + set(FLATC_FLAGS + "${FLATC_FLAGS} -Wno-error=missing-variable-declarations -Wno-error=nonportable-system-include-path" + ) + set(FLATC_FLAGS + "${FLATC_FLAGS} -Wno-error=exit-time-destructors -Wno-error=unused-macros -Wno-error=global-constructors" + ) + set(FLATC_FLAGS + "${FLATC_FLAGS} -Wno-error=switch-enum -Wno-error=missing-noreturn -Wno-error=float-equal" + ) + if(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL "11.0.0") + set(FLATC_FLAGS + "${FLATC_FLAGS} -Wno-error=suggest-override -Wno-error=suggest-destructor-override" + ) + endif() - set(CMAKE_C_FLAGS 
"${CMAKE_C_FLAGS} ${FLATC_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLATC_FLAGS}") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${FLATC_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLATC_FLAGS}") endif() option(FLATBUFFERS_BUILD_TESTS "" OFF) diff --git a/cmake/gflags.cmake b/cmake/gflags.cmake index 9dbb8035..a645ecdd 100644 --- a/cmake/gflags.cmake +++ b/cmake/gflags.cmake @@ -1 +1,2 @@ -add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/gflags ${CMAKE_CURRENT_BINARY_DIR}/gflags) \ No newline at end of file +add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/gflags + ${CMAKE_CURRENT_BINARY_DIR}/gflags) diff --git a/cmake/gtest.cmake b/cmake/gtest.cmake index d2be2f35..a3071f8f 100644 --- a/cmake/gtest.cmake +++ b/cmake/gtest.cmake @@ -1,2 +1,2 @@ -add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/gtest ${CMAKE_CURRENT_BINARY_DIR}/gtest EXCLUDE_FROM_ALL) - +add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/gtest + ${CMAKE_CURRENT_BINARY_DIR}/gtest EXCLUDE_FROM_ALL) diff --git a/cmake/llvm-project.cmake b/cmake/llvm-project.cmake index bbea20d7..ce2599ac 100644 --- a/cmake/llvm-project.cmake +++ b/cmake/llvm-project.cmake @@ -1,88 +1,136 @@ -# - Find the llvm/mlir libraries -# This module finds if llvm/mlir is installed, or build llvm/mlir from source. -# This module sets the following variables. +# * Find the llvm/mlir libraries This module finds if llvm/mlir is installed, or build +# llvm/mlir from source. This module sets the following variables. # -# MLIR_LLVM_INCLUDE_DIR - path to the LLVM/MLIR include files -# MLIR_LLVM_LIBS - path to the LLVM/MLIR libraries +# MLIR_LLVM_INCLUDE_DIR - path to the LLVM/MLIR include files MLIR_LLVM_LIBS - path +# to the LLVM/MLIR libraries # # This module define the following functions. # -# external_tablegen_library - created interface library which depends on tablegen outputs +# external_tablegen_library - created interface library which depends on tablegen +# outputs include(CMakeParseArguments) function(external_tablegen_library) - cmake_parse_arguments( - _RULE - "TESTONLY" - "NAME;TBLGEN" - "SRCS;INCLUDES;OUTS" - ${ARGN} - ) + cmake_parse_arguments(_RULE "TESTONLY" "NAME;TBLGEN" "SRCS;INCLUDES;OUTS" ${ARGN}) - if(_RULE_TESTONLY AND NOT MGE_WITH_TEST) - return() - endif() + if(_RULE_TESTONLY AND NOT MGE_WITH_TEST) + return() + endif() - set(_NAME ${_RULE_NAME}) + set(_NAME ${_RULE_NAME}) - set(LLVM_TARGET_DEFINITIONS ${_RULE_SRCS}) - set(_INCLUDE_DIRS ${_RULE_INCLUDES}) - list(TRANSFORM _INCLUDE_DIRS PREPEND "-I") - set(_OUTPUTS) - while(_RULE_OUTS) - list(GET _RULE_OUTS 0 _COMMAND) - list(REMOVE_AT _RULE_OUTS 0) - list(GET _RULE_OUTS 0 _FILE) - list(REMOVE_AT _RULE_OUTS 0) - tablegen(${_RULE_TBLGEN} ${_FILE} ${_COMMAND} ${_INCLUDE_DIRS}) - list(APPEND _OUTPUTS ${CMAKE_CURRENT_BINARY_DIR}/${_FILE}) - endwhile() - add_custom_target(${_NAME}_target DEPENDS ${_OUTPUTS}) + set(LLVM_TARGET_DEFINITIONS ${_RULE_SRCS}) + set(_INCLUDE_DIRS ${_RULE_INCLUDES}) + list(TRANSFORM _INCLUDE_DIRS PREPEND "-I") + set(_OUTPUTS) + while(_RULE_OUTS) + list(GET _RULE_OUTS 0 _COMMAND) + list(REMOVE_AT _RULE_OUTS 0) + list(GET _RULE_OUTS 0 _FILE) + list(REMOVE_AT _RULE_OUTS 0) + tablegen(${_RULE_TBLGEN} ${_FILE} ${_COMMAND} ${_INCLUDE_DIRS}) + list(APPEND _OUTPUTS ${CMAKE_CURRENT_BINARY_DIR}/${_FILE}) + endwhile() + add_custom_target(${_NAME}_target DEPENDS ${_OUTPUTS}) - add_library(${_NAME} INTERFACE) - add_dependencies(${_NAME} ${_NAME}_target) + add_library(${_NAME} INTERFACE) + add_dependencies(${_NAME} ${_NAME}_target) - 
target_include_directories(${_NAME} INTERFACE - "$") + target_include_directories(${_NAME} INTERFACE "$") - install(TARGETS ${_NAME} EXPORT ${MGE_EXPORT_TARGETS}) + install(TARGETS ${_NAME} EXPORT ${MGE_EXPORT_TARGETS}) endfunction() -set(LLVM_LIBS LLVMCore LLVMSupport LLVMX86CodeGen LLVMOrcJIT LLVMNVPTXCodeGen LLVMNVPTXDesc LLVMNVPTXInfo) -set(MLIR_CORE_LIBS MLIRAnalysis MLIRExecutionEngine MLIRIR MLIRParser MLIRPass MLIRSideEffectInterfaces MLIRTransforms) -set(MLIR_DIALECT_LIBS MLIRAsync MLIRAVX512 MLIRGPU MLIRLLVMAVX512 MLIRNVVMIR MLIROpenACC MLIRPDL MLIRPDLInterp MLIRQuant MLIRROCDLIR MLIRSDBM MLIRShape MLIRSPIRV MLIRStandardOpsTransforms MLIRTosa) -set(MLIR_CONVERSION_LIBS MLIRAffineToStandard MLIRAVX512ToLLVM MLIRGPUToGPURuntimeTransforms MLIRGPUToNVVMTransforms MLIRSCFToStandard) +set(LLVM_LIBS + LLVMCore + LLVMSupport + LLVMX86CodeGen + LLVMOrcJIT + LLVMNVPTXCodeGen + LLVMNVPTXDesc + LLVMNVPTXInfo) +set(MLIR_CORE_LIBS + MLIRAnalysis + MLIRExecutionEngine + MLIRIR + MLIRParser + MLIRPass + MLIRSideEffectInterfaces + MLIRTransforms) +set(MLIR_DIALECT_LIBS + MLIRAsync + MLIRAVX512 + MLIRGPU + MLIRLLVMAVX512 + MLIRNVVMIR + MLIROpenACC + MLIRPDL + MLIRPDLInterp + MLIRQuant + MLIRROCDLIR + MLIRSDBM + MLIRShape + MLIRSPIRV + MLIRStandardOpsTransforms + MLIRTosa) +set(MLIR_CONVERSION_LIBS + MLIRAffineToStandard MLIRAVX512ToLLVM MLIRGPUToGPURuntimeTransforms + MLIRGPUToNVVMTransforms MLIRSCFToStandard) set(MLIR_TRANSLATION_LIBS MLIRTargetLLVMIR MLIRTargetNVVMIR) -set(MLIR_LIBS ${MLIR_CORE_LIBS} ${MLIR_DIALECT_LIBS} ${MLIR_CONVERSION_LIBS} ${MLIR_TRANSLATION_LIBS}) +set(MLIR_LIBS ${MLIR_CORE_LIBS} ${MLIR_DIALECT_LIBS} ${MLIR_CONVERSION_LIBS} + ${MLIR_TRANSLATION_LIBS}) set(MLIR_LLVM_LIBS ${LLVM_LIBS} ${MLIR_LIBS}) function(add_mge_mlir_src_dep llvm_monorepo_path) - set(_CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}") - string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE) - if(NOT uppercase_CMAKE_BUILD_TYPE MATCHES "^(DEBUG|RELEASE|RELWITHDEBINFO|MINSIZEREL)$") - set(CMAKE_BUILD_TYPE "Debug") - endif() - set(_CMAKE_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) - set(BUILD_SHARED_LIBS OFF CACHE BOOL "" FORCE) + set(_CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}") + string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE) + if(NOT uppercase_CMAKE_BUILD_TYPE MATCHES + "^(DEBUG|RELEASE|RELWITHDEBINFO|MINSIZEREL)$") + set(CMAKE_BUILD_TYPE "Debug") + endif() + set(_CMAKE_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) + set(BUILD_SHARED_LIBS + OFF + CACHE BOOL "" FORCE) - add_subdirectory("${llvm_monorepo_path}/llvm" ${LLVM_BUILD_DIR} EXCLUDE_FROM_ALL) + add_subdirectory("${llvm_monorepo_path}/llvm" ${LLVM_BUILD_DIR} EXCLUDE_FROM_ALL) - # Reset CMAKE_BUILD_TYPE to its previous setting - set(CMAKE_BUILD_TYPE "${_CMAKE_BUILD_TYPE}" CACHE STRING "Build type" FORCE) - # Reset BUILD_SHARED_LIBS to its previous setting - set(BUILD_SHARED_LIBS ${_CMAKE_BUILD_SHARED_LIBS} CACHE BOOL "Build shared libraries" FORCE) + # Reset CMAKE_BUILD_TYPE to its previous setting + set(CMAKE_BUILD_TYPE + "${_CMAKE_BUILD_TYPE}" + CACHE STRING "Build type" FORCE) + # Reset BUILD_SHARED_LIBS to its previous setting + set(BUILD_SHARED_LIBS + ${_CMAKE_BUILD_SHARED_LIBS} + CACHE BOOL "Build shared libraries" FORCE) endfunction() # llvm build options -set(LLVM_INCLUDE_EXAMPLES OFF CACHE BOOL "" FORCE) -set(LLVM_INCLUDE_TESTS OFF CACHE BOOL "" FORCE) -set(LLVM_INCLUDE_DOCS OFF CACHE BOOL "" FORCE) -set(LLVM_ENABLE_BINDINGS OFF CACHE BOOL "" FORCE) -set(LLVM_INCLUDE_BENCHMARKS OFF CACHE BOOL "" FORCE) -set(LLVM_ENABLE_RTTI 
${MGE_ENABLE_RTTI} CACHE BOOL "" FORCE) -set(LLVM_TARGETS_TO_BUILD "X86;NVPTX;AArch64;ARM" CACHE STRING "" FORCE) -set(LLVM_ENABLE_PROJECTS "mlir" CACHE STRING "" FORCE) +set(LLVM_INCLUDE_EXAMPLES + OFF + CACHE BOOL "" FORCE) +set(LLVM_INCLUDE_TESTS + OFF + CACHE BOOL "" FORCE) +set(LLVM_INCLUDE_DOCS + OFF + CACHE BOOL "" FORCE) +set(LLVM_ENABLE_BINDINGS + OFF + CACHE BOOL "" FORCE) +set(LLVM_INCLUDE_BENCHMARKS + OFF + CACHE BOOL "" FORCE) +set(LLVM_ENABLE_RTTI + ${MGE_ENABLE_RTTI} + CACHE BOOL "" FORCE) +set(LLVM_TARGETS_TO_BUILD + "X86;NVPTX;AArch64;ARM" + CACHE STRING "" FORCE) +set(LLVM_ENABLE_PROJECTS + "mlir" + CACHE STRING "" FORCE) set(LLVM_BUILD_DIR ${PROJECT_BINARY_DIR}/third_party/llvm-project/llvm) add_mge_mlir_src_dep("third_party/llvm-project") @@ -91,6 +139,5 @@ set(MLIR_LLVM_INCLUDE_DIR ${PROJECT_SOURCE_DIR}/third_party/llvm-project/llvm/include ${PROJECT_BINARY_DIR}/third_party/llvm-project/llvm/include ${PROJECT_SOURCE_DIR}/third_party/llvm-project/mlir/include - ${PROJECT_BINARY_DIR}/third_party/llvm-project/llvm/tools/mlir/include - ) + ${PROJECT_BINARY_DIR}/third_party/llvm-project/llvm/tools/mlir/include) set(MLIR_TABLEGEN_EXE mlir-tblgen) diff --git a/cmake/magicmind.cmake b/cmake/magicmind.cmake index 0dd3d050..37ae170e 100644 --- a/cmake/magicmind.cmake +++ b/cmake/magicmind.cmake @@ -1,54 +1,64 @@ -find_library(MAGICMIND_LIBRARY - NAMES libmagicmind.so - PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX} - HINTS ${ALTER_LIBRARY_PATHS} - PATH_SUFFIXES lib lib64 - DOC "MAGICMIND library." ) +find_library( + MAGICMIND_LIBRARY + NAMES libmagicmind.so + PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX} + HINTS ${ALTER_LIBRARY_PATHS} + PATH_SUFFIXES lib lib64 + DOC "MAGICMIND library.") if(MAGICMIND_LIBRARY STREQUAL "MAGICMIND_LIBRARY-NOTFOUND") - message(FATAL_ERROR "Can not find MAGICMIND Library") + message(FATAL_ERROR "Can not find MAGICMIND Library") endif() get_filename_component(__found_magicmind_root "${MAGICMIND_LIBRARY}/../../" REALPATH) -find_path(MAGICMIND_INCLUDE_DIR - NAMES common.h - HINTS "$ENV{NEUWARE_HOME}/include" ${__found_magicmind_root} - PATH_SUFFIXES include - DOC "Path to MAGICMIND include directory." 
) +find_path( + MAGICMIND_INCLUDE_DIR + NAMES common.h + HINTS "$ENV{NEUWARE_HOME}/include" ${__found_magicmind_root} + PATH_SUFFIXES include + DOC "Path to MAGICMIND include directory.") if(MAGICMIND_INCLUDE_DIR STREQUAL "MAGICMIND_INCLUDE_DIR-NOTFOUND") - message(FATAL_ERROR "Can not find MAGICMIND Library") + message(FATAL_ERROR "Can not find MAGICMIND Library") endif() -file(STRINGS "${MAGICMIND_INCLUDE_DIR}/common.h" MAGICMIND_MAJOR REGEX "^#define MM_MAJOR_VERSION [0-9]+.*$") -file(STRINGS "${MAGICMIND_INCLUDE_DIR}/common.h" MAGICMIND_MINOR REGEX "^#define MM_MINOR_VERSION [0-9]+.*$") -file(STRINGS "${MAGICMIND_INCLUDE_DIR}/common.h" MAGICMIND_PATCH REGEX "^#define MM_PATCH_VERSION [0-9]+.*$") +file(STRINGS "${MAGICMIND_INCLUDE_DIR}/common.h" MAGICMIND_MAJOR + REGEX "^#define MM_MAJOR_VERSION [0-9]+.*$") +file(STRINGS "${MAGICMIND_INCLUDE_DIR}/common.h" MAGICMIND_MINOR + REGEX "^#define MM_MINOR_VERSION [0-9]+.*$") +file(STRINGS "${MAGICMIND_INCLUDE_DIR}/common.h" MAGICMIND_PATCH + REGEX "^#define MM_PATCH_VERSION [0-9]+.*$") -string(REGEX REPLACE "^#define MM_MAJOR_VERSION ([0-9]+).*$" "\\1" MAGICMIND_VERSION_MAJOR "${MAGICMIND_MAJOR}") -string(REGEX REPLACE "^#define MM_MINOR_VERSION ([0-9]+).*$" "\\1" MAGICMIND_VERSION_MINOR "${MAGICMIND_MINOR}") -string(REGEX REPLACE "^#define MM_PATCH_VERSION ([0-9]+).*$" "\\1" MAGICMIND_VERSION_PATCH "${MAGICMIND_PATCH}") -set(MAGICMIND_VERSION_STRING "${MAGICMIND_VERSION_MAJOR}.${MAGICMIND_VERSION_MINOR}.${MAGICMIND_VERSION_PATCH}") +string(REGEX REPLACE "^#define MM_MAJOR_VERSION ([0-9]+).*$" "\\1" + MAGICMIND_VERSION_MAJOR "${MAGICMIND_MAJOR}") +string(REGEX REPLACE "^#define MM_MINOR_VERSION ([0-9]+).*$" "\\1" + MAGICMIND_VERSION_MINOR "${MAGICMIND_MINOR}") +string(REGEX REPLACE "^#define MM_PATCH_VERSION ([0-9]+).*$" "\\1" + MAGICMIND_VERSION_PATCH "${MAGICMIND_PATCH}") +set(MAGICMIND_VERSION_STRING + "${MAGICMIND_VERSION_MAJOR}.${MAGICMIND_VERSION_MINOR}.${MAGICMIND_VERSION_PATCH}") add_library(libmagicmind SHARED IMPORTED) -set_target_properties(libmagicmind PROPERTIES - IMPORTED_LOCATION ${MAGICMIND_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES ${MAGICMIND_INCLUDE_DIR} -) +set_target_properties( + libmagicmind PROPERTIES IMPORTED_LOCATION ${MAGICMIND_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES ${MAGICMIND_INCLUDE_DIR}) -message(STATUS "Found MAGICMIND: ${__found_magicmind_root} (found version: ${MAGICMIND_VERSION_STRING})") +message( + STATUS + "Found MAGICMIND: ${__found_magicmind_root} (found version: ${MAGICMIND_VERSION_STRING})" +) -find_library(MAGICMIND_RUNTIME_LIBRARY - NAMES libmagicmind_runtime.so - PATHS "${__found_magicmind_root}/lib64" - ) +find_library( + MAGICMIND_RUNTIME_LIBRARY + NAMES libmagicmind_runtime.so + PATHS "${__found_magicmind_root}/lib64") if(MAGICMIND_RUNTIME_LIBRARY STREQUAL "MAGICMIND_RUNTIME_LIBRARY-NOTFOUND") - message(FATAL_ERROR "Can not find MAGICMIND_RUNTIME Library") + message(FATAL_ERROR "Can not find MAGICMIND_RUNTIME Library") else() - message(STATUS "Found MAGICMIND_RUNTIME: ${MAGICMIND_RUNTIME_LIBRARY}") + message(STATUS "Found MAGICMIND_RUNTIME: ${MAGICMIND_RUNTIME_LIBRARY}") endif() add_library(libmagicmind_runtime SHARED IMPORTED) -set_target_properties(libmagicmind_runtime PROPERTIES - IMPORTED_LOCATION ${MAGICMIND_RUNTIME_LIBRARY} -) +set_target_properties(libmagicmind_runtime PROPERTIES IMPORTED_LOCATION + ${MAGICMIND_RUNTIME_LIBRARY}) diff --git a/cmake/mkl.cmake b/cmake/mkl.cmake index 8315f583..cbb81c5c 100644 --- a/cmake/mkl.cmake +++ b/cmake/mkl.cmake @@ -1,77 +1,83 @@ 
-find_path(MKL_ROOT_DIR - include/mkl_cblas.h - PATHS - ${PROJECT_SOURCE_DIR}/third_party/mkl/${MGE_ARCH} - ${PROJECT_SOURCE_DIR}/third_party/mkl/${MGE_ARCH}/Library - ${PROJECT_SOURCE_DIR}/third_party/mkl/x86_32/Library - ${PROJECT_SOURCE_DIR}/third_party/mkl/x86_32 - $ENV{MKLDIR} - /opt/intel/mkl/*/ - /opt/intel/cmkl/*/ - /Library/Frameworks/Intel_MKL.framework/Versions/Current/lib/universal -) +find_path( + MKL_ROOT_DIR include/mkl_cblas.h + PATHS ${PROJECT_SOURCE_DIR}/third_party/mkl/${MGE_ARCH} + ${PROJECT_SOURCE_DIR}/third_party/mkl/${MGE_ARCH}/Library + ${PROJECT_SOURCE_DIR}/third_party/mkl/x86_32/Library + ${PROJECT_SOURCE_DIR}/third_party/mkl/x86_32 + $ENV{MKLDIR} + /opt/intel/mkl/*/ + /opt/intel/cmkl/*/ + /Library/Frameworks/Intel_MKL.framework/Versions/Current/lib/universal) if(${MKL_ROOT_DIR} STREQUAL "MKL_ROOT_DIR-NOTFOUND") - message(FATAL_ERROR "Can not find MKL") + message(FATAL_ERROR "Can not find MKL") endif() message(STATUS "Build with MKL in ${MKL_ROOT_DIR}") -find_path(MKL_INCLUDE_DIR - mkl_cblas.h - PATHS - ${MKL_ROOT_DIR}/include - ${INCLUDE_INSTALL_DIR} -) +find_path(MKL_INCLUDE_DIR mkl_cblas.h PATHS ${MKL_ROOT_DIR}/include + ${INCLUDE_INSTALL_DIR}) option(MGE_MKL_USE_STATIC "Build MegEngine with static MKL" ON) if(MGE_MKL_USE_STATIC) - find_library(MKL_CORE_LIBRARY - NAMES libmkl_core.a mkl_core.lib - PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) + find_library( + MKL_CORE_LIBRARY + NAMES libmkl_core.a mkl_core.lib + PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) - find_library(MKL_SEQUENTIAL_LIBRARY - NAMES libmkl_sequential.a mkl_sequential.lib - PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) + find_library( + MKL_SEQUENTIAL_LIBRARY + NAMES libmkl_sequential.a mkl_sequential.lib + PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) - if(${MGE_ARCH} STREQUAL "x86_64") - find_library(MKL_IPL_LIBRARY - NAMES libmkl_intel_ilp64.a mkl_intel_ilp64.lib - PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) - elseif(${MGE_ARCH} STREQUAL "i386") - find_library(MKL_IPL_LIBRARY - NAMES libmkl_intel_32.a mkl_intel_32.lib mkl_intel_c.lib - PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) - endif() + if(${MGE_ARCH} STREQUAL "x86_64") + find_library( + MKL_IPL_LIBRARY + NAMES libmkl_intel_ilp64.a mkl_intel_ilp64.lib + PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) + elseif(${MGE_ARCH} STREQUAL "i386") + find_library( + MKL_IPL_LIBRARY + NAMES libmkl_intel_32.a mkl_intel_32.lib mkl_intel_c.lib + PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) + endif() - add_library(libmkl INTERFACE IMPORTED) - if(UNIX AND NOT APPLE) - target_link_libraries(libmkl INTERFACE -Wl,--start-group ${MKL_CORE_LIBRARY} ${MKL_SEQUENTIAL_LIBRARY} ${MKL_IPL_LIBRARY} -Wl,--end-group) - else() - target_link_libraries(libmkl INTERFACE ${MKL_CORE_LIBRARY} ${MKL_SEQUENTIAL_LIBRARY} ${MKL_IPL_LIBRARY}) - endif() - target_include_directories(libmkl INTERFACE ${MKL_INCLUDE_DIR}) + add_library(libmkl INTERFACE IMPORTED) + if(UNIX AND NOT APPLE) + target_link_libraries( + libmkl INTERFACE -Wl,--start-group ${MKL_CORE_LIBRARY} ${MKL_SEQUENTIAL_LIBRARY} + ${MKL_IPL_LIBRARY} -Wl,--end-group) + else() + target_link_libraries(libmkl INTERFACE ${MKL_CORE_LIBRARY} + ${MKL_SEQUENTIAL_LIBRARY} ${MKL_IPL_LIBRARY}) + endif() + target_include_directories(libmkl INTERFACE ${MKL_INCLUDE_DIR}) else() - find_library(MKL_CORE_LIBRARY - NAMES libmkl_core.so libmkl_core.dylib - PATHS 
${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) + find_library( + MKL_CORE_LIBRARY + NAMES libmkl_core.so libmkl_core.dylib + PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) - find_library(MKL_SEQUENTIAL_LIBRARY - NAMES libmkl_sequential.so libmkl_sequential.dylib - PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) + find_library( + MKL_SEQUENTIAL_LIBRARY + NAMES libmkl_sequential.so libmkl_sequential.dylib + PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) - if(${MGE_ARCH} STREQUAL "x86_64") - find_library(MKL_IPL_LIBRARY - NAMES libmkl_intel_ilp64.so libmkl_intel_ilp64.dylib - PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) - elseif(${MGE_ARCH} STREQUAL "x86_32") - find_library(MKL_IPL_LIBRARY - NAMES libmkl_intel_32.so libmkl_intel_32.dylib - PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) - endif() - target_link_libraries(libmkl INTERFACE ${MKL_CORE_LIBRARY} ${MKL_SEQUENTIAL_LIBRARY} ${MKL_IPL_LIBRARY}) - target_include_directories(libmkl INTERFACE ${MKL_INCLUDE_DIR}) + if(${MGE_ARCH} STREQUAL "x86_64") + find_library( + MKL_IPL_LIBRARY + NAMES libmkl_intel_ilp64.so libmkl_intel_ilp64.dylib + PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) + elseif(${MGE_ARCH} STREQUAL "x86_32") + find_library( + MKL_IPL_LIBRARY + NAMES libmkl_intel_32.so libmkl_intel_32.dylib + PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) + endif() + target_link_libraries(libmkl INTERFACE ${MKL_CORE_LIBRARY} ${MKL_SEQUENTIAL_LIBRARY} + ${MKL_IPL_LIBRARY}) + target_include_directories(libmkl INTERFACE ${MKL_INCLUDE_DIR}) endif() if(${MGE_ARCH} STREQUAL "x86_64") - target_compile_definitions(libmkl INTERFACE -DMKL_ILP64) + target_compile_definitions(libmkl INTERFACE -DMKL_ILP64) endif() diff --git a/cmake/protobuf.cmake b/cmake/protobuf.cmake index 5802b25f..6ac0892d 100644 --- a/cmake/protobuf.cmake +++ b/cmake/protobuf.cmake @@ -1,70 +1,83 @@ function(PROTOBUF_GENERATE_CPP_WITH_ROOT SRCS HDRS ROOT_DIR) - if(NOT ARGN) - message(SEND_ERROR "Error: PROTOBUF_GENERATE_CPP_WITH_ROOT() called without any proto files") - return() - endif() + if(NOT ARGN) + message( + SEND_ERROR + "Error: PROTOBUF_GENERATE_CPP_WITH_ROOT() called without any proto files") + return() + endif() - set(${SRCS}) - set(${HDRS}) - foreach(FIL ${ARGN}) - set(ABS_FIL ${ROOT_DIR}/${FIL}) - get_filename_component(FIL_WE ${FIL} NAME_WE) - get_filename_component(FIL_DIR ${ABS_FIL} PATH) - file(RELATIVE_PATH REL_DIR ${ROOT_DIR} ${FIL_DIR}) + set(${SRCS}) + set(${HDRS}) + foreach(FIL ${ARGN}) + set(ABS_FIL ${ROOT_DIR}/${FIL}) + get_filename_component(FIL_WE ${FIL} NAME_WE) + get_filename_component(FIL_DIR ${ABS_FIL} PATH) + file(RELATIVE_PATH REL_DIR ${ROOT_DIR} ${FIL_DIR}) - list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc") - list(APPEND ${HDRS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h") + list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc") + list(APPEND ${HDRS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h") - add_custom_command( - OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc" - "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h" - COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} - ARGS --cpp_out ${CMAKE_CURRENT_BINARY_DIR} -I ${FIL_DIR} ${ABS_FIL} -I ${PROTOBUF_INCLUDE_DIRS} - DEPENDS ${ABS_FIL} libprotobuf - COMMENT "Running C++ protocol buffer compiler on ${FIL}" - VERBATIM) - endforeach() + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc" + "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h" + 
COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} ARGS --cpp_out ${CMAKE_CURRENT_BINARY_DIR} + -I ${FIL_DIR} ${ABS_FIL} -I ${PROTOBUF_INCLUDE_DIRS} + DEPENDS ${ABS_FIL} libprotobuf + COMMENT "Running C++ protocol buffer compiler on ${FIL}" + VERBATIM) + endforeach() - set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE) - set(${SRCS} ${${SRCS}} PARENT_SCOPE) - set(${HDRS} ${${HDRS}} PARENT_SCOPE) + set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE) + set(${SRCS} + ${${SRCS}} + PARENT_SCOPE) + set(${HDRS} + ${${HDRS}} + PARENT_SCOPE) endfunction() if(MGE_USE_SYSTEM_LIB) - find_package(Protobuf) - if(Protobuf_FOUND) - add_library(libprotobuf INTERFACE) - target_link_libraries(libprotobuf INTERFACE ${Protobuf_LIBRARIES}) - target_include_directories(libprotobuf INTERFACE ${Protobuf_INCLUDE_DIRS}) - get_filename_component(Protobuf_ROOT ${Protobuf_INCLUDE_DIR} DIRECTORY) - set(PROTOBUF_ROOT ${Protobuf_ROOT}) - set(PROTOBUF_PROTOC_EXECUTABLE ${Protobuf_PROTOC_EXECUTABLE}) - set(PROTOBUF_INCLUDE_DIRS ${Protobuf_INCLUDE_DIRS}) - return() - endif() + find_package(Protobuf) + if(Protobuf_FOUND) + add_library(libprotobuf INTERFACE) + target_link_libraries(libprotobuf INTERFACE ${Protobuf_LIBRARIES}) + target_include_directories(libprotobuf INTERFACE ${Protobuf_INCLUDE_DIRS}) + get_filename_component(Protobuf_ROOT ${Protobuf_INCLUDE_DIR} DIRECTORY) + set(PROTOBUF_ROOT ${Protobuf_ROOT}) + set(PROTOBUF_PROTOC_EXECUTABLE ${Protobuf_PROTOC_EXECUTABLE}) + set(PROTOBUF_INCLUDE_DIRS ${Protobuf_INCLUDE_DIRS}) + return() + endif() endif() - include(ExternalProject) include(GNUInstallDirs) -set(PROTOBUF_DIR "${PROJECT_SOURCE_DIR}/third_party/protobuf" CACHE STRING "protobuf directory") +set(PROTOBUF_DIR + "${PROJECT_SOURCE_DIR}/third_party/protobuf" + CACHE STRING "protobuf directory") set(PROTOBUF_BUILD_DIR ${PROJECT_BINARY_DIR}/third_party/protobuf) if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") - set(PROTOBUF_LIB ${PROTOBUF_BUILD_DIR}/${CMAKE_INSTALL_LIBDIR}/libprotobufd.a) + set(PROTOBUF_LIB ${PROTOBUF_BUILD_DIR}/${CMAKE_INSTALL_LIBDIR}/libprotobufd.a) else() - set(PROTOBUF_LIB ${PROTOBUF_BUILD_DIR}/${CMAKE_INSTALL_LIBDIR}/libprotobuf.a) + set(PROTOBUF_LIB ${PROTOBUF_BUILD_DIR}/${CMAKE_INSTALL_LIBDIR}/libprotobuf.a) endif() set(PROTOBUF_PROTOC_EXECUTABLE ${PROTOBUF_BUILD_DIR}/bin/protoc) -ExternalProject_add( - protobuf - SOURCE_DIR ${PROTOBUF_DIR}/cmake - PREFIX ${PROTOBUF_BUILD_DIR} - CMAKE_ARGS -DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER} -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${PROTOBUF_BUILD_DIR} -Dprotobuf_BUILD_EXAMPLES=OFF -Dprotobuf_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON - BUILD_BYPRODUCTS ${PROTOBUF_LIB} ${PROTOBUF_PROTOC_EXECUTABLE} -) +ExternalProject_Add( + protobuf + SOURCE_DIR ${PROTOBUF_DIR}/cmake + PREFIX ${PROTOBUF_BUILD_DIR} + CMAKE_ARGS -DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER} + -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER} + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DCMAKE_INSTALL_PREFIX=${PROTOBUF_BUILD_DIR} + -Dprotobuf_BUILD_EXAMPLES=OFF + -Dprotobuf_BUILD_TESTS=OFF + -DBUILD_SHARED_LIBS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + BUILD_BYPRODUCTS ${PROTOBUF_LIB} ${PROTOBUF_PROTOC_EXECUTABLE}) set(PROTOBUF_INC ${PROTOBUF_BUILD_DIR}/include) file(MAKE_DIRECTORY ${PROTOBUF_INC}) @@ -72,19 +85,14 @@ file(MAKE_DIRECTORY ${PROTOBUF_INC}) add_library(libprotobuf STATIC IMPORTED GLOBAL) 
add_dependencies(libprotobuf protobuf) set_target_properties( - libprotobuf PROPERTIES - IMPORTED_LOCATION ${PROTOBUF_LIB} - INTERFACE_INCLUDE_DIRECTORIES ${PROTOBUF_BUILD_DIR}/include -) + libprotobuf PROPERTIES IMPORTED_LOCATION ${PROTOBUF_LIB} + INTERFACE_INCLUDE_DIRECTORIES ${PROTOBUF_BUILD_DIR}/include) add_executable(protoc IMPORTED GLOBAL) add_dependencies(protoc protobuf) -set_target_properties( - protoc PROPERTIES - IMPORTED_LOCATION ${PROTOBUF_BUILD_DIR}/bin/protoc -) +set_target_properties(protoc PROPERTIES IMPORTED_LOCATION + ${PROTOBUF_BUILD_DIR}/bin/protoc) set(PROTOBUF_ROOT ${PROTOBUF_BUILD_DIR}) set(PROTOBUF_PROTOC_EXECUTABLE protoc) set(PROTOBUF_INCLUDE_DIRS ${PROTOBUF_BUILD_DIR}/include) - diff --git a/cmake/rocm.cmake b/cmake/rocm.cmake index 3bd5897a..b48f1d3c 100644 --- a/cmake/rocm.cmake +++ b/cmake/rocm.cmake @@ -1,28 +1,34 @@ if(NOT DEFINED HIP_PATH) - if(NOT DEFINED ENV{HIP_PATH}) - set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") - else() - set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") - endif() + if(NOT DEFINED ENV{HIP_PATH}) + set(HIP_PATH + "/opt/rocm/hip" + CACHE PATH "Path to which HIP has been installed") + else() + set(HIP_PATH + $ENV{HIP_PATH} + CACHE PATH "Path to which HIP has been installed") + endif() endif() set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) find_package(HIP QUIET) -if (HIP_FOUND) - message(STATUS "Found HIP: " ${HIP_VERSION}) +if(HIP_FOUND) + message(STATUS "Found HIP: " ${HIP_VERSION}) else() - message(FATAL_ERROR "Could not find HIP. Ensure that HIP is either installed in /opt/rocm/hip or the variable HIP_PATH is set to point to the right location.") + message( + FATAL_ERROR + "Could not find HIP. Ensure that HIP is either installed in /opt/rocm/hip or the variable HIP_PATH is set to point to the right location." + ) endif() -if (${HIP_VERSION} VERSION_LESS 3.0) - message(FATAL_ERROR "ROCM version needed 3. Please update ROCM.") +if(${HIP_VERSION} VERSION_LESS 3.0) + message(FATAL_ERROR "ROCM version needed 3. 
Please update ROCM.") endif() macro(hipconfig_get_option variable option) - if(NOT DEFINED ${variable}) - execute_process( - COMMAND ${HIP_HIPCONFIG_EXECUTABLE} ${option} - OUTPUT_VARIABLE ${variable}) - endif() + if(NOT DEFINED ${variable}) + execute_process(COMMAND ${HIP_HIPCONFIG_EXECUTABLE} ${option} + OUTPUT_VARIABLE ${variable}) + endif() endmacro() hipconfig_get_option(HIP_COMPILER "--compiler") @@ -31,30 +37,33 @@ hipconfig_get_option(HIP_CPP_CONFIG "--cpp_config") separate_arguments(HIP_CPP_CONFIG) foreach(hip_config_item ${HIP_CPP_CONFIG}) - foreach(macro_name "__HIP_PLATFORM_HCC__" "__HIP_ROCclr__") - if(${hip_config_item} STREQUAL "-D${macro_name}=") - set(HIP_CPP_DEFINE "${HIP_CPP_DEFINE}#define ${macro_name}\n") - set(HIP_CPP_UNDEFINE "${HIP_CPP_UNDEFINE}\ + foreach(macro_name "__HIP_PLATFORM_HCC__" "__HIP_ROCclr__") + if(${hip_config_item} STREQUAL "-D${macro_name}=") + set(HIP_CPP_DEFINE "${HIP_CPP_DEFINE}#define ${macro_name}\n") + set(HIP_CPP_UNDEFINE + "${HIP_CPP_UNDEFINE}\ #ifdef ${macro_name}\n#undef ${macro_name}\n\ #else\n#error\n\ #endif\n") - elseif(${hip_config_item} STREQUAL "-D${macro_name}") - set(HIP_CPP_DEFINE "${HIP_CPP_DEFINE}#define ${macro_name} 1\n") - set(HIP_CPP_UNDEFINE "${HIP_CPP_UNDEFINE}\ + elseif(${hip_config_item} STREQUAL "-D${macro_name}") + set(HIP_CPP_DEFINE "${HIP_CPP_DEFINE}#define ${macro_name} 1\n") + set(HIP_CPP_UNDEFINE + "${HIP_CPP_UNDEFINE}\ #ifdef ${macro_name}\n#undef ${macro_name}\n\ #else\n#error\n\ #endif\n") - endif() - endforeach() + endif() + endforeach() endforeach() message(STATUS "Using HIP compiler ${HIP_COMPILER}") if(${HIP_COMPILER} STREQUAL "hcc") - set(MGE_ROCM_LIBS hip_hcc) - message(WARNING "hcc is not well supported, please modify link.txt to link with hipcc") -elseif (${HIP_COMPILER} STREQUAL "clang") - set(MGE_ROCM_LIBS amdhip64) + set(MGE_ROCM_LIBS hip_hcc) + message( + WARNING "hcc is not well supported, please modify link.txt to link with hipcc") +elseif(${HIP_COMPILER} STREQUAL "clang") + set(MGE_ROCM_LIBS amdhip64) endif() list(APPEND MGE_ROCM_LIBS amdocl64 MIOpen rocblas rocrand) @@ -63,26 +72,28 @@ set(HIP_INCLUDE_DIR ${HIP_ROOT_DIR}/../include) set(HIP_LIBRARY_DIR ${HIP_ROOT_DIR}/../lib) function(find_rocm_library name dirname include library) - find_path(${name}_LIBRARY_DIR - NAMES ${library} - HINTS "${${name}_ROOT_DIR}" "${HIP_ROOT_DIR}/../${dirname}" - PATH_SUFFIXES lib lib/x86_64 - DOC "Path to ${name} library directory") + find_path( + ${name}_LIBRARY_DIR + NAMES ${library} + HINTS "${${name}_ROOT_DIR}" "${HIP_ROOT_DIR}/../${dirname}" + PATH_SUFFIXES lib lib/x86_64 + DOC "Path to ${name} library directory") - if(${${name}_LIBRARY_DIR} MATCHES "NOTFOUND$") - message(FATAL_ERROR "Can not find ${name} library") - endif() + if(${${name}_LIBRARY_DIR} MATCHES "NOTFOUND$") + message(FATAL_ERROR "Can not find ${name} library") + endif() - find_path(${name}_INCLUDE_DIR - NAMES ${include} - HINTS "${${name}_ROOT_DIR}" "${HIP_ROOT_DIR}/../${dirname}" - PATH_SUFFIXES include - DOC "Path to ${name} include directory") + find_path( + ${name}_INCLUDE_DIR + NAMES ${include} + HINTS "${${name}_ROOT_DIR}" "${HIP_ROOT_DIR}/../${dirname}" + PATH_SUFFIXES include + DOC "Path to ${name} include directory") - if(${name}_INCLUDE_DIR MATCHES "NOTFOUND$") - message(FATAL_ERROR "Can not find ${name} include") - endif() - message(DEBUG "Found lib ${${name}_LIBRARY_DIR}, include ${${name}_INCLUDE_DIR}") + if(${name}_INCLUDE_DIR MATCHES "NOTFOUND$") + message(FATAL_ERROR "Can not find ${name} include") + endif() + message(DEBUG 
"Found lib ${${name}_LIBRARY_DIR}, include ${${name}_INCLUDE_DIR}") endfunction() find_rocm_library(MIOPEN miopen miopen libMIOpen.so) diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake index 53f0f433..9bcba8d6 100644 --- a/cmake/tensorrt.cmake +++ b/cmake/tensorrt.cmake @@ -1,166 +1,189 @@ -if("${TRT_ROOT_DIR}" STREQUAL "" AND NOT "$ENV{TRT_ROOT_DIR}" STREQUAL "") - set(TRT_ROOT_DIR $ENV{TRT_ROOT_DIR}) +if("${TRT_ROOT_DIR}" STREQUAL "" AND NOT "$ENV{TRT_ROOT_DIR}" STREQUAL "") + set(TRT_ROOT_DIR $ENV{TRT_ROOT_DIR}) endif() if(MGE_CUDA_USE_STATIC) - find_library(TRT_LIBRARY - NAMES libnvinfer_static.a nvinfer.lib - PATHS ${ALTER_LD_LIBRARY_PATHS} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX} - HINTS ${ALTER_LIBRARY_PATHS} - PATH_SUFFIXES lib lib64 - DOC "TRT library." ) - find_library(TRT_PLUGIN_LIBRARY - NAMES libnvinfer_plugin_static.a nvinfer_plugin.lib - PATHS ${ALTER_LD_LIBRARY_PATHS} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX} - HINTS ${ALTER_LIBRARY_PATHS} - PATH_SUFFIXES lib lib64 - DOC "TRT plugin library." ) + find_library( + TRT_LIBRARY + NAMES libnvinfer_static.a nvinfer.lib + PATHS ${ALTER_LD_LIBRARY_PATHS} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX} + HINTS ${ALTER_LIBRARY_PATHS} + PATH_SUFFIXES lib lib64 + DOC "TRT library.") + find_library( + TRT_PLUGIN_LIBRARY + NAMES libnvinfer_plugin_static.a nvinfer_plugin.lib + PATHS ${ALTER_LD_LIBRARY_PATHS} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX} + HINTS ${ALTER_LIBRARY_PATHS} + PATH_SUFFIXES lib lib64 + DOC "TRT plugin library.") else() - find_library(TRT_LIBRARY - NAMES libnvinfer.so libnvinfer.dylib nvinfer.dll - PATHS ${ALTER_LD_LIBRARY_PATHS} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX} - HINTS ${ALTER_LIBRARY_PATHS} - PATH_SUFFIXES lib lib64 - DOC "TRT library." ) - find_library(TRT_PLUGIN_LIBRARY - NAMES libnvinfer_plugin.so libnvinfer_plugin.dylib nvinfer_plugin.dll - PATHS ${ALTER_LD_LIBRARY_PATHS} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX} - HINTS ${ALTER_LIBRARY_PATHS} - PATH_SUFFIXES lib lib64 - DOC "TRT plugin library." ) + find_library( + TRT_LIBRARY + NAMES libnvinfer.so libnvinfer.dylib nvinfer.dll + PATHS ${ALTER_LD_LIBRARY_PATHS} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX} + HINTS ${ALTER_LIBRARY_PATHS} + PATH_SUFFIXES lib lib64 + DOC "TRT library.") + find_library( + TRT_PLUGIN_LIBRARY + NAMES libnvinfer_plugin.so libnvinfer_plugin.dylib nvinfer_plugin.dll + PATHS ${ALTER_LD_LIBRARY_PATHS} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX} + HINTS ${ALTER_LIBRARY_PATHS} + PATH_SUFFIXES lib lib64 + DOC "TRT plugin library.") endif() if(TRT_LIBRARY STREQUAL "TRT_LIBRARY-NOTFOUND") - message(FATAL_ERROR "Can not find TensorRT Library, please refer to scripts/cmake-build/BUILD_README.md to init TRT env") + message( + FATAL_ERROR + "Can not find TensorRT Library, please refer to scripts/cmake-build/BUILD_README.md to init TRT env" + ) endif() if(TRT_PLUGIN_LIBRARY STREQUAL "TRT_PLUGIN_LIBRARY-NOTFOUND") - message(FATAL_ERROR "Can not find TensorRT Plugin Library, please refer to scripts/cmake-build/BUILD_README.md to init TRT env") + message( + FATAL_ERROR + "Can not find TensorRT Plugin Library, please refer to scripts/cmake-build/BUILD_README.md to init TRT env" + ) endif() get_filename_component(__found_trt_root ${TRT_LIBRARY}/../.. REALPATH) -find_path(TRT_INCLUDE_DIR - NAMES NvInfer.h - HINTS ${TRT_ROOT_DIR} ${CUDA_TOOLKIT_INCLUDE} ${__found_trt_root} - PATH_SUFFIXES include - DOC "Path to TRT include directory." 
) -find_path(TRT_PLUGIN_INCLUDE_DIR - NAMES NvInferPlugin.h - HINTS ${TRT_ROOT_DIR} ${CUDA_TOOLKIT_INCLUDE} ${__found_trt_root} - PATH_SUFFIXES include - DOC "Path to TRT plugin include directory." ) +find_path( + TRT_INCLUDE_DIR + NAMES NvInfer.h + HINTS ${TRT_ROOT_DIR} ${CUDA_TOOLKIT_INCLUDE} ${__found_trt_root} + PATH_SUFFIXES include + DOC "Path to TRT include directory.") +find_path( + TRT_PLUGIN_INCLUDE_DIR + NAMES NvInferPlugin.h + HINTS ${TRT_ROOT_DIR} ${CUDA_TOOLKIT_INCLUDE} ${__found_trt_root} + PATH_SUFFIXES include + DOC "Path to TRT plugin include directory.") if(TRT_INCLUDE_DIR STREQUAL "TRT_INCLUDE_DIR-NOTFOUND") - message(FATAL_ERROR "Can not find TensorRT INCLUDE, please refer to scripts/cmake-build/BUILD_README.md to init TRT env") + message( + FATAL_ERROR + "Can not find TensorRT INCLUDE, please refer to scripts/cmake-build/BUILD_README.md to init TRT env" + ) endif() if(TRT_PLUGIN_INCLUDE_DIR STREQUAL "TRT_PLUGIN_INCLUDE_DIR-NOTFOUND") - message(FATAL_ERROR "Can not find TensorRT Plugin INCLUDE, please refer to scripts/cmake-build/BUILD_README.md to init TRT env") + message( + FATAL_ERROR + "Can not find TensorRT Plugin INCLUDE, please refer to scripts/cmake-build/BUILD_README.md to init TRT env" + ) endif() -file(STRINGS "${TRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MAJOR REGEX "^#define NV_TENSORRT_MAJOR [0-9]+.*$") -file(STRINGS "${TRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MINOR REGEX "^#define NV_TENSORRT_MINOR [0-9]+.*$") -file(STRINGS "${TRT_INCLUDE_DIR}/NvInfer.h" TensorRT_PATCH REGEX "^#define NV_TENSORRT_PATCH [0-9]+.*$") +file(STRINGS "${TRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MAJOR + REGEX "^#define NV_TENSORRT_MAJOR [0-9]+.*$") +file(STRINGS "${TRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MINOR + REGEX "^#define NV_TENSORRT_MINOR [0-9]+.*$") +file(STRINGS "${TRT_INCLUDE_DIR}/NvInfer.h" TensorRT_PATCH + REGEX "^#define NV_TENSORRT_PATCH [0-9]+.*$") -if (TensorRT_MAJOR STREQUAL "") - file(STRINGS "${TRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_MAJOR REGEX "^#define NV_TENSORRT_MAJOR [0-9]+.*$") - file(STRINGS "${TRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_MINOR REGEX "^#define NV_TENSORRT_MINOR [0-9]+.*$") - file(STRINGS "${TRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_PATCH REGEX "^#define NV_TENSORRT_PATCH [0-9]+.*$") +if(TensorRT_MAJOR STREQUAL "") + file(STRINGS "${TRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_MAJOR + REGEX "^#define NV_TENSORRT_MAJOR [0-9]+.*$") + file(STRINGS "${TRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_MINOR + REGEX "^#define NV_TENSORRT_MINOR [0-9]+.*$") + file(STRINGS "${TRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_PATCH + REGEX "^#define NV_TENSORRT_PATCH [0-9]+.*$") endif() -string(REGEX REPLACE "^#define NV_TENSORRT_MAJOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MAJOR "${TensorRT_MAJOR}") -string(REGEX REPLACE "^#define NV_TENSORRT_MINOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MINOR "${TensorRT_MINOR}") -string(REGEX REPLACE "^#define NV_TENSORRT_PATCH ([0-9]+).*$" "\\1" TensorRT_VERSION_PATCH "${TensorRT_PATCH}") -set(TRT_VERSION_STRING "${TensorRT_VERSION_MAJOR}.${TensorRT_VERSION_MINOR}.${TensorRT_VERSION_PATCH}") +string(REGEX REPLACE "^#define NV_TENSORRT_MAJOR ([0-9]+).*$" "\\1" + TensorRT_VERSION_MAJOR "${TensorRT_MAJOR}") +string(REGEX REPLACE "^#define NV_TENSORRT_MINOR ([0-9]+).*$" "\\1" + TensorRT_VERSION_MINOR "${TensorRT_MINOR}") +string(REGEX REPLACE "^#define NV_TENSORRT_PATCH ([0-9]+).*$" "\\1" + TensorRT_VERSION_PATCH "${TensorRT_PATCH}") +set(TRT_VERSION_STRING + "${TensorRT_VERSION_MAJOR}.${TensorRT_VERSION_MINOR}.${TensorRT_VERSION_PATCH}") 
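# --- illustrative sketch (editor addition, not part of the patch): the hunk above only
# --- reflows the existing NV_TENSORRT_* parsing, so the extraction logic is unchanged.
# --- A minimal standalone version of that technique follows; run it with `cmake -P`.
# --- The sample header contents and scratch file path are hypothetical stand-ins for a
# --- real NvInfer.h / NvInferVersion.h.
set(_sample_header "${CMAKE_CURRENT_BINARY_DIR}/NvInferVersion_sample.h")
file(WRITE "${_sample_header}"
     "#define NV_TENSORRT_MAJOR 7\n#define NV_TENSORRT_MINOR 2\n#define NV_TENSORRT_PATCH 1\n")
# pick out the #define lines, then strip them down to the bare numbers
file(STRINGS "${_sample_header}" _major REGEX "^#define NV_TENSORRT_MAJOR [0-9]+.*$")
file(STRINGS "${_sample_header}" _minor REGEX "^#define NV_TENSORRT_MINOR [0-9]+.*$")
file(STRINGS "${_sample_header}" _patch REGEX "^#define NV_TENSORRT_PATCH [0-9]+.*$")
string(REGEX REPLACE "^#define NV_TENSORRT_MAJOR ([0-9]+).*$" "\\1" _major "${_major}")
string(REGEX REPLACE "^#define NV_TENSORRT_MINOR ([0-9]+).*$" "\\1" _minor "${_minor}")
string(REGEX REPLACE "^#define NV_TENSORRT_PATCH ([0-9]+).*$" "\\1" _patch "${_patch}")
message(STATUS "parsed sample TRT version: ${_major}.${_minor}.${_patch}") # 7.2.1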
if(MGE_CUDA_USE_STATIC) - add_library(libnvinfer STATIC IMPORTED) - add_library(libnvinfer_plugin STATIC IMPORTED) + add_library(libnvinfer STATIC IMPORTED) + add_library(libnvinfer_plugin STATIC IMPORTED) else() - add_library(libnvinfer SHARED IMPORTED) - add_library(libnvinfer_plugin SHARED IMPORTED) + add_library(libnvinfer SHARED IMPORTED) + add_library(libnvinfer_plugin SHARED IMPORTED) endif() -set_target_properties(libnvinfer PROPERTIES - IMPORTED_LOCATION ${TRT_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES ${TRT_INCLUDE_DIR} -) -set_target_properties(libnvinfer_plugin PROPERTIES - IMPORTED_LOCATION ${TRT_PLUGIN_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES ${TRT_PLUGIN_INCLUDE_DIR} -) +set_target_properties( + libnvinfer PROPERTIES IMPORTED_LOCATION ${TRT_LIBRARY} INTERFACE_INCLUDE_DIRECTORIES + ${TRT_INCLUDE_DIR}) +set_target_properties( + libnvinfer_plugin PROPERTIES IMPORTED_LOCATION ${TRT_PLUGIN_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES ${TRT_PLUGIN_INCLUDE_DIR}) -message(STATUS "Found TensorRT: ${__found_trt_root} (found version: ${TRT_VERSION_STRING})") +message( + STATUS "Found TensorRT: ${__found_trt_root} (found version: ${TRT_VERSION_STRING})") if(TensorRT_VERSION_MAJOR GREATER_EQUAL 7) - if(MGE_CUDA_USE_STATIC) - find_library(LIBMYELIN_COMPILER - NAMES libmyelin_compiler_static.a myelin_compiler_static.lib - PATHS ${__found_trt_root}/lib - ) - if(LIBMYELIN_COMPILER STREQUAL "LIBMYELIN_COMPILER-NOTFOUND") - message(FATAL_ERROR "Can not find LIBMYELIN_COMPILER Library") - else() - message(STATUS "Found TensorRT myelin_compiler: ${LIBMYELIN_COMPILER}") - endif() - add_library(libmyelin_compiler STATIC IMPORTED) - set_target_properties(libmyelin_compiler PROPERTIES - IMPORTED_LOCATION ${LIBMYELIN_COMPILER} - ) + if(MGE_CUDA_USE_STATIC) + find_library( + LIBMYELIN_COMPILER + NAMES libmyelin_compiler_static.a myelin_compiler_static.lib + PATHS ${__found_trt_root}/lib) + if(LIBMYELIN_COMPILER STREQUAL "LIBMYELIN_COMPILER-NOTFOUND") + message(FATAL_ERROR "Can not find LIBMYELIN_COMPILER Library") + else() + message(STATUS "Found TensorRT myelin_compiler: ${LIBMYELIN_COMPILER}") + endif() + add_library(libmyelin_compiler STATIC IMPORTED) + set_target_properties(libmyelin_compiler PROPERTIES IMPORTED_LOCATION + ${LIBMYELIN_COMPILER}) - find_library(LIBMYELIN_EXECUTOR - NAMES libmyelin_executor_static.a myelin_executor_static.lib - PATHS ${__found_trt_root}/lib - ) - if(LIBMYELIN_EXECUTOR STREQUAL "LIBMYELIN_EXECUTOR-NOTFOUND") - message(FATAL_ERROR "Can not find LIBMYELIN_EXECUTOR Library") - else() - message(STATUS "Found TensorRT libmyelin_executor: ${LIBMYELIN_EXECUTOR}") - endif() - add_library(libmyelin_executor STATIC IMPORTED) - set_target_properties(libmyelin_executor PROPERTIES - IMPORTED_LOCATION ${LIBMYELIN_EXECUTOR} - ) + find_library( + LIBMYELIN_EXECUTOR + NAMES libmyelin_executor_static.a myelin_executor_static.lib + PATHS ${__found_trt_root}/lib) + if(LIBMYELIN_EXECUTOR STREQUAL "LIBMYELIN_EXECUTOR-NOTFOUND") + message(FATAL_ERROR "Can not find LIBMYELIN_EXECUTOR Library") + else() + message(STATUS "Found TensorRT libmyelin_executor: ${LIBMYELIN_EXECUTOR}") + endif() + add_library(libmyelin_executor STATIC IMPORTED) + set_target_properties(libmyelin_executor PROPERTIES IMPORTED_LOCATION + ${LIBMYELIN_EXECUTOR}) - find_library(LIBMYELIN_PATTERN_RUNTIME - NAMES libmyelin_pattern_runtime_static.a myelin_pattern_runtime_static.lib - PATHS ${__found_trt_root}/lib - ) - if(LIBMYELIN_PATTERN_RUNTIME STREQUAL "LIBMYELIN_PATTERN_RUNTIME-NOTFOUND") - message(FATAL_ERROR "Can not find 
LIBMYELIN_PATTERN_RUNTIME Library") - else() - message(STATUS "Found TensorRT libmyelin_pattern_runtime: ${LIBMYELIN_PATTERN_RUNTIME}") - endif() - add_library(libmyelin_pattern_runtime STATIC IMPORTED) - set_target_properties(libmyelin_pattern_runtime PROPERTIES - IMPORTED_LOCATION ${LIBMYELIN_PATTERN_RUNTIME} - ) + find_library( + LIBMYELIN_PATTERN_RUNTIME + NAMES libmyelin_pattern_runtime_static.a myelin_pattern_runtime_static.lib + PATHS ${__found_trt_root}/lib) + if(LIBMYELIN_PATTERN_RUNTIME STREQUAL "LIBMYELIN_PATTERN_RUNTIME-NOTFOUND") + message(FATAL_ERROR "Can not find LIBMYELIN_PATTERN_RUNTIME Library") + else() + message( + STATUS "Found TensorRT libmyelin_pattern_runtime: ${LIBMYELIN_PATTERN_RUNTIME}") + endif() + add_library(libmyelin_pattern_runtime STATIC IMPORTED) + set_target_properties(libmyelin_pattern_runtime + PROPERTIES IMPORTED_LOCATION ${LIBMYELIN_PATTERN_RUNTIME}) - find_library(LIBMYELIN_PATTERN_LIBRARY - NAMES libmyelin_pattern_library_static.a myelin_pattern_library_static.lib - PATHS ${__found_trt_root}/lib - ) - if(LIBMYELIN_PATTERN_LIBRARY STREQUAL "LIBMYELIN_PATTERN_LIBRARY-NOTFOUND") - message(FATAL_ERROR "Can not find LIBMYELIN_PATTERN_LIBRARY Library") - else() - message(STATUS "Found TensorRT libmyelin_pattern_library: ${LIBMYELIN_PATTERN_LIBRARY}") - endif() - add_library(libmyelin_pattern_library STATIC IMPORTED) - set_target_properties(libmyelin_pattern_library PROPERTIES - IMPORTED_LOCATION ${LIBMYELIN_PATTERN_LIBRARY} - ) + find_library( + LIBMYELIN_PATTERN_LIBRARY + NAMES libmyelin_pattern_library_static.a myelin_pattern_library_static.lib + PATHS ${__found_trt_root}/lib) + if(LIBMYELIN_PATTERN_LIBRARY STREQUAL "LIBMYELIN_PATTERN_LIBRARY-NOTFOUND") + message(FATAL_ERROR "Can not find LIBMYELIN_PATTERN_LIBRARY Library") else() - find_library(LIBMYELIN_SHARED - NAMES libmyelin.so myelin.dll - PATHS ${__found_trt_root}/lib - ) + message( + STATUS "Found TensorRT libmyelin_pattern_library: ${LIBMYELIN_PATTERN_LIBRARY}") + endif() + add_library(libmyelin_pattern_library STATIC IMPORTED) + set_target_properties(libmyelin_pattern_library + PROPERTIES IMPORTED_LOCATION ${LIBMYELIN_PATTERN_LIBRARY}) + else() + find_library( + LIBMYELIN_SHARED + NAMES libmyelin.so myelin.dll + PATHS ${__found_trt_root}/lib) - if(LIBMYELIN_SHARED STREQUAL "LIBMYELIN_SHARED-NOTFOUND") - message(FATAL_ERROR "Can not find LIBMYELIN_SHARED Library") - else() - message(STATUS "Found TensorRT libmyelin_shared: ${LIBMYELIN_SHARED}") - endif() - add_library(libmyelin SHARED IMPORTED) - set_target_properties(libmyelin PROPERTIES - IMPORTED_LOCATION ${LIBMYELIN_SHARED} - ) + if(LIBMYELIN_SHARED STREQUAL "LIBMYELIN_SHARED-NOTFOUND") + message(FATAL_ERROR "Can not find LIBMYELIN_SHARED Library") + else() + message(STATUS "Found TensorRT libmyelin_shared: ${LIBMYELIN_SHARED}") endif() + add_library(libmyelin SHARED IMPORTED) + set_target_properties(libmyelin PROPERTIES IMPORTED_LOCATION ${LIBMYELIN_SHARED}) + endif() endif() diff --git a/cmake/zmq.cmake b/cmake/zmq.cmake index d4677553..71f40cd2 100644 --- a/cmake/zmq.cmake +++ b/cmake/zmq.cmake @@ -1,17 +1,26 @@ include(ExternalProject) include(GNUInstallDirs) -set(ZMQ_DIR ${PROJECT_SOURCE_DIR}/third_party/libzmq CACHE STRING "ZMQ directory") +set(ZMQ_DIR + ${PROJECT_SOURCE_DIR}/third_party/libzmq + CACHE STRING "ZMQ directory") set(ZMQ_BUILD_DIR ${PROJECT_BINARY_DIR}/third_party/libzmq) set(ZMQ_LIB ${ZMQ_BUILD_DIR}/${CMAKE_INSTALL_LIBDIR}/libzmq.a) -ExternalProject_add( - zmq - SOURCE_DIR ${ZMQ_DIR} - PREFIX ${ZMQ_BUILD_DIR} - 
CMAKE_ARGS -DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER} -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} -DCMAKE_INSTALL_PREFIX=${ZMQ_BUILD_DIR} -DWITH_PERF_TOOL=OFF -DZMQ_BUILD_TESTS=OFF -DENABLE_CPACK=OFF -DENABLE_CURVE=OFF - BUILD_BYPRODUCTS ${ZMQ_LIB} -) +ExternalProject_Add( + zmq + SOURCE_DIR ${ZMQ_DIR} + PREFIX ${ZMQ_BUILD_DIR} + CMAKE_ARGS -DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER} + -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER} + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} + -DCMAKE_INSTALL_PREFIX=${ZMQ_BUILD_DIR} + -DWITH_PERF_TOOL=OFF + -DZMQ_BUILD_TESTS=OFF + -DENABLE_CPACK=OFF + -DENABLE_CURVE=OFF + BUILD_BYPRODUCTS ${ZMQ_LIB}) set(ZMQ_INC ${ZMQ_BUILD_DIR}/include) include_directories(${ZMQ_INC}) @@ -19,8 +28,5 @@ file(MAKE_DIRECTORY ${ZMQ_INC}) add_library(libzmq STATIC IMPORTED GLOBAL) add_dependencies(libzmq zmq) -set_target_properties( - libzmq PROPERTIES - IMPORTED_LOCATION ${ZMQ_LIB} - INTERFACE_INCLUDE_DIRECTORIES ${ZMQ_INC} -) +set_target_properties(libzmq PROPERTIES IMPORTED_LOCATION ${ZMQ_LIB} + INTERFACE_INCLUDE_DIRECTORIES ${ZMQ_INC}) diff --git a/dnn/CMakeLists.txt b/dnn/CMakeLists.txt index 6270da98..dfa4a97a 100644 --- a/dnn/CMakeLists.txt +++ b/dnn/CMakeLists.txt @@ -4,66 +4,61 @@ set(OPR_PARAM_DEFS_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/scripts/gen_param_defs.py) set(OPR_PARAM_DEFS_OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/include/) file(MAKE_DIRECTORY ${OPR_PARAM_DEFS_OUT_DIR}/megdnn) add_custom_command( - OUTPUT - ${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_defs.h - ${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_json.h - COMMAND ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${OPR_PARAM_DEFS_SCRIPT} ${OPR_PARAM_DEFS_SRCS} - ${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_defs.h - COMMAND ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${OPR_PARAM_DEFS_SCRIPT} ${OPR_PARAM_DEFS_SRCS} - tmp_unuse.log --write-cppjson ${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_json.h - DEPENDS ${OPR_PARAM_DEFS_SRCS} ${OPR_PARAM_DEFS_SCRIPT} - VERBATIM -) - -list(APPEND OPR_PARAM_DEFS_OUTS - ${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_defs.h + OUTPUT ${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_defs.h + ${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_json.h + COMMAND ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${OPR_PARAM_DEFS_SCRIPT} + ${OPR_PARAM_DEFS_SRCS} ${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_defs.h + COMMAND + ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${OPR_PARAM_DEFS_SCRIPT} + ${OPR_PARAM_DEFS_SRCS} tmp_unuse.log --write-cppjson ${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_json.h -) + DEPENDS ${OPR_PARAM_DEFS_SRCS} ${OPR_PARAM_DEFS_SCRIPT} + VERBATIM) + +list(APPEND OPR_PARAM_DEFS_OUTS ${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_defs.h + ${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_json.h) list(APPEND OPR_PARAM_DEFS_INC ${OPR_PARAM_DEFS_OUT_DIR}) set(OPR_PARAM_DEFS_OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}) file(MAKE_DIRECTORY ${OPR_PARAM_DEFS_OUT_DIR}/src/common) add_custom_command( - OUTPUT - ${OPR_PARAM_DEFS_OUT_DIR}/src/common/opr_param_defs_enumv.cuh - COMMAND ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${OPR_PARAM_DEFS_SCRIPT} - --enumv ${OPR_PARAM_DEFS_SRCS} - ${OPR_PARAM_DEFS_OUT_DIR}/src/common/opr_param_defs_enumv.cuh - DEPENDS ${OPR_PARAM_DEFS_SRCS} ${OPR_PARAM_DEFS_SCRIPT} - VERBATIM -) + OUTPUT ${OPR_PARAM_DEFS_OUT_DIR}/src/common/opr_param_defs_enumv.cuh + COMMAND + ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${OPR_PARAM_DEFS_SCRIPT} --enumv + 
${OPR_PARAM_DEFS_SRCS} ${OPR_PARAM_DEFS_OUT_DIR}/src/common/opr_param_defs_enumv.cuh + DEPENDS ${OPR_PARAM_DEFS_SRCS} ${OPR_PARAM_DEFS_SCRIPT} + VERBATIM) list(APPEND OPR_PARAM_DEFS_OUTS - ${OPR_PARAM_DEFS_OUT_DIR}/src/common/opr_param_defs_enumv.cuh -) + ${OPR_PARAM_DEFS_OUT_DIR}/src/common/opr_param_defs_enumv.cuh) list(APPEND OPR_PARAM_DEFS_INC ${OPR_PARAM_DEFS_OUT_DIR}) -install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include/megdnn DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} FILES_MATCHING PATTERN "*.h") +install( + DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include/megdnn + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + FILES_MATCHING + PATTERN "*.h") add_custom_target(_opr_param_defs DEPENDS ${OPR_PARAM_DEFS_OUTS}) add_library(opr_param_defs INTERFACE) target_include_directories(opr_param_defs - INTERFACE - $ -) -foreach (INCPATH IN LISTS OPR_PARAM_DEFS_INC) - target_include_directories(opr_param_defs - INTERFACE $ - ) + INTERFACE $) +foreach(INCPATH IN LISTS OPR_PARAM_DEFS_INC) + target_include_directories(opr_param_defs INTERFACE $) endforeach() add_dependencies(opr_param_defs _opr_param_defs) install(TARGETS opr_param_defs EXPORT ${MGE_EXPORT_TARGETS}) if(MGE_WITH_CUDA) - add_library(cutlass INTERFACE) - target_include_directories(cutlass - INTERFACE - $) + add_library(cutlass INTERFACE) + target_include_directories( + cutlass + INTERFACE $) endif() if(MGE_WITH_TEST) - add_subdirectory(test) + add_subdirectory(test) endif() add_subdirectory(src) diff --git a/dnn/atlas-stub/CMakeLists.txt b/dnn/atlas-stub/CMakeLists.txt index f6bffb1a..7be656a4 100644 --- a/dnn/atlas-stub/CMakeLists.txt +++ b/dnn/atlas-stub/CMakeLists.txt @@ -1,6 +1,8 @@ add_library(atlas-stub STATIC src/libatlas-wrap.cpp) -target_include_directories(atlas-stub PUBLIC $) +target_include_directories( + atlas-stub PUBLIC $) install(TARGETS atlas-stub EXPORT ${MGE_EXPORT_TARGETS}) add_library(acl-cblas STATIC src/libacl_cblas-wrap.cpp) -target_include_directories(acl-cblas PUBLIC $) +target_include_directories( + acl-cblas PUBLIC $) diff --git a/dnn/cuda-stub/CMakeLists.txt b/dnn/cuda-stub/CMakeLists.txt index e89dddbd..8d71e835 100644 --- a/dnn/cuda-stub/CMakeLists.txt +++ b/dnn/cuda-stub/CMakeLists.txt @@ -1,26 +1,27 @@ -file (GLOB_RECURSE CUDA_STUB src/libcuda.cpp) -file (GLOB_RECURSE NVRTC_STUB src/libnvrtc.cpp) +file(GLOB_RECURSE CUDA_STUB src/libcuda.cpp) +file(GLOB_RECURSE NVRTC_STUB src/libnvrtc.cpp) if(MGE_WITH_CUDA_STUB) - list(APPEND STUB_SRC ${CUDA_STUB}) + list(APPEND STUB_SRC ${CUDA_STUB}) endif() if(MGE_WITH_NVRTC_STUB) - list(APPEND STUB_SRC ${NVRTC_STUB}) + list(APPEND STUB_SRC ${NVRTC_STUB}) endif() if(MSVC OR WIN32) - add_library (cuda-stub STATIC ${STUB_SRC}) + add_library(cuda-stub STATIC ${STUB_SRC}) else() - add_library (cuda-stub SHARED ${STUB_SRC}) + add_library(cuda-stub SHARED ${STUB_SRC}) endif() set_target_properties(cuda-stub PROPERTIES OUTPUT_NAME cuda_stub) target_compile_definitions(cuda-stub PRIVATE __CUDA_API_VERSION_INTERNAL) -if (MSVC OR WIN32) - target_link_libraries(cuda-stub PRIVATE -Wl,--no-undefined) +if(MSVC OR WIN32) + target_link_libraries(cuda-stub PRIVATE -Wl,--no-undefined) else() - target_link_libraries(cuda-stub PRIVATE dl -Wl,--no-undefined) + target_link_libraries(cuda-stub PRIVATE dl -Wl,--no-undefined) endif() -target_include_directories(cuda-stub PRIVATE $) -install (TARGETS cuda-stub EXPORT ${MGE_EXPORT_TARGETS}) +target_include_directories(cuda-stub + PRIVATE $) +install(TARGETS cuda-stub EXPORT ${MGE_EXPORT_TARGETS}) diff --git a/dnn/include/megdnn/common.h 
b/dnn/include/megdnn/common.h index f0073c59..09cb70a0 100644 --- a/dnn/include/megdnn/common.h +++ b/dnn/include/megdnn/common.h @@ -12,6 +12,7 @@ #pragma once #include "megbrain_build_config.h" +#include "megdnn/oprs/base.h" #if MGB_ENABLE_GETENV #define MGB_GETENV ::std::getenv @@ -36,6 +37,11 @@ bool has_available_algo(Opr* opr, Args&&... args) { return !all_algos.empty(); } +template +bool has_no_naive_heuristic_algo(Opr* opr, Args&&... args) { + auto&& algo = opr->get_algorithm_info_heuristic(std::forward(args)...); + return !static_cast(algo.attribute & detail::Algorithm::Attribute::NAIVE); +} } // namespace megdnn // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/include/megdnn/oprs/nn.h b/dnn/include/megdnn/oprs/nn.h index e5ea399c..7188d941 100644 --- a/dnn/include/megdnn/oprs/nn.h +++ b/dnn/include/megdnn/oprs/nn.h @@ -1936,6 +1936,119 @@ protected: const TensorLayout& grad_s, size_t workspace_in_bytes); }; +class LayerNormBase : public OperatorBase { + DEF_OPR_IMPL_CTOR(LayerNormBase, OperatorBase); + DEF_OPR_PARAM(LayerNorm); + +protected: + void deduce_layout_fwd( + const TensorLayout& data, const TensorLayout& weight, + const TensorLayout& bias, TensorLayout& dst, TensorLayout& mean, + TensorLayout& rstd); + void check_layout_fwd( + const TensorLayout& data, const TensorLayout& weight, + const TensorLayout& bias, const TensorLayout& dst, const TensorLayout& mean, + const TensorLayout& rstd); +}; + +class LayerNormForward : public LayerNormBase { + DEF_OPR_IMPL(LayerNormForward, LayerNormBase, 3, 3); + +public: + virtual void exec( + _megdnn_tensor_in data, _megdnn_tensor_in weight, _megdnn_tensor_in bias, + _megdnn_tensor_out dst, _megdnn_tensor_out mean, _megdnn_tensor_out rstd, + _megdnn_workspace workspace) = 0; + void deduce_layout( + const TensorLayout& data, const TensorLayout& weight, + const TensorLayout& bias, TensorLayout& dst, TensorLayout& mean, + TensorLayout& rstd); + virtual size_t get_workspace_in_bytes( + const TensorLayout& data, const TensorLayout& weight, + const TensorLayout& bias, const TensorLayout& dst, const TensorLayout& mean, + const TensorLayout& rstd) = 0; + +protected: + void check_exec( + const TensorLayout& data, const TensorLayout& weight, + const TensorLayout& bias, const TensorLayout& dst, const TensorLayout& mean, + const TensorLayout& rstd, size_t workspace_in_bytes); +}; +using LayerNorm = LayerNormForward; + +class LayerNormBackward : public LayerNormBase { + DEF_OPR_IMPL(LayerNormBackward, LayerNormBase, 5, 3); + +public: + virtual void exec( + _megdnn_tensor_in diff, _megdnn_tensor_in data, _megdnn_tensor_in weight, + _megdnn_tensor_in mean, _megdnn_tensor_in rstd, _megdnn_tensor_out ddata, + _megdnn_tensor_out dweight, _megdnn_tensor_out dbias, + _megdnn_workspace workspace) = 0; + void deduce_layout( + const TensorLayout& diff, const TensorLayout& data, + const TensorLayout& weight, const TensorLayout& mean, + const TensorLayout& rstd, TensorLayout& ddata, TensorLayout& dweight, + TensorLayout& dbias); + virtual size_t get_workspace_in_bytes( + const TensorLayout& diff, const TensorLayout& data, + const TensorLayout& weight, const TensorLayout& mean, + const TensorLayout& rstd, const TensorLayout& ddata, + const TensorLayout& dweight, const TensorLayout& dbias) = 0; + +protected: + void check_exec( + const TensorLayout& diff, const TensorLayout& data, + const TensorLayout& weight, const TensorLayout& mean, + const TensorLayout& rstd, const TensorLayout& ddata, + const TensorLayout& dweight, const 
TensorLayout& dbias, + size_t workspace_in_bytes); +}; + +class DropoutBase : public OperatorBase { + DEF_OPR_IMPL_CTOR(DropoutBase, OperatorBase); + DEF_OPR_PARAM(Dropout); +}; + +class DropoutForward : public DropoutBase { + DEF_OPR_IMPL(DropoutForward, DropoutBase, 1, 2); + +public: + void deduce_layout(const TensorLayout& inp, TensorLayout& oup, TensorLayout& mask); + virtual void exec( + _megdnn_tensor_in inp, _megdnn_tensor_out oup, _megdnn_tensor_out mask, + _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes( + const TensorLayout& inp, const TensorLayout& oup, + const TensorLayout& mask) = 0; + virtual size_t get_mask_size_in_bytes(const TensorLayout& inp) = 0; + +protected: + void check_exec( + const TensorLayout& inp, const TensorLayout& oup, const TensorLayout& mask, + size_t workspace_in_bytes); +}; +using Dropout = DropoutForward; + +class DropoutBackward : public DropoutBase { + DEF_OPR_IMPL(DropoutBackward, DropoutBase, 2, 1); + +public: + void deduce_layout( + const TensorLayout& doup, const TensorLayout& mask, TensorLayout& dinp); + virtual void exec( + _megdnn_tensor_in doup, _megdnn_tensor_in mask, _megdnn_tensor_out dinp, + _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes( + const TensorLayout& doup, const TensorLayout& mask, + const TensorLayout& dinp) = 0; + +protected: + void check_exec( + const TensorLayout& doup, const TensorLayout& mask, + const TensorLayout& dinp, size_t workspace_in_bytes); +}; + } // namespace megdnn #include "megdnn/internal/opr_header_epilogue.h" diff --git a/dnn/scripts/opr_param_defs.py b/dnn/scripts/opr_param_defs.py index 76220c99..9da6bbe9 100755 --- a/dnn/scripts/opr_param_defs.py +++ b/dnn/scripts/opr_param_defs.py @@ -1212,3 +1212,15 @@ PADDING_MODES = [Doc('REPLICATE = 0', 'aaaaaa|abcdefgh|hhhhhhh'), member_alias=[(i, 'PADDING_{}'.format(i)) for i in PADDING_MODES] ) ) + +(pdef('LayerNorm') + .add_fields('bool', 'affine', 'true') + .add_fields('float32', 'eps', '1e-5f') + .add_fields('uint64', 'normalized_dim', '1') + .add_fields('uint64', 'normalized_size', '1') +) + +(pdef('Dropout') + .add_fields('float32', 'drop_prob', '0') + .add_fields('uint64', 'seed', '0') + ) diff --git a/dnn/src/CMakeLists.txt b/dnn/src/CMakeLists.txt index 4e20c9a5..d0566165 100644 --- a/dnn/src/CMakeLists.txt +++ b/dnn/src/CMakeLists.txt @@ -5,168 +5,190 @@ file(GLOB_RECURSE SOURCES common/*.cpp naive/*.cpp) list(APPEND SOURCES ${PROJECT_BINARY_DIR}/genfiles/megbrain_build_config.h) if(NOT ${MGE_ARCH} STREQUAL "naive") - file(GLOB_RECURSE SOURCES_ fallback/*.cpp) + file(GLOB_RECURSE SOURCES_ fallback/*.cpp) + list(APPEND SOURCES ${SOURCES_}) + if(${MGE_ARCH} STREQUAL "fallback") + message(WARNING "build only with fallback") + elseif(${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386") + file(GLOB_RECURSE SOURCES_ x86/*.cpp) list(APPEND SOURCES ${SOURCES_}) - if(${MGE_ARCH} STREQUAL "fallback") - message(WARNING "build only with fallback") - elseif(${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386") - file(GLOB_RECURSE SOURCES_ x86/*.cpp) - list(APPEND SOURCES ${SOURCES_}) - if(NOT MSVC) - file(GLOB_RECURSE SOURCES_ x86/*.S) - set_source_files_properties(${SOURCES_} PROPERTIES LANGUAGE C) - list(APPEND SOURCES ${SOURCES_}) - endif() - elseif(${MGE_ARCH} STREQUAL "armv7") - file(GLOB_RECURSE SOURCES_ armv7/*.cpp) - list(APPEND SOURCES ${SOURCES_}) - file(GLOB_RECURSE SOURCES_ arm_common/*.cpp) - list(APPEND SOURCES ${SOURCES_}) - file(GLOB_RECURSE SOURCES_ armv7/*.S) - 
set_source_files_properties(${SOURCES_} PROPERTIES LANGUAGE C) - list(APPEND SOURCES ${SOURCES_}) - elseif(${MGE_ARCH} STREQUAL "aarch64") - file(GLOB_RECURSE SOURCES_ aarch64/*.cpp) - list(APPEND SOURCES ${SOURCES_}) - file(GLOB_RECURSE SOURCES_ arm_common/*.cpp) - list(APPEND SOURCES ${SOURCES_}) - file(GLOB_RECURSE SOURCES_ aarch64/*.S) - set_source_files_properties(${SOURCES_} PROPERTIES LANGUAGE C) - list(APPEND SOURCES ${SOURCES_}) + if(NOT MSVC) + file(GLOB_RECURSE SOURCES_ x86/*.S) + set_source_files_properties(${SOURCES_} PROPERTIES LANGUAGE C) + list(APPEND SOURCES ${SOURCES_}) endif() + elseif(${MGE_ARCH} STREQUAL "armv7") + file(GLOB_RECURSE SOURCES_ armv7/*.cpp) + list(APPEND SOURCES ${SOURCES_}) + file(GLOB_RECURSE SOURCES_ arm_common/*.cpp) + list(APPEND SOURCES ${SOURCES_}) + file(GLOB_RECURSE SOURCES_ armv7/*.S) + set_source_files_properties(${SOURCES_} PROPERTIES LANGUAGE C) + list(APPEND SOURCES ${SOURCES_}) + elseif(${MGE_ARCH} STREQUAL "aarch64") + file(GLOB_RECURSE SOURCES_ aarch64/*.cpp) + list(APPEND SOURCES ${SOURCES_}) + file(GLOB_RECURSE SOURCES_ arm_common/*.cpp) + list(APPEND SOURCES ${SOURCES_}) + file(GLOB_RECURSE SOURCES_ aarch64/*.S) + set_source_files_properties(${SOURCES_} PROPERTIES LANGUAGE C) + list(APPEND SOURCES ${SOURCES_}) + endif() endif() if(MGE_WITH_MIDOUT_PROFILE) - list(APPEND SOURCES ${PROJECT_SOURCE_DIR}/third_party/midout/src/midout.cpp) + list(APPEND SOURCES ${PROJECT_SOURCE_DIR}/third_party/midout/src/midout.cpp) endif() -############################################################################### +# ###################################################################################### # HIP_COMPILE -############################################################################### -macro (HIP_COMPILE _hip_target _hip_objs) - # Separate the sources from the options - HIP_GET_SOURCES_AND_OPTIONS(_sources - _cmake_options - _hipcc_options - _hcc_options - _nvcc_options - ${ARGN}) - HIP_PREPARE_TARGET_COMMANDS(${_hip_target} - OBJ _generated_files _source_files ${_sources} ${_cmake_options} - HIPCC_OPTIONS ${_hipcc_options} - HCC_OPTIONS ${_hcc_options} - NVCC_OPTIONS ${_nvcc_options}) - if(_source_files) - list(REMOVE_ITEM _sources ${_source_files}) - endif() +# ###################################################################################### +macro(HIP_COMPILE _hip_target _hip_objs) + # Separate the sources from the options + hip_get_sources_and_options(_sources _cmake_options _hipcc_options _hcc_options + _nvcc_options ${ARGN}) + hip_prepare_target_commands( + ${_hip_target} + OBJ + _generated_files + _source_files + ${_sources} + ${_cmake_options} + HIPCC_OPTIONS + ${_hipcc_options} + HCC_OPTIONS + ${_hcc_options} + NVCC_OPTIONS + ${_nvcc_options}) + if(_source_files) + list(REMOVE_ITEM _sources ${_source_files}) + endif() - add_custom_target(${_hip_target}) + add_custom_target(${_hip_target}) - # set return value - set(${_hip_objs} ${_generated_files}) + # set return value + set(${_hip_objs} ${_generated_files}) endmacro() -if (MGE_WITH_ROCM) - file (GLOB_RECURSE SOURCES_ rocm/*.cpp) - list (APPEND SOURCES ${SOURCES_}) - - # FIXME rocm may lost the first hip file, so currently we just create an - # empty file to bypass this error. 
- file(GLOB start.cpp.hip "" ) - list(APPEND HIP_SOURCES start.cpp.hip) - configure_file( - ${PROJECT_SOURCE_DIR}/dnn/include/hcc_detail/hcc_defs_prologue.h.in - ${PROJECT_BINARY_DIR}/dnn/include/hcc_detail/hcc_defs_prologue.h) - - configure_file( - ${PROJECT_SOURCE_DIR}/dnn/include/hcc_detail/hcc_defs_epilogue.h.in - ${PROJECT_BINARY_DIR}/dnn/include/hcc_detail/hcc_defs_epilogue.h) - - file(GLOB_RECURSE HIP_SOURCES_ rocm/*.cpp.hip) - set(HIP_TARGET_NAME megdnn_hip_kernel) - set(_HIPCC_OPTIONS "-fPIC") - set(_HCC_OPTIONS "-fPIC") - set(_NVCC_OPTIONS "-fPIC") - - list(APPEND HIP_SOURCES ${HIP_SOURCES_}) - set_source_files_properties(${HIP_SOURCES} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - HIP_INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/dnn - ${PROJECT_SOURCE_DIR}/dnn/include - ${PROJECT_BINARY_DIR}/dnn - ${PROJECT_BINARY_DIR}/genfiles - ${PROJECT_BINARY_DIR}/dnn/include - ${HIP_INCLUDE_DIR} - ${MIOPEN_INCLUDE_DIR} - ${ROCBLAS_INCLUDE_DIR} - ${ROCRAND_INCLUDE_DIR} - ${AMDOCL_INCLUDE_DIR}) - hip_compile( - ${HIP_TARGET_NAME} HIPOBJS ${HIP_SOURCES} - HIPCC_OPTIONS ${_HIPCC_OPTIONS} - HCC_OPTIONS ${_HCC_OPTIONS} - NVCC_OPTIONS ${_NVCC_OPTIONS}) - list(APPEND SOURCES ${HIPOBJS}) -endif () +if(MGE_WITH_ROCM) + file(GLOB_RECURSE SOURCES_ rocm/*.cpp) + list(APPEND SOURCES ${SOURCES_}) + + # FIXME rocm may lost the first hip file, so currently we just create an empty file to + # bypass this error. + file(GLOB start.cpp.hip "") + list(APPEND HIP_SOURCES start.cpp.hip) + configure_file(${PROJECT_SOURCE_DIR}/dnn/include/hcc_detail/hcc_defs_prologue.h.in + ${PROJECT_BINARY_DIR}/dnn/include/hcc_detail/hcc_defs_prologue.h) + + configure_file(${PROJECT_SOURCE_DIR}/dnn/include/hcc_detail/hcc_defs_epilogue.h.in + ${PROJECT_BINARY_DIR}/dnn/include/hcc_detail/hcc_defs_epilogue.h) + + file(GLOB_RECURSE HIP_SOURCES_ rocm/*.cpp.hip) + set(HIP_TARGET_NAME megdnn_hip_kernel) + set(_HIPCC_OPTIONS "-fPIC") + set(_HCC_OPTIONS "-fPIC") + set(_NVCC_OPTIONS "-fPIC") + + list(APPEND HIP_SOURCES ${HIP_SOURCES_}) + set_source_files_properties(${HIP_SOURCES} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + hip_include_directories( + ${PROJECT_SOURCE_DIR}/dnn + ${PROJECT_SOURCE_DIR}/dnn/include + ${PROJECT_BINARY_DIR}/dnn + ${PROJECT_BINARY_DIR}/genfiles + ${PROJECT_BINARY_DIR}/dnn/include + ${HIP_INCLUDE_DIR} + ${MIOPEN_INCLUDE_DIR} + ${ROCBLAS_INCLUDE_DIR} + ${ROCRAND_INCLUDE_DIR} + ${AMDOCL_INCLUDE_DIR}) + hip_compile( + ${HIP_TARGET_NAME} + HIPOBJS + ${HIP_SOURCES} + HIPCC_OPTIONS + ${_HIPCC_OPTIONS} + HCC_OPTIONS + ${_HCC_OPTIONS} + NVCC_OPTIONS + ${_NVCC_OPTIONS}) + list(APPEND SOURCES ${HIPOBJS}) +endif() if(MGE_WITH_CUDA) - file(GLOB_RECURSE SOURCES_ cuda/*.cpp) - list(APPEND SOURCES ${SOURCES_}) + file(GLOB_RECURSE SOURCES_ cuda/*.cpp) + list(APPEND SOURCES ${SOURCES_}) - file(GLOB_RECURSE CUSOURCES cuda/*.cu) - - set(CUTLASS_GEN_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/../scripts/cutlass_generator/generator.py) - set(CUTLASS_GEN_DIR ${CMAKE_CURRENT_BINARY_DIR}/cuda/cutlass/generated) - set(CUTLASS_SOURCES "") - function(gen_cutlass_kimpl op type gen_files) - set(CURRENT_CUTLASS_STAGE_DIR ${CUTLASS_GEN_DIR}/${op}_${type}.stage) - set(CURRENT_CUTLASS_GEN_DIR ${CUTLASS_GEN_DIR}/${op}_${type}) - - set_directory_properties(PROPERTIES CMAKE_CONFIGURE_DEPENDS ${CUTLASS_GEN_SCRIPT}) - - file(REMOVE_RECURSE ${CURRENT_CUTLASS_STAGE_DIR}) - file(MAKE_DIRECTORY ${CURRENT_CUTLASS_STAGE_DIR}) - file(MAKE_DIRECTORY ${CURRENT_CUTLASS_GEN_DIR}) - execute_process( - COMMAND ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${CUTLASS_GEN_SCRIPT} --operations 
${op} --type ${type} ${CURRENT_CUTLASS_STAGE_DIR} - RESULT_VARIABLE gen_cutlass_result - OUTPUT_FILE ${CURRENT_CUTLASS_GEN_DIR}/gen_cutlass.log - ERROR_FILE ${CURRENT_CUTLASS_GEN_DIR}/gen_cutlass.log - ) - if (NOT gen_cutlass_result EQUAL 0) - message(FATAL_ERROR "Error generating library instances. See ${CURRENT_CUTLASS_GEN_DIR}/gen_cutlass.log") - endif() - file(GLOB CUTLASS_GEN_FILES RELATIVE "${CURRENT_CUTLASS_GEN_DIR}/" "${CURRENT_CUTLASS_GEN_DIR}/*.cu") - foreach(FILE ${CUTLASS_GEN_FILES}) - if (NOT EXISTS "${CURRENT_CUTLASS_STAGE_DIR}/${FILE}") - file(REMOVE "${CURRENT_CUTLASS_GEN_DIR}/${FILE}") - endif() - endforeach() - file(GLOB CUTLASS_GEN_FILES RELATIVE "${CURRENT_CUTLASS_STAGE_DIR}" "${CURRENT_CUTLASS_STAGE_DIR}/*.cu") - foreach(FILE ${CUTLASS_GEN_FILES}) - execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different "${CURRENT_CUTLASS_STAGE_DIR}/${FILE}" "${CURRENT_CUTLASS_GEN_DIR}") - endforeach() - file(REMOVE_RECURSE ${CURRENT_CUTLASS_STAGE_DIR}) - file(GLOB_RECURSE CUTLASS_GEN_FILES "${CURRENT_CUTLASS_GEN_DIR}/*.cu") - list(APPEND ${gen_files} ${CUTLASS_GEN_FILES}) - set(${gen_files} "${${gen_files}}" PARENT_SCOPE) - endfunction() - gen_cutlass_kimpl(gemm simt CUTLASS_SOURCES) - gen_cutlass_kimpl(gemm tensorop884 CUTLASS_SOURCES) - gen_cutlass_kimpl(gemm tensorop1688 CUTLASS_SOURCES) - gen_cutlass_kimpl(gemv simt CUTLASS_SOURCES) - gen_cutlass_kimpl(deconv simt CUTLASS_SOURCES) - gen_cutlass_kimpl(deconv tensorop8816 CUTLASS_SOURCES) - gen_cutlass_kimpl(conv2d simt CUTLASS_SOURCES) - gen_cutlass_kimpl(conv2d tensorop8816 CUTLASS_SOURCES) - gen_cutlass_kimpl(conv2d tensorop8832 CUTLASS_SOURCES) - list(APPEND SOURCES ${CUTLASS_SOURCES}) - list(APPEND SOURCES ${CUSOURCES}) + file(GLOB_RECURSE CUSOURCES cuda/*.cu) + + set(CUTLASS_GEN_SCRIPT + ${CMAKE_CURRENT_SOURCE_DIR}/../scripts/cutlass_generator/generator.py) + set(CUTLASS_GEN_DIR ${CMAKE_CURRENT_BINARY_DIR}/cuda/cutlass/generated) + set(CUTLASS_SOURCES "") + function(gen_cutlass_kimpl op type gen_files) + set(CURRENT_CUTLASS_STAGE_DIR ${CUTLASS_GEN_DIR}/${op}_${type}.stage) + set(CURRENT_CUTLASS_GEN_DIR ${CUTLASS_GEN_DIR}/${op}_${type}) + + set_directory_properties(PROPERTIES CMAKE_CONFIGURE_DEPENDS ${CUTLASS_GEN_SCRIPT}) + + file(REMOVE_RECURSE ${CURRENT_CUTLASS_STAGE_DIR}) + file(MAKE_DIRECTORY ${CURRENT_CUTLASS_STAGE_DIR}) + file(MAKE_DIRECTORY ${CURRENT_CUTLASS_GEN_DIR}) + execute_process( + COMMAND ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${CUTLASS_GEN_SCRIPT} --operations + ${op} --type ${type} ${CURRENT_CUTLASS_STAGE_DIR} + RESULT_VARIABLE gen_cutlass_result + OUTPUT_FILE ${CURRENT_CUTLASS_GEN_DIR}/gen_cutlass.log + ERROR_FILE ${CURRENT_CUTLASS_GEN_DIR}/gen_cutlass.log) + if(NOT gen_cutlass_result EQUAL 0) + message( + FATAL_ERROR + "Error generating library instances. 
See ${CURRENT_CUTLASS_GEN_DIR}/gen_cutlass.log" + ) + endif() + file( + GLOB CUTLASS_GEN_FILES + RELATIVE "${CURRENT_CUTLASS_GEN_DIR}/" + "${CURRENT_CUTLASS_GEN_DIR}/*.cu") + foreach(FILE ${CUTLASS_GEN_FILES}) + if(NOT EXISTS "${CURRENT_CUTLASS_STAGE_DIR}/${FILE}") + file(REMOVE "${CURRENT_CUTLASS_GEN_DIR}/${FILE}") + endif() + endforeach() + file( + GLOB CUTLASS_GEN_FILES + RELATIVE "${CURRENT_CUTLASS_STAGE_DIR}" + "${CURRENT_CUTLASS_STAGE_DIR}/*.cu") + foreach(FILE ${CUTLASS_GEN_FILES}) + execute_process( + COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${CURRENT_CUTLASS_STAGE_DIR}/${FILE}" "${CURRENT_CUTLASS_GEN_DIR}") + endforeach() + file(REMOVE_RECURSE ${CURRENT_CUTLASS_STAGE_DIR}) + file(GLOB_RECURSE CUTLASS_GEN_FILES "${CURRENT_CUTLASS_GEN_DIR}/*.cu") + list(APPEND ${gen_files} ${CUTLASS_GEN_FILES}) + set(${gen_files} + "${${gen_files}}" + PARENT_SCOPE) + endfunction() + gen_cutlass_kimpl(gemm simt CUTLASS_SOURCES) + gen_cutlass_kimpl(gemm tensorop884 CUTLASS_SOURCES) + gen_cutlass_kimpl(gemm tensorop1688 CUTLASS_SOURCES) + gen_cutlass_kimpl(gemv simt CUTLASS_SOURCES) + gen_cutlass_kimpl(deconv simt CUTLASS_SOURCES) + gen_cutlass_kimpl(deconv tensorop8816 CUTLASS_SOURCES) + gen_cutlass_kimpl(conv2d simt CUTLASS_SOURCES) + gen_cutlass_kimpl(conv2d tensorop8816 CUTLASS_SOURCES) + gen_cutlass_kimpl(conv2d tensorop8832 CUTLASS_SOURCES) + list(APPEND SOURCES ${CUTLASS_SOURCES}) + list(APPEND SOURCES ${CUSOURCES}) endif() if(MGE_WITH_ATLAS) - file(GLOB_RECURSE SOURCES_ atlas/*.cpp) - list(APPEND SOURCES ${SOURCES_}) - list(APPEND LIBMEGDNN_DEF -DMEGDNN_WITH_ATLAS=1) + file(GLOB_RECURSE SOURCES_ atlas/*.cpp) + list(APPEND SOURCES ${SOURCES_}) + list(APPEND LIBMEGDNN_DEF -DMEGDNN_WITH_ATLAS=1) endif() add_definitions(${LIBMEGDNN_DEF}) @@ -174,81 +196,85 @@ add_library(megdnn EXCLUDE_FROM_ALL OBJECT ${SOURCES}) target_link_libraries(megdnn PUBLIC opr_param_defs) if(MGE_WITH_CUDA) - target_link_libraries(megdnn PRIVATE $) - target_include_directories(megdnn PRIVATE ${CUDNN_INCLUDE_DIR}) + target_link_libraries(megdnn PRIVATE $) + target_include_directories(megdnn PRIVATE ${CUDNN_INCLUDE_DIR}) endif() if(MGE_WITH_ROCM) - target_include_directories(megdnn PUBLIC - ${HIP_INCLUDE_DIR} - ${MIOPEN_INCLUDE_DIR} - ${ROCBLAS_INCLUDE_DIR} - ${ROCRAND_INCLUDE_DIR} - ${AMDOCL_INCLUDE_DIR}) - target_link_directories(megdnn PUBLIC - ${HIP_LIBRARY_DIR} - ${MIOPEN_LIBRARY_DIR} - ${ROCBLAS_LIBRARY_DIR} - ${ROCRAND_LIBRARY_DIR} - ${AMDOCL_LIBRARY_DIR}) + target_include_directories( + megdnn PUBLIC ${HIP_INCLUDE_DIR} ${MIOPEN_INCLUDE_DIR} ${ROCBLAS_INCLUDE_DIR} + ${ROCRAND_INCLUDE_DIR} ${AMDOCL_INCLUDE_DIR}) + target_link_directories( + megdnn + PUBLIC + ${HIP_LIBRARY_DIR} + ${MIOPEN_LIBRARY_DIR} + ${ROCBLAS_LIBRARY_DIR} + ${ROCRAND_LIBRARY_DIR} + ${AMDOCL_LIBRARY_DIR}) endif() -if(${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386" OR ${MGE_ARCH} STREQUAL "armv7" OR ${MGE_ARCH} STREQUAL "aarch64") - if(MGE_ENABLE_CPUINFO) - target_link_libraries(megdnn PRIVATE $) - endif() +if(${MGE_ARCH} STREQUAL "x86_64" + OR ${MGE_ARCH} STREQUAL "i386" + OR ${MGE_ARCH} STREQUAL "armv7" + OR ${MGE_ARCH} STREQUAL "aarch64") + if(MGE_ENABLE_CPUINFO) + target_link_libraries(megdnn PRIVATE $) + endif() endif() -target_include_directories(megdnn - PUBLIC - $ - $ - $ - PRIVATE - ${PROJECT_SOURCE_DIR}/dnn - ${PROJECT_SOURCE_DIR}/third_party/midout/src -) +target_include_directories( + megdnn + PUBLIC $ + $ + $ + PRIVATE ${PROJECT_SOURCE_DIR}/dnn ${PROJECT_SOURCE_DIR}/third_party/midout/src) -install(DIRECTORY 
${PROJECT_SOURCE_DIR}/dnn/include DESTINATION . FILES_MATCHING PATTERN "*.h*") +install( + DIRECTORY ${PROJECT_SOURCE_DIR}/dnn/include + DESTINATION . + FILES_MATCHING + PATTERN "*.h*") if(CXX_SUPPORT_WCLASS_MEMACCESS) - if(MGE_WITH_CUDA) - target_compile_options(megdnn PRIVATE "$<$:-Xcompiler=-Wno-class-memaccess>" - "$<$>:-Wno-class-memaccess>") - else() - target_compile_options(megdnn PRIVATE "-Wno-class-memaccess") - endif() + if(MGE_WITH_CUDA) + target_compile_options( + megdnn PRIVATE "$<$:-Xcompiler=-Wno-class-memaccess>" + "$<$>:-Wno-class-memaccess>") + else() + target_compile_options(megdnn PRIVATE "-Wno-class-memaccess") + endif() endif() target_compile_definitions(megdnn INTERFACE ${LIBMEGDNN_DEF}) if(MGE_WITH_MKLDNN AND ${MGE_ARCH} STREQUAL "x86_64") - if (BUILD_SHARED_LIBS) - target_link_libraries(megdnn PRIVATE $) - else() - target_link_libraries(megdnn PRIVATE dnnl) - endif() + if(BUILD_SHARED_LIBS) + target_link_libraries(megdnn PRIVATE $) + else() + target_link_libraries(megdnn PRIVATE dnnl) + endif() endif() -if (BUILD_SHARED_LIBS) - target_link_libraries(megdnn PRIVATE $) +if(BUILD_SHARED_LIBS) + target_link_libraries(megdnn PRIVATE $) else() - target_link_libraries(megdnn PRIVATE ${MGE_BLAS_LIBS}) + target_link_libraries(megdnn PRIVATE ${MGE_BLAS_LIBS}) endif() -if (MGE_WITH_ROCM) - target_link_libraries(megdnn PRIVATE ${HIPOBJS} ${MGE_ROCM_LIBS}) -endif () +if(MGE_WITH_ROCM) + target_link_libraries(megdnn PRIVATE ${HIPOBJS} ${MGE_ROCM_LIBS}) +endif() if(MGE_WITH_ATLAS) - if (BUILD_SHARED_LIBS) - target_link_libraries(megdnn PRIVATE $) - else() - target_link_libraries(megdnn PRIVATE ${MGE_ATLAS_LIBS}) - endif() + if(BUILD_SHARED_LIBS) + target_link_libraries(megdnn PRIVATE $) + else() + target_link_libraries(megdnn PRIVATE ${MGE_ATLAS_LIBS}) + endif() endif() if(CMAKE_THREAD_LIBS_INIT) - target_link_libraries(megdnn PRIVATE Threads::Threads) + target_link_libraries(megdnn PRIVATE Threads::Threads) endif() install(TARGETS megdnn EXPORT ${MGE_EXPORT_TARGETS}) diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1_bias.cpp similarity index 86% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1.cpp rename to dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1_bias.cpp index 30874d33..200769d6 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1_bias.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1.cpp + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1_bias.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -11,4 +11,5 @@ * implied. 
*/ #include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h" -INSTANTIATION_CONV_S1(2); \ No newline at end of file +INSTANTIATION_CONV_S1_BIAS(2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1_broadcast_channel_bias.cpp new file mode 100644 index 00000000..c6c974a5 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h" +INSTANTIATION_CONV_S1_BROADCAST_CHANNEL_BIAS(2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1_no_bias.cpp new file mode 100644 index 00000000..6f075a54 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1_no_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h" +INSTANTIATION_CONV_S1_NO_BIAS(2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2_bias.cpp similarity index 86% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2.cpp rename to dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2_bias.cpp index be37fc1c..a9728847 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2_bias.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2.cpp + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2_bias.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -11,4 +11,5 @@ * implied. 
*/ #include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h" -INSTANTIATION_CONV_S2(5); \ No newline at end of file +INSTANTIATION_CONV_S2_BIAS(2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2_broadcast_channel_bias.cpp new file mode 100644 index 00000000..ae899e2c --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h" +INSTANTIATION_CONV_S2_BROADCAST_CHANNEL_BIAS(2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2_no_bias.cpp new file mode 100644 index 00000000..94c09aea --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2_no_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h" +INSTANTIATION_CONV_S2_NO_BIAS(2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1_bias.cpp similarity index 86% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1.cpp rename to dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1_bias.cpp index 689df883..0047c51e 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1_bias.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1.cpp + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1_bias.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -11,4 +11,5 @@ * implied. 
*/ #include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h" -INSTANTIATION_CONV_S1(5); \ No newline at end of file +INSTANTIATION_CONV_S1_BIAS(3); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1_broadcast_channel_bias.cpp new file mode 100644 index 00000000..c273dede --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h" +INSTANTIATION_CONV_S1_BROADCAST_CHANNEL_BIAS(3); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1_no_bias.cpp new file mode 100644 index 00000000..719dbd1d --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1_no_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h" +INSTANTIATION_CONV_S1_NO_BIAS(3); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2_bias.cpp similarity index 86% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2.cpp rename to dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2_bias.cpp index 355c9ed6..01209f9c 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2_bias.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2.cpp + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2_bias.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -11,4 +11,5 @@ * implied. 
*/ #include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h" -INSTANTIATION_CONV_S2(2); \ No newline at end of file +INSTANTIATION_CONV_S2_BIAS(3); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2_broadcast_channel_bias.cpp new file mode 100644 index 00000000..7bed53e2 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h" +INSTANTIATION_CONV_S2_BROADCAST_CHANNEL_BIAS(3); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2_no_bias.cpp new file mode 100644 index 00000000..9aa190df --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2_no_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h" +INSTANTIATION_CONV_S2_NO_BIAS(3); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1_bias.cpp similarity index 86% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1.cpp rename to dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1_bias.cpp index 6d99b01b..5cbcb78a 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1_bias.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1.cpp + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1_bias.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -11,4 +11,5 @@ * implied. 
*/ #include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h" -INSTANTIATION_CONV_S1(3); \ No newline at end of file +INSTANTIATION_CONV_S1_BIAS(5); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1_broadcast_channel_bias.cpp new file mode 100644 index 00000000..bcf92bab --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h" +INSTANTIATION_CONV_S1_BROADCAST_CHANNEL_BIAS(5); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1_no_bias.cpp new file mode 100644 index 00000000..d944b02b --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1_no_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h" +INSTANTIATION_CONV_S1_NO_BIAS(5); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2_bias.cpp similarity index 86% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2.cpp rename to dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2_bias.cpp index a7571939..a75f159a 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2_bias.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2.cpp + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2_bias.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -11,4 +11,5 @@ * implied. 
*/ #include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h" -INSTANTIATION_CONV_S2(7); +INSTANTIATION_CONV_S2_BIAS(5); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2_broadcast_channel_bias.cpp new file mode 100644 index 00000000..ff9653ea --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h" +INSTANTIATION_CONV_S2_BROADCAST_CHANNEL_BIAS(5); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2_no_bias.cpp new file mode 100644 index 00000000..a2705bde --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2_no_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h" +INSTANTIATION_CONV_S2_NO_BIAS(5); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1_bias.cpp similarity index 86% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1.cpp rename to dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1_bias.cpp index db2be268..47cbf3d7 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1_bias.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1.cpp + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1_bias.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -11,4 +11,5 @@ * implied. 
*/ #include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h" -INSTANTIATION_CONV_S1(7); \ No newline at end of file +INSTANTIATION_CONV_S1_BIAS(7); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1_broadcast_channel_bias.cpp new file mode 100644 index 00000000..f8fa2c29 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h" +INSTANTIATION_CONV_S1_BROADCAST_CHANNEL_BIAS(7); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1_no_bias.cpp new file mode 100644 index 00000000..c7824aad --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1_no_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h" +INSTANTIATION_CONV_S1_NO_BIAS(7); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2_bias.cpp similarity index 86% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2.cpp rename to dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2_bias.cpp index 6075a8ca..fd603f3b 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2_bias.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2.cpp + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2_bias.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -11,4 +11,5 @@ * implied. 
*/ #include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h" -INSTANTIATION_CONV_S2(3); \ No newline at end of file +INSTANTIATION_CONV_S2_BIAS(7); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2_broadcast_channel_bias.cpp new file mode 100644 index 00000000..bd1e5c29 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h" +INSTANTIATION_CONV_S2_BROADCAST_CHANNEL_BIAS(7); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2_no_bias.cpp new file mode 100644 index 00000000..0caee33c --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2_no_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h" +INSTANTIATION_CONV_S2_NO_BIAS(7); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h index bf9418b4..3915caea 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h @@ -469,9 +469,12 @@ void conv_bias::conv_direct_fp32_nchw44( INSTANTIATION(filter_size, bias, HSwishOp) \ INSTANTIATION(filter_size, bias, SigmoidOp) -#define INSTANTIATION_CONV_S1(filter_size) \ - FOR_OP(filter_size, BiasMode::NO_BIAS) \ - FOR_OP(filter_size, BiasMode::BROADCAST_CHANNEL_BIAS) \ - FOR_OP(filter_size, BiasMode::BIAS) +#define INSTANTIATION_CONV_S1_NO_BIAS(filter_size) \ + FOR_OP(filter_size, BiasMode::NO_BIAS) -// vim: syntax=cpp.doxygen \ No newline at end of file +#define INSTANTIATION_CONV_S1_BROADCAST_CHANNEL_BIAS(filter_size) \ + FOR_OP(filter_size, BiasMode::BROADCAST_CHANNEL_BIAS) + +#define INSTANTIATION_CONV_S1_BIAS(filter_size) FOR_OP(filter_size, BiasMode::BIAS) + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h index b31cd438..cbbf047a 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h @@ -550,9 +550,12 @@ void conv_bias::conv_direct_fp32_nchw44( INSTANTIATION(filter_size, bias, HSwishOp) \ INSTANTIATION(filter_size, bias, SigmoidOp) -#define INSTANTIATION_CONV_S2(filter_size) \ - FOR_OP(filter_size, BiasMode::NO_BIAS) \ - FOR_OP(filter_size, BiasMode::BROADCAST_CHANNEL_BIAS) \ - FOR_OP(filter_size, BiasMode::BIAS) +#define INSTANTIATION_CONV_S2_NO_BIAS(filter_size) \ + FOR_OP(filter_size, BiasMode::NO_BIAS) -// vim: syntax=cpp.doxygen \ No newline at end of file +#define INSTANTIATION_CONV_S2_BROADCAST_CHANNEL_BIAS(filter_size) \ + FOR_OP(filter_size, BiasMode::BROADCAST_CHANNEL_BIAS) + +#define INSTANTIATION_CONV_S2_BIAS(filter_size) FOR_OP(filter_size, BiasMode::BIAS) + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1_bias.cpp similarity index 86% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1.cpp rename to dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1_bias.cpp index 291741b9..e82c464e 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1_bias.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1.cpp + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1_bias.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -11,4 +11,5 @@ * implied. 
*/ #include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" -INSTANCE_CONV(2, 1); \ No newline at end of file +INSTANCE_CONV_BIAS(2, 1); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1_broadcast_channel_bias.cpp new file mode 100644 index 00000000..fbdb5ec7 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_BROADCAST_CHANNEL_BIAS(2, 1); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1_no_bias.cpp new file mode 100644 index 00000000..97f2595c --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1_no_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_NO_BIAS(2, 1); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2_bias.cpp similarity index 86% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2.cpp rename to dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2_bias.cpp index d3f360c5..3acba768 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2_bias.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2.cpp + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2_bias.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -11,4 +11,5 @@ * implied. 
*/ #include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" -INSTANCE_CONV(2, 2); +INSTANCE_CONV_BIAS(2, 2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2_broadcast_channel_bias.cpp new file mode 100644 index 00000000..1e4c7197 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_BROADCAST_CHANNEL_BIAS(2, 2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2_no_bias.cpp new file mode 100644 index 00000000..03bc548f --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2_no_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_NO_BIAS(2, 2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1_bias.cpp similarity index 86% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1.cpp rename to dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1_bias.cpp index 432708e7..89bc21d5 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1_bias.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1.cpp + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1_bias.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -11,4 +11,5 @@ * implied. 
*/ #include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" -INSTANCE_CONV(3, 1); +INSTANCE_CONV_BIAS(3, 1); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1_broadcast_channel_bias.cpp new file mode 100644 index 00000000..fe811030 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_BROADCAST_CHANNEL_BIAS(3, 1); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1_no_bias.cpp new file mode 100644 index 00000000..88cbe5f8 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1_no_bias + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_NO_BIAS(3, 1); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2_bias.cpp similarity index 86% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2.cpp rename to dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2_bias.cpp index 38ffe8ef..f2f02815 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2_bias.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2.cpp + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2_bias.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -11,4 +11,5 @@ * implied. 
*/ #include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" -INSTANCE_CONV(3, 2); +INSTANCE_CONV_BIAS(3, 2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2_broadcast_channel_bias.cpp new file mode 100644 index 00000000..416e9839 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_BROADCAST_CHANNEL_BIAS(3, 2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2_no_bias.cpp new file mode 100644 index 00000000..bf3f792d --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2_no_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_NO_BIAS(3, 2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1_bias.cpp new file mode 100644 index 00000000..f4d38c4e --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_BIAS(5, 1); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1_broadcast_channel_bias.cpp new file mode 100644 index 00000000..1dcb60b0 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_BROADCAST_CHANNEL_BIAS(5, 1); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1_no_bias.cpp new file mode 100644 index 00000000..e32fbccb --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1_no_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_NO_BIAS(5, 1); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2_bias.cpp new file mode 100644 index 00000000..a818401f --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_BIAS(5, 2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2_broadcast_channel_bias.cpp new file mode 100644 index 00000000..be387827 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_BROADCAST_CHANNEL_BIAS(5, 2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2_no_bias.cpp new file mode 100644 index 00000000..64c9db59 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2_no_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_NO_BIAS(5, 2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1_bias.cpp new file mode 100644 index 00000000..6fb2e117 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_BIAS(7, 1); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1_broadcast_channel_bias.cpp new file mode 100644 index 00000000..74ad5102 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_BROADCAST_CHANNEL_BIAS(7, 1); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1_no_bias.cpp new file mode 100644 index 00000000..94af0cbd --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1_no_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_NO_BIAS(7, 1); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2_bias.cpp new file mode 100644 index 00000000..576213bc --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_BIAS(7, 2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2_broadcast_channel_bias.cpp new file mode 100644 index 00000000..58890e90 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_BROADCAST_CHANNEL_BIAS(7, 2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2_no_bias.cpp new file mode 100644 index 00000000..4a4e0f35 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2_no_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_NO_BIAS(7, 2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h index 36daabc8..1869f2ff 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h @@ -928,9 +928,11 @@ void fp32_direct_nchw_nchw44::conv_direct_fp32_nchw_nchw44( INSTANTIATION(stride, filter, bias, ReluOp) \ INSTANTIATION(stride, filter, bias, HSwishOp) -#define INSTANCE_CONV(filter, stride) \ - FOR_OP(stride, filter, BiasMode::NO_BIAS) \ - FOR_OP(stride, filter, BiasMode::BROADCAST_CHANNEL_BIAS) \ - FOR_OP(stride, filter, BiasMode::BIAS) +#define INSTANCE_CONV_NO_BIAS(filter, stride) FOR_OP(stride, filter, BiasMode::NO_BIAS) + +#define INSTANCE_CONV_BROADCAST_CHANNEL_BIAS(filter, stride) \ + FOR_OP(stride, filter, BiasMode::BROADCAST_CHANNEL_BIAS) + +#define INSTANCE_CONV_BIAS(filter, stride) FOR_OP(stride, filter, BiasMode::BIAS) // vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1.h similarity index 97% rename from dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1.cpp rename to dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1.h index cc9624f3..48d7bd1d 100644 --- a/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1.cpp +++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1.h @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1.cpp + * dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1.h * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
@@ -265,7 +265,8 @@ void conv_direct_sdot_int8_nchw44( #define INSTANTIATION(dst_type, stride, filter_size, bias_mode, Op) \ template void \ - conv_direct_sdot_int8_nchw44( \ + megdnn::arm_common::direct_dotprod_nchw44::conv_direct_sdot_int8_nchw44< \ + dst_type, stride, bias_mode, Op, filter_size>( \ dst_type * dst, const int oh, const int ow, const int8_t* src, \ const int ih, const int iw, const int8_t* weight, const int32_t* bias, \ const int oh_size, const int oc, const int ic, const Op& op); @@ -284,22 +285,6 @@ void conv_direct_sdot_int8_nchw44( FOR_OP(stride, i, BiasMode::NO_BIAS) \ FOR_OP(stride, i, BiasMode::BROADCAST_CHANNEL_BIAS) -#define FOR_FILTER(stride) \ - FOR_BIAS(stride, 2) \ - FOR_BIAS(stride, 3) \ - FOR_BIAS(stride, 5) \ - FOR_BIAS(stride, 7) - -FOR_FILTER(1) - -#undef FOR_STRIDE -#undef FOR_FILTER -#undef FOR_IC -#undef FOR_BIAS -#undef FOR_NONLINEAR -#undef FOR_REMAIN -#undef INSTANTIATION - } // namespace direct_dotprod_nchw44 } // namespace arm_common } // namespace megdnn diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_2x2.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_2x2.cpp new file mode 100644 index 00000000..66ae2846 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_2x2.cpp @@ -0,0 +1,21 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_2x2.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1.h" +#if MGB_ENABLE_DOT +using namespace megdnn; +using namespace arm_common; + +FOR_BIAS(1, 2); + +#endif +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_3x3.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_3x3.cpp new file mode 100644 index 00000000..faf8f46c --- /dev/null +++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_3x3.cpp @@ -0,0 +1,21 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_3x3.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
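In dot_direct_nchw44_s1.h (formerly the .cpp), the INSTANTIATION macro now spells out the fully qualified template name megdnn::arm_common::direct_dotprod_nchw44::conv_direct_sdot_int8_nchw44<...>, and the FOR_FILTER driver plus the trailing #undef block are removed so that FOR_BIAS stays defined for the new per-filter-size sources (dot_direct_nchw44_s1_2x2.cpp and friends). The qualification is what lets those sources emit the explicit instantiations themselves: they invoke FOR_BIAS outside the original namespace, where an unqualified template name would not be accepted. A minimal illustration of the C++ rule involved, with illustrative names:

    namespace ns_sketch {
    template <int filter>
    void kern_sketch(int) {}
    }  // namespace ns_sketch

    // An explicit instantiation written outside the template's namespace must
    // use a qualified name:
    template void ns_sketch::kern_sketch<3>(int);  // OK
    // template void kern_sketch<3>(int);          // ill-formed: an unqualified-id
    //                                             // would have to appear inside ns_sketch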
+ */ +#include "src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1.h" +#if MGB_ENABLE_DOT +using namespace megdnn; +using namespace arm_common; + +FOR_BIAS(1, 3); + +#endif +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_5x5.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_5x5.cpp new file mode 100644 index 00000000..94fe0811 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_5x5.cpp @@ -0,0 +1,21 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_5x5.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1.h" +#if MGB_ENABLE_DOT +using namespace megdnn; +using namespace arm_common; + +FOR_BIAS(1, 5); + +#endif +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_7x7.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_7x7.cpp new file mode 100644 index 00000000..001c58a9 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_7x7.cpp @@ -0,0 +1,21 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_7x7.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1.h" +#if MGB_ENABLE_DOT +using namespace megdnn; +using namespace arm_common; + +FOR_BIAS(1, 7); + +#endif +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2.h similarity index 97% rename from dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2.cpp rename to dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2.h index 841e868f..21ce451b 100644 --- a/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2.cpp +++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2.h @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2.cpp + * dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2.h * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
@@ -266,7 +266,8 @@ void conv_direct_sdot_int8_nchw44( #define INSTANTIATION(dst_type, stride, filter_size, bias_mode, Op) \ template void \ - conv_direct_sdot_int8_nchw44( \ + megdnn::arm_common::direct_dotprod_nchw44::conv_direct_sdot_int8_nchw44< \ + dst_type, stride, bias_mode, Op, filter_size>( \ dst_type * dst, const int oh, const int ow, const int8_t* src, \ const int ih, const int iw, const int8_t* weight, const int32_t* bias, \ const int oh_size, const int oc, const int ic, const Op& op); @@ -285,22 +286,6 @@ void conv_direct_sdot_int8_nchw44( FOR_OP(stride, i, BiasMode::NO_BIAS) \ FOR_OP(stride, i, BiasMode::BROADCAST_CHANNEL_BIAS) -#define FOR_FILTER(stride) \ - FOR_BIAS(stride, 2) \ - FOR_BIAS(stride, 3) \ - FOR_BIAS(stride, 5) \ - FOR_BIAS(stride, 7) - -FOR_FILTER(2) - -#undef FOR_STRIDE -#undef FOR_FILTER -#undef FOR_IC -#undef FOR_BIAS -#undef FOR_NONLINEAR -#undef FOR_REMAIN -#undef INSTANTIATION - } // namespace direct_dotprod_nchw44 } // namespace arm_common } // namespace megdnn diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_2x2.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_2x2.cpp new file mode 100644 index 00000000..521ce682 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_2x2.cpp @@ -0,0 +1,21 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_2x2.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2.h" +#if MGB_ENABLE_DOT +using namespace megdnn; +using namespace arm_common; + +FOR_BIAS(2, 2); + +#endif +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_3x3.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_3x3.cpp new file mode 100644 index 00000000..d2af6eca --- /dev/null +++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_3x3.cpp @@ -0,0 +1,21 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_3x3.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#include "src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2.h" +#if MGB_ENABLE_DOT +using namespace megdnn; +using namespace arm_common; + +FOR_BIAS(2, 3); + +#endif +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_5x5.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_5x5.cpp new file mode 100644 index 00000000..949c105e --- /dev/null +++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_5x5.cpp @@ -0,0 +1,21 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_5x5.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2.h" +#if MGB_ENABLE_DOT +using namespace megdnn; +using namespace arm_common; + +FOR_BIAS(2, 5); + +#endif +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_7x7.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_7x7.cpp new file mode 100644 index 00000000..4337a3e5 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_7x7.cpp @@ -0,0 +1,21 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_7x7.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2.h" +#if MGB_ENABLE_DOT +using namespace megdnn; +using namespace arm_common; + +FOR_BIAS(2, 7); + +#endif +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_common.h b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_common.h index 74a11e4f..174c48c3 100644 --- a/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_common.h +++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_common.h @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.cpp + * dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_common.h * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
@@ -45,4 +45,4 @@ public: } // namespace arm_common } // namespace megdnn -// vim: syntax=cpp.doxygen \ No newline at end of file +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.cpp index f238a1fc..3373cf9a 100644 --- a/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.cpp +++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.cpp @@ -13,336 +13,9 @@ #include "src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_common.h" #include "src/arm_common/conv_bias/int8/direct_nchw_nchw44_kern.h" + namespace megdnn { namespace arm_common { -namespace { -/** - * @brief core code for calculation patten - * - * @tparam src_idx is offset of src reg - * @tparam weight_idx is offset of weight reg - * @tparam c_dim is output channel - * @tparam Func mla operation funcion - * @tparam stride - * @tparam T outpur regs type - * @tparam T2 src regs type - * @tparam T3 weight regs type - * @tparam T4 temp regs type - */ - -template < - int src_idx, int weight_idx, int c_dim, int stride, typename T, typename T2, - typename T3, typename T4> -struct ShiftCalHelper { - static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight, T4& temp); - static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight); -}; -template < - int src_idx, int weight_idx, int c_dim, int stride, typename T, typename T2, - typename T3, typename T4> -MEGDNN_ALWAYS_INLINE void cal_helper(T& c, T2& src, T3& weight, T4& temp) { - ShiftCalHelper::impl( - c, src, weight, temp); -} -template < - int src_idx, int weight_idx, int c_dim, int stride, typename T, typename T2, - typename T3> -MEGDNN_ALWAYS_INLINE void cal_helper(T& c, T2& src, T3& weight) { - ShiftCalHelper::impl( - c, src, weight); -}; -template < - int src_idx, int weight_idx, typename T, typename T2, typename T3, typename T4> -struct ShiftCalHelper { - static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight, T4& temp) { - c[0][0] = vdotq_s32_h( - src[(0 + src_idx) % 8], weight[0][weight_idx], c[0][0], temp[0]); - c[1][0] = vdotq_s32_h( - src[(0 + src_idx) % 8], weight[1][weight_idx], c[1][0], temp[1]); - c[0][1] = vdotq_s32_h( - src[(1 + src_idx) % 8], weight[0][weight_idx], c[0][1], temp[2]); - c[1][1] = vdotq_s32_h( - src[(1 + src_idx) % 8], weight[1][weight_idx], c[1][1], temp[3]); - c[0][2] = vdotq_s32_h( - src[(2 + src_idx) % 8], weight[0][weight_idx], c[0][2], temp[0]); - c[1][2] = vdotq_s32_h( - src[(2 + src_idx) % 8], weight[1][weight_idx], c[1][2], temp[1]); - c[0][3] = vdotq_s32_h( - src[(3 + src_idx) % 8], weight[0][weight_idx], c[0][3], temp[2]); - c[1][3] = vdotq_s32_h( - src[(3 + src_idx) % 8], weight[1][weight_idx], c[1][3], temp[3]); - - c[0][4] = vdotq_s32_h( - src[(4 + src_idx) % 8], weight[0][weight_idx], c[0][4], temp[0]); - c[1][4] = vdotq_s32_h( - src[(4 + src_idx) % 8], weight[1][weight_idx], c[1][4], temp[1]); - c[0][5] = vdotq_s32_h( - src[(5 + src_idx) % 8], weight[0][weight_idx], c[0][5], temp[2]); - c[1][5] = vdotq_s32_h( - src[(5 + src_idx) % 8], weight[1][weight_idx], c[1][5], temp[3]); - c[0][6] = vdotq_s32_h( - src[(6 + src_idx) % 8], weight[0][weight_idx], c[0][6], temp[0]); - c[1][6] = vdotq_s32_h( - src[(6 + src_idx) % 8], weight[1][weight_idx], c[1][6], temp[1]); - c[0][7] = vdotq_s32_h( - src[(7 + src_idx) % 8], weight[0][weight_idx], c[0][7], temp[2]); - c[1][7] = vdotq_s32_h( - src[(7 + src_idx) % 8], 
weight[1][weight_idx], c[1][7], temp[3]); - } - static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&); -}; -template < - int src_idx, int weight_idx, typename T, typename T2, typename T3, typename T4> -struct ShiftCalHelper { - static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight, T4& temp) { - c[0][0] = vdotq_s32_h( - src[(0 + src_idx) % 8], weight[0][weight_idx], c[0][0], temp[0]); - c[0][1] = vdotq_s32_h( - src[(1 + src_idx) % 8], weight[0][weight_idx], c[0][1], temp[1]); - c[0][2] = vdotq_s32_h( - src[(2 + src_idx) % 8], weight[0][weight_idx], c[0][2], temp[2]); - c[0][3] = vdotq_s32_h( - src[(3 + src_idx) % 8], weight[0][weight_idx], c[0][3], temp[3]); - c[0][4] = vdotq_s32_h( - src[(4 + src_idx) % 8], weight[0][weight_idx], c[0][4], temp[0]); - c[0][5] = vdotq_s32_h( - src[(5 + src_idx) % 8], weight[0][weight_idx], c[0][5], temp[1]); - c[0][6] = vdotq_s32_h( - src[(6 + src_idx) % 8], weight[0][weight_idx], c[0][6], temp[2]); - c[0][7] = vdotq_s32_h( - src[(7 + src_idx) % 8], weight[0][weight_idx], c[0][7], temp[3]); - } - static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&); -}; - -template -struct KerNeonXXs2NchwNchw44 { - static void impl( - const int8_t* src_ptr, const int8_t* weight_ptr, const int32_t* bias_ptr, - int8_t* dst_ptr, int ic, int ih, int iw, int ld_dst_oc, const Op& op) { - constexpr int stride = 1; - constexpr int filter_height = 1; - constexpr int filter_width = 4; - constexpr int oc_step = 4; - constexpr int loop_ic_step = 1; - constexpr int simd_len = 16; - constexpr int pack_iw_len = 16; - constexpr int src_reg = 8; - constexpr int weight_reg = 1; - - const int ic_stride = ih * iw * pack_iw_len; - const int ld_weight_oc = oc_step * filter_height * filter_width * ic; - constexpr int c_dim = OCHelper::val; - int32x4_t c[c_dim][8]; - init_ocx_ow8(c, bias_ptr, oc_step); - - for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { - const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride; - int8x16_t src[src_reg]; - int8x16_t dot4_weight[c_dim][weight_reg]; - int16x8_t temp_c[4]; - load_helper( - dot4_weight, weight_ptr, ld_weight_oc); - load_helper( - src, nchw_src_ptr + 0 * iw * pack_iw_len, 0); - cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); - - weight_ptr += oc_step * filter_height * filter_width; - } - - store_ocx_ow8_remain_static_dt( - c, op, dst_ptr, ld_dst_oc); - } -}; - -template -struct KerNeonXXs2NchwNchw44 { - static void impl( - const int8_t* src_ptr, const int8_t* weight_ptr, const int32_t* bias_ptr, - int8_t* dst_ptr, int ic, int ih, int iw, int ld_dst_oc, const Op& op) { - constexpr int stride = 1; - constexpr int filter_height = 2; - constexpr int filter_width = 4; - constexpr int oc_step = 4; - constexpr int loop_ic_step = 1; - constexpr int simd_len = 16; - constexpr int pack_iw_len = 16; - constexpr int src_reg = 8; - constexpr int weight_reg = 1; - - const int ic_stride = ih * iw * pack_iw_len; - const int ld_weight_oc = oc_step * filter_height * filter_width * ic; - constexpr int c_dim = OCHelper::val; - int32x4_t c[c_dim][8]; - init_ocx_ow8(c, bias_ptr, oc_step); - - for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { - const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride; - int8x16_t src[src_reg]; - int8x16_t dot4_weight[c_dim][weight_reg]; - int16x8_t temp_c[4]; - load_helper( - dot4_weight, weight_ptr, ld_weight_oc); - load_helper( - src, nchw_src_ptr + 0 * iw * pack_iw_len, 0); - cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); - - load_helper( - dot4_weight, weight_ptr + 1 * 
filter_width * oc_step, ld_weight_oc); - load_helper( - src, nchw_src_ptr + 1 * iw * pack_iw_len, 0); - cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); - - weight_ptr += oc_step * filter_height * filter_width; - } - - store_ocx_ow8_remain_static_dt( - c, op, dst_ptr, ld_dst_oc); - } -}; - -template -struct KerNeonXXs2NchwNchw44 { - static void impl( - const int8_t* src_ptr, const int8_t* weight_ptr, const int32_t* bias_ptr, - int8_t* dst_ptr, int ic, int ih, int iw, int ld_dst_oc, const Op& op) { - constexpr int stride = 1; - constexpr int filter_height = 3; - constexpr int filter_width = 4; - constexpr int oc_step = 4; - constexpr int loop_ic_step = 1; - constexpr int simd_len = 16; - constexpr int pack_iw_len = 16; - constexpr int src_reg = 8; - constexpr int weight_reg = 1; - - const int ic_stride = ih * iw * pack_iw_len; - const int ld_weight_oc = oc_step * filter_height * filter_width * ic; - constexpr int c_dim = OCHelper::val; - int32x4_t c[c_dim][8]; - init_ocx_ow8(c, bias_ptr, oc_step); - - for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { - const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride; - int8x16_t src[src_reg]; - int8x16_t dot4_weight[c_dim][weight_reg]; - int16x8_t temp_c[4]; - load_helper( - dot4_weight, weight_ptr, ld_weight_oc); - - load_helper( - src, nchw_src_ptr + 0 * iw * pack_iw_len, 0); - cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); - load_helper( - dot4_weight, weight_ptr + 1 * filter_width * oc_step, ld_weight_oc); - - load_helper( - src, nchw_src_ptr + 1 * iw * pack_iw_len, 0); - cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); - - load_helper( - dot4_weight, weight_ptr + 2 * filter_width * oc_step, ld_weight_oc); - load_helper( - src, nchw_src_ptr + 2 * iw * pack_iw_len, 0); - cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); - - weight_ptr += oc_step * filter_height * filter_width; - } - store_ocx_ow8_remain_static_dt( - c, op, dst_ptr, ld_dst_oc); - } -}; - -template -struct KerNeonXXs2NchwNchw44 { - static void impl( - const int8_t* src_ptr, const int8_t* weight_ptr, const int32_t* bias_ptr, - int8_t* dst_ptr, int ic, int ih, int iw, int ld_dst_oc, const Op& op) { - constexpr int stride = 1; - constexpr int filter_height = 5; - constexpr int filter_width = 8; - constexpr int oc_step = 4; - constexpr int loop_ic_step = 1; - constexpr int simd_len = 16; - constexpr int pack_iw_len = 16; - constexpr int src_reg = 8; - constexpr int weight_reg = 2; - - const int ic_stride = ih * iw * pack_iw_len; - const int ld_weight_oc = oc_step * filter_height * filter_width * ic; - constexpr int c_dim = OCHelper::val; - int32x4_t c[c_dim][8]; - init_ocx_ow8(c, bias_ptr, oc_step); - - for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { - const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride; - int8x16_t src[src_reg]; - int8x16_t dot4_weight[c_dim][weight_reg]; - int16x8_t temp_c[4]; -#define cb(step) \ - load_helper( \ - dot4_weight, weight_ptr + step * filter_width * oc_step, ld_weight_oc); \ - load_helper( \ - src, nchw_src_ptr + step * iw * pack_iw_len, 0); \ - cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); \ - load_helper<4, 0, simd_len, 0, Vld1q_s8>( \ - src, nchw_src_ptr + step * iw * pack_iw_len + src_reg * pack_iw_len, 0); \ - cal_helper<4, 1, c_dim, stride>(c, src, dot4_weight, temp_c); - UNROLL_CALL_RAW(5, cb); -#undef cb - weight_ptr += oc_step * filter_height * filter_width; - } - store_ocx_ow8_remain_static_dt( - c, op, dst_ptr, ld_dst_oc); - } -}; - -template 
-struct KerNeonXXs2NchwNchw44 { - static void impl( - const int8_t* src_ptr, const int8_t* weight_ptr, const int32_t* bias_ptr, - int8_t* dst_ptr, int ic, int ih, int iw, int ld_dst_oc, const Op& op) { - constexpr int stride = 1; - constexpr int filter_height = 7; - constexpr int filter_width = 8; - constexpr int oc_step = 4; - constexpr int loop_ic_step = 1; - constexpr int simd_len = 16; - constexpr int pack_iw_len = 16; - constexpr int src_reg = 8; - constexpr int weight_reg = 2; - - const int ic_stride = ih * iw * pack_iw_len; - const int ld_weight_oc = oc_step * filter_height * filter_width * ic; - constexpr int c_dim = OCHelper::val; - int32x4_t c[c_dim][8]; - init_ocx_ow8(c, bias_ptr, oc_step); - - for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { - const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride; - int8x16_t src[src_reg]; - int8x16_t dot4_weight[c_dim][weight_reg]; - int16x8_t temp_c[4]; -#define cb(step) \ - load_helper( \ - dot4_weight, weight_ptr + step * filter_width * oc_step, ld_weight_oc); \ - load_helper( \ - src, nchw_src_ptr + step * iw * pack_iw_len, 0); \ - cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); \ - load_helper<4, 0, simd_len, 0, Vld1q_s8>( \ - src, nchw_src_ptr + step * iw * pack_iw_len + src_reg * pack_iw_len, 0); \ - cal_helper<4, 1, c_dim, stride>(c, src, dot4_weight, temp_c); - - UNROLL_CALL_RAW(7, cb); -#undef cb - weight_ptr += oc_step * filter_height * filter_width; - } - store_ocx_ow8_remain_static_dt( - c, op, dst_ptr, ld_dst_oc); - } -}; -} // namespace - namespace int8_direct_nchw_nchw44 { /** * pack {oc / 4, fh, fw, ic, 4(oc)} to {oc / 4, ic, fh ,fw/4, 4(oc)*4(fw)} @@ -444,115 +117,9 @@ void pack_nchw_src_for_nchw44_conv<1>( } } -template -struct ConvDiectStrideInt8NchwNchw44 { - static void impl( - const int8_t* src, const int8_t* filter, const int32_t* bias, int32_t* temp, - int8_t* dst, const size_t oc, const size_t ic, const size_t ih, - const size_t iw, const size_t oh, const size_t ow, const Op& op) { - MEGDNN_MARK_USED_VAR(temp); - constexpr int stride = 1; - constexpr size_t fh = filter_size; - constexpr size_t fw = (filter_size + 3) / 4 * 4; - constexpr size_t ic_step = 1; - constexpr size_t big_oc_step = 8; - constexpr size_t oc_step = 4; - constexpr size_t ih_step = 1; - constexpr size_t oh_step = 1; - constexpr size_t ow_step = 8; - constexpr size_t stride_h = stride; - constexpr size_t stride_w = stride; - constexpr int pack_iw_len = 16; - - const size_t img_stride = oh * ow; - const size_t ow_end = ow / ow_step * ow_step; - const size_t ow_remain = ow - ow_end; - const size_t oc_end = oc / big_oc_step * big_oc_step; - const size_t oc_remain = oc - oc_end; - const int ld_dst_oc = oc_step * img_stride; - - using remain_fun = std::function; - remain_fun kern_big_oc_remain = nullptr; - remain_fun kern_small_oc_remain = nullptr; - switch (ow_remain) { -#define cb(step) \ - case step: \ - kern_big_oc_remain = KerNeonXXs2NchwNchw44< \ - bias_mode, Op, step, filter_size, big_oc_step, stride>::impl; \ - kern_small_oc_remain = KerNeonXXs2NchwNchw44< \ - bias_mode, Op, step, filter_size, oc_step, stride>::impl; \ - break; - - UNROLL_CALL_RAW(8, cb); - default: - megdnn_assert(0, "no remain %zu for kern", ow_remain); - } - - for (size_t oc_idx = 0; oc_idx < oc_end; oc_idx += big_oc_step) { - const size_t weight_offset = oc_idx * ic * fh * fw; - for (size_t oh_idx = 0; oh_idx < oh; oh_idx += oh_step) { - for (size_t ow_idx = 0; ow_idx < ow_end; ow_idx += ow_step) { - const size_t src_offset = - (oh_idx * stride_h 
* iw + ow_idx * stride_w * ih_step) * - ic_step * pack_iw_len; - const size_t dst_offset = - oc_idx * img_stride + (oh_idx * ow + ow_idx) * oc_step; - - KerNeonXXs2NchwNchw44< - bias_mode, Op, ow_step, filter_size, big_oc_step, stride>:: - impl(src + src_offset, filter + weight_offset, - bias + oc_idx, dst + dst_offset, ic, ih, iw, ld_dst_oc, - op); - } - if (ow_remain > 0) { - const size_t src_offset = - (oh_idx * stride_h * iw + ow_end * stride_w * ih_step) * - ic_step * pack_iw_len; - const size_t dst_offset = - oc_idx * img_stride + (oh_idx * ow + ow_end) * oc_step; - kern_big_oc_remain( - src + src_offset, filter + weight_offset, bias + oc_idx, - dst + dst_offset, ic, ih, iw, ld_dst_oc, op); - } - } - } - - if (oc_remain > 0) { - size_t oc_idx = oc_end; - const size_t weight_offset = oc_idx * ic * fh * fw; - for (size_t oh_idx = 0; oh_idx < oh; oh_idx += oh_step) { - for (size_t ow_idx = 0; ow_idx < ow_end; ow_idx += ow_step) { - const size_t src_offset = - (oh_idx * stride_h * iw + ow_idx * stride_w * ih_step) * - ic_step * pack_iw_len; - const size_t dst_offset = - oc_idx * img_stride + (oh_idx * ow + ow_idx) * oc_step; - KerNeonXXs2NchwNchw44< - bias_mode, Op, ow_step, filter_size, oc_step, stride>:: - impl(src + src_offset, filter + weight_offset, - bias + oc_idx, dst + dst_offset, ic, ih, iw, ld_dst_oc, - op); - } - if (ow_remain > 0) { - const size_t src_offset = - (oh_idx * stride_h * iw + ow_end * stride_w * ih_step) * - ic_step * pack_iw_len; - const size_t dst_offset = - oc_idx * img_stride + (oh_idx * ow + ow_end) * oc_step; - kern_small_oc_remain( - src + src_offset, filter + weight_offset, bias + oc_idx, - dst + dst_offset, ic, ih, iw, ld_dst_oc, op); - } - } - } - } -}; - #define INSTANCE_CONV_KERN_FUN(stride, filter_size, bias_mode, Op) \ - template struct ConvDiectStrideInt8NchwNchw44; + template struct megdnn::arm_common::int8_direct_nchw_nchw44:: \ + ConvDiectStrideInt8NchwNchw44; #define INSTANCE_OP_PARAM(stride, filter, bias_mode) \ INSTANCE_CONV_KERN_FUN( \ @@ -566,17 +133,10 @@ struct ConvDiectStrideInt8NchwNchw44 { INSTANCE_OP_PARAM(stride, filter, BiasMode::NO_BIAS) \ INSTANCE_OP_PARAM(stride, filter, BiasMode::BROADCAST_CHANNEL_BIAS) -#define INSTANCE_CONV_KERN(stride) \ - INSTANCE_BIAS_MODE_PARAM(stride, 1) \ - INSTANCE_BIAS_MODE_PARAM(stride, 2) \ - INSTANCE_BIAS_MODE_PARAM(stride, 3) \ - INSTANCE_BIAS_MODE_PARAM(stride, 5) \ - INSTANCE_BIAS_MODE_PARAM(stride, 7) - -INSTANCE_CONV_KERN(1); +#define INSTANCE_CONV_KERN(stride, filter) INSTANCE_BIAS_MODE_PARAM(stride, filter) } // namespace int8_direct_nchw_nchw44 } // namespace arm_common } // namespace megdnn -// vim: syntax=cpp.doxygen \ No newline at end of file +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.h b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.h new file mode 100644 index 00000000..4ae7f4f7 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.h @@ -0,0 +1,481 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
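The ShiftCalHelper blocks above route every output column through vdotq_s32_h. As a rough scalar reference — assuming each call accumulates a four-element signed int8 dot product into each of the four int32 lanes of the accumulator, which is what the surrounding register layout suggests rather than a documented contract — the per-call arithmetic can be sketched as:

#include <cassert>
#include <cstdint>

// Scalar model of one vdotq_s32_h-style accumulation, under the assumption
// that lane i gathers a dot product over four consecutive int8 pairs:
// c[i] += sum_k src[4*i+k] * weight[4*i+k].
void dot4_accumulate(const int8_t src[16], const int8_t weight[16], int32_t c[4]) {
    for (int lane = 0; lane < 4; ++lane)
        for (int k = 0; k < 4; ++k)
            c[lane] += int32_t(src[4 * lane + k]) * int32_t(weight[4 * lane + k]);
}

int main() {
    int8_t src[16], weight[16];
    int32_t c[4] = {0, 0, 0, 0};
    for (int i = 0; i < 16; ++i) {
        src[i] = int8_t(i);
        weight[i] = 1;
    }
    dot4_accumulate(src, weight, c);
    assert(c[0] == 0 + 1 + 2 + 3);      // lane 0 sums src[0..3]
    assert(c[3] == 12 + 13 + 14 + 15);  // lane 3 sums src[12..15]
    return 0;
}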
+ */ + +#include "src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_common.h" +#include "src/arm_common/conv_bias/int8/direct_nchw_nchw44_kern.h" + +namespace megdnn { +namespace arm_common { +namespace { +/** + * @brief core code for calculation patten + * + * @tparam src_idx is offset of src reg + * @tparam weight_idx is offset of weight reg + * @tparam c_dim is output channel + * @tparam Func mla operation funcion + * @tparam stride + * @tparam T outpur regs type + * @tparam T2 src regs type + * @tparam T3 weight regs type + * @tparam T4 temp regs type + */ + +template < + int src_idx, int weight_idx, int c_dim, int stride, typename T, typename T2, + typename T3, typename T4> +struct ShiftCalHelper { + static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight, T4& temp); + static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight); +}; +template < + int src_idx, int weight_idx, int c_dim, int stride, typename T, typename T2, + typename T3, typename T4> +MEGDNN_ALWAYS_INLINE void cal_helper(T& c, T2& src, T3& weight, T4& temp) { + ShiftCalHelper::impl( + c, src, weight, temp); +} +template < + int src_idx, int weight_idx, int c_dim, int stride, typename T, typename T2, + typename T3> +MEGDNN_ALWAYS_INLINE void cal_helper(T& c, T2& src, T3& weight) { + ShiftCalHelper::impl( + c, src, weight); +}; +template < + int src_idx, int weight_idx, typename T, typename T2, typename T3, typename T4> +struct ShiftCalHelper { + static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight, T4& temp) { + c[0][0] = vdotq_s32_h( + src[(0 + src_idx) % 8], weight[0][weight_idx], c[0][0], temp[0]); + c[1][0] = vdotq_s32_h( + src[(0 + src_idx) % 8], weight[1][weight_idx], c[1][0], temp[1]); + c[0][1] = vdotq_s32_h( + src[(1 + src_idx) % 8], weight[0][weight_idx], c[0][1], temp[2]); + c[1][1] = vdotq_s32_h( + src[(1 + src_idx) % 8], weight[1][weight_idx], c[1][1], temp[3]); + c[0][2] = vdotq_s32_h( + src[(2 + src_idx) % 8], weight[0][weight_idx], c[0][2], temp[0]); + c[1][2] = vdotq_s32_h( + src[(2 + src_idx) % 8], weight[1][weight_idx], c[1][2], temp[1]); + c[0][3] = vdotq_s32_h( + src[(3 + src_idx) % 8], weight[0][weight_idx], c[0][3], temp[2]); + c[1][3] = vdotq_s32_h( + src[(3 + src_idx) % 8], weight[1][weight_idx], c[1][3], temp[3]); + + c[0][4] = vdotq_s32_h( + src[(4 + src_idx) % 8], weight[0][weight_idx], c[0][4], temp[0]); + c[1][4] = vdotq_s32_h( + src[(4 + src_idx) % 8], weight[1][weight_idx], c[1][4], temp[1]); + c[0][5] = vdotq_s32_h( + src[(5 + src_idx) % 8], weight[0][weight_idx], c[0][5], temp[2]); + c[1][5] = vdotq_s32_h( + src[(5 + src_idx) % 8], weight[1][weight_idx], c[1][5], temp[3]); + c[0][6] = vdotq_s32_h( + src[(6 + src_idx) % 8], weight[0][weight_idx], c[0][6], temp[0]); + c[1][6] = vdotq_s32_h( + src[(6 + src_idx) % 8], weight[1][weight_idx], c[1][6], temp[1]); + c[0][7] = vdotq_s32_h( + src[(7 + src_idx) % 8], weight[0][weight_idx], c[0][7], temp[2]); + c[1][7] = vdotq_s32_h( + src[(7 + src_idx) % 8], weight[1][weight_idx], c[1][7], temp[3]); + } + static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&); +}; +template < + int src_idx, int weight_idx, typename T, typename T2, typename T3, typename T4> +struct ShiftCalHelper { + static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight, T4& temp) { + c[0][0] = vdotq_s32_h( + src[(0 + src_idx) % 8], weight[0][weight_idx], c[0][0], temp[0]); + c[0][1] = vdotq_s32_h( + src[(1 + src_idx) % 8], weight[0][weight_idx], c[0][1], temp[1]); + c[0][2] = vdotq_s32_h( + src[(2 + src_idx) % 8], 
weight[0][weight_idx], c[0][2], temp[2]); + c[0][3] = vdotq_s32_h( + src[(3 + src_idx) % 8], weight[0][weight_idx], c[0][3], temp[3]); + c[0][4] = vdotq_s32_h( + src[(4 + src_idx) % 8], weight[0][weight_idx], c[0][4], temp[0]); + c[0][5] = vdotq_s32_h( + src[(5 + src_idx) % 8], weight[0][weight_idx], c[0][5], temp[1]); + c[0][6] = vdotq_s32_h( + src[(6 + src_idx) % 8], weight[0][weight_idx], c[0][6], temp[2]); + c[0][7] = vdotq_s32_h( + src[(7 + src_idx) % 8], weight[0][weight_idx], c[0][7], temp[3]); + } + static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&); +}; + +template +struct KerNeonXXs2NchwNchw44 { + static void impl( + const int8_t* src_ptr, const int8_t* weight_ptr, const int32_t* bias_ptr, + int8_t* dst_ptr, int ic, int ih, int iw, int ld_dst_oc, const Op& op) { + constexpr int stride = 1; + constexpr int filter_height = 1; + constexpr int filter_width = 4; + constexpr int oc_step = 4; + constexpr int loop_ic_step = 1; + constexpr int simd_len = 16; + constexpr int pack_iw_len = 16; + constexpr int src_reg = 8; + constexpr int weight_reg = 1; + + const int ic_stride = ih * iw * pack_iw_len; + const int ld_weight_oc = oc_step * filter_height * filter_width * ic; + constexpr int c_dim = OCHelper::val; + int32x4_t c[c_dim][8]; + init_ocx_ow8(c, bias_ptr, oc_step); + + for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { + const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride; + int8x16_t src[src_reg]; + int8x16_t dot4_weight[c_dim][weight_reg]; + int16x8_t temp_c[4]; + load_helper( + dot4_weight, weight_ptr, ld_weight_oc); + load_helper( + src, nchw_src_ptr + 0 * iw * pack_iw_len, 0); + cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); + + weight_ptr += oc_step * filter_height * filter_width; + } + + store_ocx_ow8_remain_static_dt( + c, op, dst_ptr, ld_dst_oc); + } +}; + +template +struct KerNeonXXs2NchwNchw44 { + static void impl( + const int8_t* src_ptr, const int8_t* weight_ptr, const int32_t* bias_ptr, + int8_t* dst_ptr, int ic, int ih, int iw, int ld_dst_oc, const Op& op) { + constexpr int stride = 1; + constexpr int filter_height = 2; + constexpr int filter_width = 4; + constexpr int oc_step = 4; + constexpr int loop_ic_step = 1; + constexpr int simd_len = 16; + constexpr int pack_iw_len = 16; + constexpr int src_reg = 8; + constexpr int weight_reg = 1; + + const int ic_stride = ih * iw * pack_iw_len; + const int ld_weight_oc = oc_step * filter_height * filter_width * ic; + constexpr int c_dim = OCHelper::val; + int32x4_t c[c_dim][8]; + init_ocx_ow8(c, bias_ptr, oc_step); + + for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { + const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride; + int8x16_t src[src_reg]; + int8x16_t dot4_weight[c_dim][weight_reg]; + int16x8_t temp_c[4]; + load_helper( + dot4_weight, weight_ptr, ld_weight_oc); + load_helper( + src, nchw_src_ptr + 0 * iw * pack_iw_len, 0); + cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); + + load_helper( + dot4_weight, weight_ptr + 1 * filter_width * oc_step, ld_weight_oc); + load_helper( + src, nchw_src_ptr + 1 * iw * pack_iw_len, 0); + cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); + + weight_ptr += oc_step * filter_height * filter_width; + } + + store_ocx_ow8_remain_static_dt( + c, op, dst_ptr, ld_dst_oc); + } +}; + +template +struct KerNeonXXs2NchwNchw44 { + static void impl( + const int8_t* src_ptr, const int8_t* weight_ptr, const int32_t* bias_ptr, + int8_t* dst_ptr, int ic, int ih, int iw, int ld_dst_oc, const Op& op) { + constexpr int stride = 1; + 
constexpr int filter_height = 3; + constexpr int filter_width = 4; + constexpr int oc_step = 4; + constexpr int loop_ic_step = 1; + constexpr int simd_len = 16; + constexpr int pack_iw_len = 16; + constexpr int src_reg = 8; + constexpr int weight_reg = 1; + + const int ic_stride = ih * iw * pack_iw_len; + const int ld_weight_oc = oc_step * filter_height * filter_width * ic; + constexpr int c_dim = OCHelper::val; + int32x4_t c[c_dim][8]; + init_ocx_ow8(c, bias_ptr, oc_step); + + for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { + const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride; + int8x16_t src[src_reg]; + int8x16_t dot4_weight[c_dim][weight_reg]; + int16x8_t temp_c[4]; + load_helper( + dot4_weight, weight_ptr, ld_weight_oc); + + load_helper( + src, nchw_src_ptr + 0 * iw * pack_iw_len, 0); + cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); + load_helper( + dot4_weight, weight_ptr + 1 * filter_width * oc_step, ld_weight_oc); + + load_helper( + src, nchw_src_ptr + 1 * iw * pack_iw_len, 0); + cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); + + load_helper( + dot4_weight, weight_ptr + 2 * filter_width * oc_step, ld_weight_oc); + load_helper( + src, nchw_src_ptr + 2 * iw * pack_iw_len, 0); + cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); + + weight_ptr += oc_step * filter_height * filter_width; + } + store_ocx_ow8_remain_static_dt( + c, op, dst_ptr, ld_dst_oc); + } +}; + +template +struct KerNeonXXs2NchwNchw44 { + static void impl( + const int8_t* src_ptr, const int8_t* weight_ptr, const int32_t* bias_ptr, + int8_t* dst_ptr, int ic, int ih, int iw, int ld_dst_oc, const Op& op) { + constexpr int stride = 1; + constexpr int filter_height = 5; + constexpr int filter_width = 8; + constexpr int oc_step = 4; + constexpr int loop_ic_step = 1; + constexpr int simd_len = 16; + constexpr int pack_iw_len = 16; + constexpr int src_reg = 8; + constexpr int weight_reg = 2; + + const int ic_stride = ih * iw * pack_iw_len; + const int ld_weight_oc = oc_step * filter_height * filter_width * ic; + constexpr int c_dim = OCHelper::val; + int32x4_t c[c_dim][8]; + init_ocx_ow8(c, bias_ptr, oc_step); + + for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { + const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride; + int8x16_t src[src_reg]; + int8x16_t dot4_weight[c_dim][weight_reg]; + int16x8_t temp_c[4]; +#define cb(step) \ + load_helper( \ + dot4_weight, weight_ptr + step * filter_width * oc_step, ld_weight_oc); \ + load_helper( \ + src, nchw_src_ptr + step * iw * pack_iw_len, 0); \ + cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); \ + load_helper<4, 0, simd_len, 0, Vld1q_s8>( \ + src, nchw_src_ptr + step * iw * pack_iw_len + src_reg * pack_iw_len, 0); \ + cal_helper<4, 1, c_dim, stride>(c, src, dot4_weight, temp_c); + UNROLL_CALL_RAW(5, cb); +#undef cb + weight_ptr += oc_step * filter_height * filter_width; + } + store_ocx_ow8_remain_static_dt( + c, op, dst_ptr, ld_dst_oc); + } +}; + +template +struct KerNeonXXs2NchwNchw44 { + static void impl( + const int8_t* src_ptr, const int8_t* weight_ptr, const int32_t* bias_ptr, + int8_t* dst_ptr, int ic, int ih, int iw, int ld_dst_oc, const Op& op) { + constexpr int stride = 1; + constexpr int filter_height = 7; + constexpr int filter_width = 8; + constexpr int oc_step = 4; + constexpr int loop_ic_step = 1; + constexpr int simd_len = 16; + constexpr int pack_iw_len = 16; + constexpr int src_reg = 8; + constexpr int weight_reg = 2; + + const int ic_stride = ih * iw * pack_iw_len; + const int 
ld_weight_oc = oc_step * filter_height * filter_width * ic; + constexpr int c_dim = OCHelper::val; + int32x4_t c[c_dim][8]; + init_ocx_ow8(c, bias_ptr, oc_step); + + for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { + const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride; + int8x16_t src[src_reg]; + int8x16_t dot4_weight[c_dim][weight_reg]; + int16x8_t temp_c[4]; +#define cb(step) \ + load_helper( \ + dot4_weight, weight_ptr + step * filter_width * oc_step, ld_weight_oc); \ + load_helper( \ + src, nchw_src_ptr + step * iw * pack_iw_len, 0); \ + cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); \ + load_helper<4, 0, simd_len, 0, Vld1q_s8>( \ + src, nchw_src_ptr + step * iw * pack_iw_len + src_reg * pack_iw_len, 0); \ + cal_helper<4, 1, c_dim, stride>(c, src, dot4_weight, temp_c); + + UNROLL_CALL_RAW(7, cb); +#undef cb + weight_ptr += oc_step * filter_height * filter_width; + } + store_ocx_ow8_remain_static_dt( + c, op, dst_ptr, ld_dst_oc); + } +}; +} // namespace + +namespace int8_direct_nchw_nchw44 { +/** + * pack {oc / 4, fh, fw, ic, 4(oc)} to {oc / 4, ic, fh ,fw/4, 4(oc)*4(fw)} + * pack interleave two adjacent row in filter to one row + * */ +template +struct ConvDiectStrideInt8NchwNchw44 { + static void impl( + const int8_t* src, const int8_t* filter, const int32_t* bias, int32_t* temp, + int8_t* dst, const size_t oc, const size_t ic, const size_t ih, + const size_t iw, const size_t oh, const size_t ow, const Op& op) { + MEGDNN_MARK_USED_VAR(temp); + constexpr int stride = 1; + constexpr size_t fh = filter_size; + constexpr size_t fw = (filter_size + 3) / 4 * 4; + constexpr size_t ic_step = 1; + constexpr size_t big_oc_step = 8; + constexpr size_t oc_step = 4; + constexpr size_t ih_step = 1; + constexpr size_t oh_step = 1; + constexpr size_t ow_step = 8; + constexpr size_t stride_h = stride; + constexpr size_t stride_w = stride; + constexpr int pack_iw_len = 16; + + const size_t img_stride = oh * ow; + const size_t ow_end = ow / ow_step * ow_step; + const size_t ow_remain = ow - ow_end; + const size_t oc_end = oc / big_oc_step * big_oc_step; + const size_t oc_remain = oc - oc_end; + const int ld_dst_oc = oc_step * img_stride; + + using remain_fun = std::function; + remain_fun kern_big_oc_remain = nullptr; + remain_fun kern_small_oc_remain = nullptr; + switch (ow_remain) { +#define cb(step) \ + case step: \ + kern_big_oc_remain = KerNeonXXs2NchwNchw44< \ + bias_mode, Op, step, filter_size, big_oc_step, stride>::impl; \ + kern_small_oc_remain = KerNeonXXs2NchwNchw44< \ + bias_mode, Op, step, filter_size, oc_step, stride>::impl; \ + break; + + UNROLL_CALL_RAW(8, cb); + default: + megdnn_assert(0, "no remain %zu for kern", ow_remain); + } + + for (size_t oc_idx = 0; oc_idx < oc_end; oc_idx += big_oc_step) { + const size_t weight_offset = oc_idx * ic * fh * fw; + for (size_t oh_idx = 0; oh_idx < oh; oh_idx += oh_step) { + for (size_t ow_idx = 0; ow_idx < ow_end; ow_idx += ow_step) { + const size_t src_offset = + (oh_idx * stride_h * iw + ow_idx * stride_w * ih_step) * + ic_step * pack_iw_len; + const size_t dst_offset = + oc_idx * img_stride + (oh_idx * ow + ow_idx) * oc_step; + + KerNeonXXs2NchwNchw44< + bias_mode, Op, ow_step, filter_size, big_oc_step, stride>:: + impl(src + src_offset, filter + weight_offset, + bias + oc_idx, dst + dst_offset, ic, ih, iw, ld_dst_oc, + op); + } + if (ow_remain > 0) { + const size_t src_offset = + (oh_idx * stride_h * iw + ow_end * stride_w * ih_step) * + ic_step * pack_iw_len; + const size_t dst_offset = + oc_idx * img_stride + 
(oh_idx * ow + ow_end) * oc_step; + kern_big_oc_remain( + src + src_offset, filter + weight_offset, bias + oc_idx, + dst + dst_offset, ic, ih, iw, ld_dst_oc, op); + } + } + } + + if (oc_remain > 0) { + size_t oc_idx = oc_end; + const size_t weight_offset = oc_idx * ic * fh * fw; + for (size_t oh_idx = 0; oh_idx < oh; oh_idx += oh_step) { + for (size_t ow_idx = 0; ow_idx < ow_end; ow_idx += ow_step) { + const size_t src_offset = + (oh_idx * stride_h * iw + ow_idx * stride_w * ih_step) * + ic_step * pack_iw_len; + const size_t dst_offset = + oc_idx * img_stride + (oh_idx * ow + ow_idx) * oc_step; + KerNeonXXs2NchwNchw44< + bias_mode, Op, ow_step, filter_size, oc_step, stride>:: + impl(src + src_offset, filter + weight_offset, + bias + oc_idx, dst + dst_offset, ic, ih, iw, ld_dst_oc, + op); + } + if (ow_remain > 0) { + const size_t src_offset = + (oh_idx * stride_h * iw + ow_end * stride_w * ih_step) * + ic_step * pack_iw_len; + const size_t dst_offset = + oc_idx * img_stride + (oh_idx * ow + ow_end) * oc_step; + kern_small_oc_remain( + src + src_offset, filter + weight_offset, bias + oc_idx, + dst + dst_offset, ic, ih, iw, ld_dst_oc, op); + } + } + } + } +}; + +#define INSTANCE_CONV_KERN_FUN(stride, filter_size, bias_mode, Op) \ + template struct megdnn::arm_common::int8_direct_nchw_nchw44:: \ + ConvDiectStrideInt8NchwNchw44; + +#define INSTANCE_OP_PARAM(stride, filter, bias_mode) \ + INSTANCE_CONV_KERN_FUN( \ + stride, filter, bias_mode, TypeCvtOp) \ + INSTANCE_CONV_KERN_FUN( \ + stride, filter, bias_mode, ReluOp) \ + INSTANCE_CONV_KERN_FUN( \ + stride, filter, bias_mode, HSwishOp) + +#define INSTANCE_BIAS_MODE_PARAM(stride, filter) \ + INSTANCE_OP_PARAM(stride, filter, BiasMode::NO_BIAS) \ + INSTANCE_OP_PARAM(stride, filter, BiasMode::BROADCAST_CHANNEL_BIAS) + +#define INSTANCE_CONV_KERN(stride, filter) INSTANCE_BIAS_MODE_PARAM(stride, filter) + +} // namespace int8_direct_nchw_nchw44 +} // namespace arm_common +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_1x1.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_1x1.cpp new file mode 100644 index 00000000..14763c96 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_1x1.cpp @@ -0,0 +1,19 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_1x1.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */
+#include "src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.h"
+using namespace megdnn;
+using namespace arm_common;
+
+INSTANCE_CONV_KERN(1, 1);
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_2x2.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_2x2.cpp
new file mode 100644
index 00000000..10d46268
--- /dev/null
+++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_2x2.cpp
@@ -0,0 +1,19 @@
+/**
+ * \file
+ * dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_2x2.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ */
+#include "src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.h"
+using namespace megdnn;
+using namespace arm_common;
+
+INSTANCE_CONV_KERN(1, 2);
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_3x3.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_3x3.cpp
new file mode 100644
index 00000000..87553278
--- /dev/null
+++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_3x3.cpp
@@ -0,0 +1,19 @@
+/**
+ * \file
+ * dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_3x3.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ */
+#include "src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.h"
+using namespace megdnn;
+using namespace arm_common;
+
+INSTANCE_CONV_KERN(1, 3);
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_5x5.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_5x5.cpp
new file mode 100644
index 00000000..d7deb345
--- /dev/null
+++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_5x5.cpp
@@ -0,0 +1,19 @@
+/**
+ * \file
+ * dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_5x5.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ */ +#include "src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.h" +using namespace megdnn; +using namespace arm_common; + +INSTANCE_CONV_KERN(1, 5); + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_7x7.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_7x7.cpp new file mode 100644 index 00000000..37cb7679 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_7x7.cpp @@ -0,0 +1,19 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_7x7.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.h" +using namespace megdnn; +using namespace arm_common; + +INSTANCE_CONV_KERN(1, 7); + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_s1.cpp b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_2x2s1.cpp similarity index 83% rename from dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_s1.cpp rename to dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_2x2s1.cpp index 8b959f54..e6b8576d 100644 --- a/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_s1.cpp +++ b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_2x2s1.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_s1.cpp + * dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_2x2s1.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -12,8 +12,5 @@ */ #include "src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl.h" INSTANCE_CONV(2, 1); -INSTANCE_CONV(3, 1); -INSTANCE_CONV(5, 1); -INSTANCE_CONV(7, 1); -// vim: syntax=cpp.doxygen \ No newline at end of file +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_s2.cpp b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_2x2s2.cpp similarity index 83% rename from dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_s2.cpp rename to dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_2x2s2.cpp index 050a7df8..7f78d864 100644 --- a/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_s2.cpp +++ b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_2x2s2.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_s2.cpp + * dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_2x2s2.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
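Both kernel families above are reorganised so that each filter size is explicitly instantiated from its own small source file instead of one translation unit expanding every size. A minimal self-contained sketch of that pattern follows; the Kernel template and INSTANCE_KERNEL macro are hypothetical stand-ins for the real ones, and everything is collapsed into one file so it builds as-is.

// Sketch only: a heavy kernel template plus an explicit-instantiation macro.
// In the real tree each INSTANCE_KERNEL(...) line lives in its own .cpp that
// includes the shared header.
#include <cstdio>

template <int filter_size, int stride>
struct Kernel {
    // Stand-in for the real convolution body.
    static void run() { std::printf("filter=%d stride=%d\n", filter_size, stride); }
};

// Mirrors the INSTANCE_CONV_KERN(stride, filter) idea: force the compiler to
// emit code for exactly one specialization in this translation unit.
#define INSTANCE_KERNEL(stride, filter) template struct Kernel<filter, stride>;

INSTANCE_KERNEL(1, 2)  // would be the 2x2, stride-1 source file
INSTANCE_KERNEL(1, 3)  // would be the 3x3, stride-1 source file

int main() {
    Kernel<2, 1>::run();
    Kernel<3, 1>::run();
    return 0;
}

Splitting the expansions this way is the usual way to keep each object file small and let the heavy specializations compile in parallel.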
@@ -12,8 +12,5 @@ */ #include "src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl.h" INSTANCE_CONV(2, 2); -INSTANCE_CONV(3, 2); -INSTANCE_CONV(5, 2); -INSTANCE_CONV(7, 2); -// vim: syntax=cpp.doxygen \ No newline at end of file +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_3x3s1.cpp b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_3x3s1.cpp new file mode 100644 index 00000000..a970478d --- /dev/null +++ b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_3x3s1.cpp @@ -0,0 +1,16 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_3x3s1.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl.h" +INSTANCE_CONV(3, 1); + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_3x3s2.cpp b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_3x3s2.cpp new file mode 100644 index 00000000..532351d8 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_3x3s2.cpp @@ -0,0 +1,16 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_3x3s2.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl.h" +INSTANCE_CONV(3, 2); + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1.cpp b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_5x5s1.cpp similarity index 66% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1.cpp rename to dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_5x5s1.cpp index 2bd616d6..ffe5decc 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1.cpp +++ b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_5x5s1.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1.cpp + * dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_5x5s1.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -10,5 +10,7 @@ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. 
*/ -#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +#include "src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl.h" INSTANCE_CONV(5, 1); + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2.cpp b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_5x5s2.cpp similarity index 66% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2.cpp rename to dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_5x5s2.cpp index 8433d0de..7a64dbe4 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2.cpp +++ b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_5x5s2.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2.cpp + * dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_5x5s2.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -10,5 +10,7 @@ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. */ -#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +#include "src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl.h" INSTANCE_CONV(5, 2); + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1.cpp b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_7x7s1.cpp similarity index 66% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1.cpp rename to dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_7x7s1.cpp index deb839a8..154903f9 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1.cpp +++ b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_7x7s1.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1.cpp + * dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_7x7s1.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -10,5 +10,7 @@ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. 
*/ -#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +#include "src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl.h" INSTANCE_CONV(7, 1); + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2.cpp b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_7x7s2.cpp similarity index 66% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2.cpp rename to dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_7x7s2.cpp index c0a18167..83b9e21b 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2.cpp +++ b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_7x7s2.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2.cpp + * dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_7x7s2.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -10,5 +10,7 @@ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. */ -#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +#include "src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl.h" INSTANCE_CONV(7, 2); + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/dropout.cpp b/dnn/src/common/dropout.cpp new file mode 100644 index 00000000..7327ca99 --- /dev/null +++ b/dnn/src/common/dropout.cpp @@ -0,0 +1,74 @@ +/** + * \file dnn/src/common/dropout.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ + +#include +#include "megdnn/oprs.h" +#include "src/common/utils.h" + +namespace megdnn { + +void DropoutForward::deduce_layout( + const TensorLayout& inp, TensorLayout& oup, TensorLayout& mask) { + oup = inp; + size_t mask_size = get_mask_size_in_bytes(inp); + mask = TensorLayout(TensorShape({mask_size}), dtype::Byte()); +} + +void DropoutForward::check_exec( + const TensorLayout& inp, const TensorLayout& oup, const TensorLayout& mask, + size_t workspace_in_bytes) { + auto errmsg = [&]() { + return megdnn_layout_msg(inp) + ", " + megdnn_layout_msg(oup) + ", " + + megdnn_layout_msg(mask); + }; + MEGDNN_MARK_USED_VAR(errmsg); + + megdnn_assert_contiguous(inp); + megdnn_assert_contiguous(oup); + megdnn_assert_contiguous(mask); + megdnn_assert(inp.eq_layout(oup), "%s", errmsg().c_str()); + megdnn_assert(inp.dtype.category() == DTypeCategory::FLOAT); + + auto required_workspace_in_bytes = get_workspace_in_bytes(inp, oup, mask); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); + auto required_mask_size_in_bytes = get_mask_size_in_bytes(inp); + megdnn_assert(mask.total_nr_elems() >= required_mask_size_in_bytes); + megdnn_assert(mask.dtype == dtype::Byte()); +} + +void DropoutBackward::deduce_layout( + const TensorLayout& doup, const TensorLayout&, TensorLayout& dinp) { + dinp = doup; +} + +void DropoutBackward::check_exec( + const TensorLayout& doup, const TensorLayout& mask, const TensorLayout& dinp, + size_t workspace_in_bytes) { + auto errmsg = [&]() { + return megdnn_layout_msg(doup) + ", " + megdnn_layout_msg(mask) + ", " + + megdnn_layout_msg(dinp); + }; + MEGDNN_MARK_USED_VAR(errmsg); + + megdnn_assert_contiguous(doup); + megdnn_assert_contiguous(mask); + megdnn_assert_contiguous(dinp); + megdnn_assert(doup.eq_layout(dinp), "%s", errmsg().c_str()); + + auto required_workspace_in_bytes = get_workspace_in_bytes(doup, mask, dinp); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); + megdnn_assert(doup.dtype.category() == DTypeCategory::FLOAT); + megdnn_assert(mask.dtype == dtype::Byte()); + megdnn_assert(mask.ndim == 1); +} + +} // namespace megdnn diff --git a/dnn/src/common/handle_impl.h b/dnn/src/common/handle_impl.h index 7c3e01a1..ff030f25 100644 --- a/dnn/src/common/handle_impl.h +++ b/dnn/src/common/handle_impl.h @@ -209,7 +209,11 @@ private: cb(LSQBackward) \ cb(Fill) \ cb(PaddingForward) \ - cb(PaddingBackward) + cb(PaddingBackward) \ + cb(LayerNormForward) \ + cb(LayerNormBackward) \ + cb(DropoutForward) \ + cb(DropoutBackward) // clang-format on /*! diff --git a/dnn/src/common/layer_norm.cpp b/dnn/src/common/layer_norm.cpp new file mode 100644 index 00000000..44bb16e1 --- /dev/null +++ b/dnn/src/common/layer_norm.cpp @@ -0,0 +1,180 @@ +/** + * \file dnn/src/common/layer_norm.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
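DropoutForward::deduce_layout above keeps the output layout identical to the input and shapes the mask as a flat byte tensor whose length the backend reports through get_mask_size_in_bytes. A dependency-free sketch of that contract — the Layout type and the one-byte-per-element sizing are illustrative assumptions, not the megdnn types:

#include <cassert>
#include <cstddef>
#include <vector>

// Toy stand-in for TensorLayout: shape only; dtype is implied by context.
struct Layout {
    std::vector<std::size_t> shape;
    std::size_t total_elems() const {
        std::size_t n = 1;
        for (std::size_t s : shape)
            n *= s;
        return n;
    }
};

// Assumption for the sketch: one mask byte per input element. In the real
// operator the backend decides this via get_mask_size_in_bytes().
std::size_t mask_size_in_bytes(const Layout& inp) {
    return inp.total_elems();
}

void deduce_dropout_layout(const Layout& inp, Layout& oup, Layout& mask) {
    oup = inp;                                 // output mirrors the input layout
    mask = Layout{{mask_size_in_bytes(inp)}};  // flat 1-D byte mask
}

int main() {
    Layout inp{{32, 128}}, oup, mask;
    deduce_dropout_layout(inp, oup, mask);
    assert(oup.shape == inp.shape);
    assert(mask.shape.size() == 1 && mask.shape[0] == 32u * 128u);
    return 0;
}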
+ */ +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +namespace megdnn { + +void LayerNormBase::deduce_layout_fwd( + const TensorLayout& data, const TensorLayout& weight, const TensorLayout& bias, + TensorLayout& dst, TensorLayout& mean, TensorLayout& rstd) { + MEGDNN_MARK_USED_VAR(weight); + MEGDNN_MARK_USED_VAR(bias); + auto p = param(); + TensorShape unnormalized_shape; + unnormalized_shape.ndim = data.ndim - p.normalized_dim; + for (size_t i = 0; i < unnormalized_shape.ndim; ++i) { + unnormalized_shape.shape[i] = data.shape[i]; + } + TensorLayout unnormalized_layout = + TensorLayout(unnormalized_shape, dtype::Float32()); + dst = data; + mean = unnormalized_layout; + rstd = unnormalized_layout; +} + +void LayerNormBase::check_layout_fwd( + const TensorLayout& data, const TensorLayout& weight, const TensorLayout& bias, + const TensorLayout& dst, const TensorLayout& mean, const TensorLayout& rstd) { + megdnn_assert_contiguous(data); + megdnn_assert_contiguous(weight); + megdnn_assert_contiguous(bias); + megdnn_assert_contiguous(dst); + megdnn_assert_contiguous(mean); + megdnn_assert_contiguous(rstd); + auto errmsg = [&]() { + return megdnn_layout_msg(data) + ", " + megdnn_layout_msg(weight) + ", " + + megdnn_layout_msg(bias) + ", " + megdnn_layout_msg(dst) + ", " + + megdnn_layout_msg(mean) + ", " + megdnn_layout_msg(rstd); + }; + MEGDNN_MARK_USED_VAR(errmsg); + + auto equal_layout = [](const TensorLayout& lhs, const TensorLayout& rhs) -> bool { + if (!(lhs.ndim == rhs.ndim && lhs.dtype == rhs.dtype && + lhs.format == rhs.format)) + return false; + for (size_t i = 0; i < lhs.ndim; ++i) { + if (lhs.shape[i] != rhs.shape[i] || lhs.stride[i] != rhs.stride[i]) { + return false; + } + } + return true; + }; + + megdnn_assert(equal_layout(data, dst), "%s", errmsg().c_str()); + megdnn_assert(equal_layout(weight, bias), "%s", errmsg().c_str()); + megdnn_assert(equal_layout(mean, rstd), "%s", errmsg().c_str()); + + auto p = param(); + uint64_t normalized_dim = p.normalized_dim; + size_t unnormalized_dim = data.ndim - normalized_dim; + megdnn_assert( + normalized_dim < data.ndim, + "the dims of normalized shape should smaller than input dims"); + + for (size_t i = 0; i < unnormalized_dim; ++i) { + megdnn_assert(data.shape[i] == mean.shape[i], "%s", errmsg().c_str()); + } + if (p.affine) { + for (size_t i = 0; i < normalized_dim; ++i) { + megdnn_assert( + data.shape[unnormalized_dim + i] == weight.shape[i], "%s", + errmsg().c_str()); + } + } +} + +void LayerNormForward::deduce_layout( + const TensorLayout& data, const TensorLayout& weight, const TensorLayout& bias, + TensorLayout& dst, TensorLayout& mean, TensorLayout& rstd) { + deduce_layout_fwd(data, weight, bias, dst, mean, rstd); +} + +void LayerNormForward::check_exec( + const TensorLayout& data, const TensorLayout& weight, const TensorLayout& bias, + const TensorLayout& dst, const TensorLayout& mean, const TensorLayout& rstd, + size_t workspace_in_bytes) { + check_layout_fwd(data, weight, bias, dst, mean, rstd); + auto required_workspace_in_bytes = + get_workspace_in_bytes(data, weight, bias, dst, mean, rstd); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +void LayerNormBackward::deduce_layout( + const TensorLayout& diff, const TensorLayout& data, const TensorLayout& weight, + const TensorLayout& mean, const TensorLayout& rstd, TensorLayout& ddata, + TensorLayout& dweight, TensorLayout& dbias) { + MEGDNN_MARK_USED_VAR(diff); + MEGDNN_MARK_USED_VAR(mean); + MEGDNN_MARK_USED_VAR(rstd); + ddata = data; + 
dweight = weight; + dbias = weight; +} + +void LayerNormBackward::check_exec( + const TensorLayout& diff, const TensorLayout& data, const TensorLayout& weight, + const TensorLayout& mean, const TensorLayout& rstd, const TensorLayout& ddata, + const TensorLayout& dweight, const TensorLayout& dbias, + size_t workspace_in_bytes) { + auto p = param(); + auto required_workspace_in_bytes = get_workspace_in_bytes( + diff, data, weight, mean, rstd, ddata, dweight, dbias); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); + + megdnn_assert_contiguous(diff); + megdnn_assert_contiguous(data); + megdnn_assert_contiguous(mean); + megdnn_assert_contiguous(rstd); + megdnn_assert_contiguous(ddata); + if (p.affine) { + megdnn_assert_contiguous(weight); + megdnn_assert_contiguous(dweight); + megdnn_assert_contiguous(dbias); + } + + auto errmsg = [&]() { + return megdnn_layout_msg(diff) + ", " + megdnn_layout_msg(data) + ", " + + megdnn_layout_msg(weight) + ", " + megdnn_layout_msg(mean) + ", " + + megdnn_layout_msg(rstd) + ", " + megdnn_layout_msg(ddata) + ", " + + megdnn_layout_msg(dweight) + ", " + megdnn_layout_msg(dbias); + }; + MEGDNN_MARK_USED_VAR(errmsg); + + auto equal_layout = [](const TensorLayout& lhs, const TensorLayout& rhs) -> bool { + if (!(lhs.ndim == rhs.ndim && lhs.dtype == rhs.dtype && + lhs.format == rhs.format)) + return false; + for (size_t i = 0; i < lhs.ndim; ++i) { + if (lhs.shape[i] != rhs.shape[i] || lhs.stride[i] != rhs.stride[i]) { + return false; + } + } + return true; + }; + + megdnn_assert(equal_layout(data, ddata), "%s", errmsg().c_str()); + megdnn_assert(equal_layout(mean, rstd), "%s", errmsg().c_str()); + if (p.affine) { + megdnn_assert(equal_layout(weight, dweight), "%s", errmsg().c_str()); + megdnn_assert(equal_layout(weight, dbias), "%s", errmsg().c_str()); + } + + size_t normalized_dim = p.normalized_dim; + size_t unnormalized_dim = data.ndim - normalized_dim; + + for (size_t i = 0; i < unnormalized_dim; ++i) { + megdnn_assert(data.shape[i] == mean.shape[i], "%s", errmsg().c_str()); + } + if (p.affine) { + for (size_t i = 0; i < normalized_dim; ++i) { + megdnn_assert( + data.shape[unnormalized_dim + i] == weight.shape[i], "%s", + errmsg().c_str()); + } + } +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/opr_trait.h b/dnn/src/common/opr_trait.h index 8999b736..8b6145a4 100644 --- a/dnn/src/common/opr_trait.h +++ b/dnn/src/common/opr_trait.h @@ -135,6 +135,10 @@ DEF(CheckNonFinite, 2, true, true); DEF(LSQForward, 5, true, true); DEF(LSQBackward, 7, true, false); DEF(Fill, 1, true, false); +DEF(LayerNormForward, 6, true, true); +DEF(LayerNormBackward, 8, true, true); +DEF(DropoutForward, 3, true, true); +DEF(DropoutBackward, 3, true, true); } // namespace megdnn // vim: syntax=cpp.doxygen diff --git a/dnn/src/common/pooling.cpp b/dnn/src/common/pooling.cpp index 5d99b4d8..b2fed318 100644 --- a/dnn/src/common/pooling.cpp +++ b/dnn/src/common/pooling.cpp @@ -93,7 +93,7 @@ void PoolingBase::deduce_layout_fwd(const TensorLayout& src, TensorLayout& dst) size_t ph = this->param().pad_h; size_t pw = this->param().pad_w; if (ph >= fh || pw >= fw) { - megdnn_log_error( + megdnn_log_warn( "pooling padding size (%zu %zu) should not be bigger than " "window size (%zu %zu), it only can be used in CaffePooling", pw, ph, fw, fh); diff --git a/dnn/src/cuda/dropout/opr_impl.cpp b/dnn/src/cuda/dropout/opr_impl.cpp new file mode 100644 index 00000000..d7349114 --- /dev/null +++ b/dnn/src/cuda/dropout/opr_impl.cpp @@ -0,0 +1,118 @@ +/** 
+ * \file dnn/src/cuda/dropout/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#include "src/cuda/dropout/opr_impl.h" + +namespace megdnn { +namespace cuda { + +using Param = megdnn::Dropout::Param; + +struct DropoutTensorDesc : public TensorDesc { +public: + DropoutTensorDesc(const TensorLayout& layout) : TensorDesc() { + set_dropout_desc(layout); + } + void set_dropout_desc(const TensorLayout& layout) { + cudnnDataType_t cudnn_dtype; + switch (layout.dtype.enumv()) { + case DTypeEnum::Float32: + cudnn_dtype = CUDNN_DATA_FLOAT; + break; + case DTypeEnum::Float16: + cudnn_dtype = CUDNN_DATA_HALF; + break; + default: + megdnn_throw("dtype must be float16/float32"); + } + cudnn_check(cudnnSetTensor4dDescriptor( + desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, 1, 1, + layout.total_nr_elems())); + } +}; + +size_t DropoutForwardImpl::get_mask_size_in_bytes(const TensorLayout& inp) { + size_t reserve_space_size_in_bytes = 0; + DropoutTensorDesc ddesc(inp); + cudnn_check( + cudnnDropoutGetReserveSpaceSize(ddesc.desc, &reserve_space_size_in_bytes)); + return reserve_space_size_in_bytes; +} + +void DropoutForwardImpl::exec( + _megdnn_tensor_in inp, _megdnn_tensor_out oup, _megdnn_tensor_out mask, + _megdnn_workspace workspace) { + check_exec(inp.layout, oup.layout, mask.layout, workspace.size); + uint64_t seed = param().seed; + float drop_prob = param().drop_prob; + + if (!dropout_status.initialized()) { + dropout_status.set(cudnn_handle(this->handle()), seed, drop_prob); + } + if (dropout_status.drop_prob != drop_prob) { + dropout_status.drop_prob = drop_prob; + dropout_status.restore_desc(cudnn_handle(this->handle())); + } + megdnn_assert(dropout_status.seed == seed); + + DropoutTensorDesc inp_desc(inp.layout), oup_desc(oup.layout); + auto&& op_desc = dropout_status.desc; + + cudnn_check(cudnnDropoutForward( + cudnn_handle(this->handle()), op_desc.desc, inp_desc.desc, inp.raw_ptr(), + oup_desc.desc, oup.raw_ptr(), mask.raw_ptr(), + mask.layout.total_nr_elems())); +} + +void DropoutBackwardImpl::exec( + _megdnn_tensor_in doup, _megdnn_tensor_in mask, _megdnn_tensor_out dinp, + _megdnn_workspace workspace) { + check_exec(doup.layout, mask.layout, dinp.layout, workspace.size); + +#if CUDNN_VERSION >= 7000 + size_t status_size_in_bytes = 0; + cudnn_check(cudnnDropoutGetStatesSize( + cudnn_handle(this->handle()), &status_size_in_bytes)); + + DropoutTensorDesc doup_desc(doup.layout), dinp_desc(dinp.layout); + op_desc.restore( + cudnn_handle(this->handle()), param().drop_prob, nullptr, + status_size_in_bytes, 0); + cudnn_check(cudnnDropoutBackward( + cudnn_handle(this->handle()), op_desc.desc, doup_desc.desc, doup.raw_ptr(), + dinp_desc.desc, dinp.raw_ptr(), mask.raw_ptr(), + mask.layout.total_nr_elems())); +#else + uint64_t seed = param().seed; + float drop_prob = param().drop_prob; + + if (!dropout_status.initialized()) { + dropout_status.set(cudnn_handle(this->handle()), seed, drop_prob); + } + if (dropout_status.drop_prob != drop_prob) { + dropout_status.drop_prob = drop_prob; + dropout_status.restore_desc(cudnn_handle(this->handle())); + } + + auto&& op_desc = dropout_status.desc; + DropoutTensorDesc doup_desc(doup.layout), dinp_desc(dinp.layout); + + 
cudnn_check(cudnnDropoutBackward( + cudnn_handle(this->handle()), op_desc.desc, doup_desc.desc, doup.raw_ptr(), + dinp_desc.desc, dinp.raw_ptr(), mask.raw_ptr(), + mask.layout.total_nr_elems())); +#endif +} + +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/dropout/opr_impl.h b/dnn/src/cuda/dropout/opr_impl.h new file mode 100644 index 00000000..4db5df47 --- /dev/null +++ b/dnn/src/cuda/dropout/opr_impl.h @@ -0,0 +1,116 @@ +/** + * \file dnn/src/cuda/dropout/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#pragma once +#include "megdnn/oprs.h" +#include "src/cuda/cudnn_wrapper.h" +#include "src/cuda/utils.h" + +namespace megdnn { +namespace cuda { + +class DropoutDesc { +public: + DropoutDesc() { cudnn_check(cudnnCreateDropoutDescriptor(&desc)); } + ~DropoutDesc() { cudnn_check(cudnnDestroyDropoutDescriptor(desc)); } + void set( + cudnnHandle_t handle, void* status, size_t states_size_in_bytes, + uint64_t seed, float drop_prob) { + cudnn_check(cudnnSetDropoutDescriptor( + desc, handle, drop_prob, status, states_size_in_bytes, seed)); + } + void restore( + cudnnHandle_t handle, float drop_prob, void* status, + size_t states_size_in_bytes, uint64_t seed) { +#if CUDNN_VERSION >= 7000 + cudnn_check(cudnnRestoreDropoutDescriptor( + desc, handle, drop_prob, status, states_size_in_bytes, 0)); +#else + // cudnnDropoutRestore is not support when cudnn version < 7000 + // so we set the dropoutDesc rather than restore + cudnn_check(cudnnSetDropoutDescriptor( + desc, handle, drop_prob, status, states_size_in_bytes, seed)); +#endif + } + cudnnDropoutDescriptor_t desc; +}; + +class DropoutStatus { + void* status; + uint64_t status_size; + uint64_t seed; + float drop_prob; + DropoutDesc desc; + +public: + DropoutStatus() { + status = nullptr; + status_size = 0; + } + ~DropoutStatus() { + if (status != nullptr) + cuda_check(cudaFree(status)); + } + void set(cudnnHandle_t handle, uint64_t seed, float drop_prob) { + this->seed = seed; + this->drop_prob = drop_prob; + cudnn_check(cudnnDropoutGetStatesSize(handle, &status_size)); + cuda_check(cudaMalloc(&status, status_size)); + desc.set(handle, status, status_size, seed, drop_prob); + } + void restore_desc(cudnnHandle_t handle) { + desc.restore(handle, drop_prob, status, status_size, seed); + } + bool initialized() { return status != nullptr; } + friend class DropoutForwardImpl; + friend class DropoutBackwardImpl; +}; + +// similar to RNG operator, dropout operator also have status +class DropoutForwardImpl final : public DropoutForward { + DropoutStatus dropout_status; + +public: + using DropoutForward::DropoutForward; + void exec( + _megdnn_tensor_in inp, _megdnn_tensor_out oup, _megdnn_tensor_out mask, + _megdnn_workspace workspace) override; + size_t get_mask_size_in_bytes(const TensorLayout& inp) override; + size_t get_workspace_in_bytes( + const TensorLayout&, const TensorLayout&, const TensorLayout&) override { + return 0; + } +}; + +class DropoutBackwardImpl final : public DropoutBackward { +#if CUDNN_VERSION >= 7000 + DropoutDesc op_desc; +#else + // cudnnDropoutRestore is not support when cudnn version < 7000 + // so we need save the dropout status and set the 
dropoutDesc + // rather than restore it + DropoutStatus dropout_status; +#endif + +public: + using DropoutBackward::DropoutBackward; + void exec( + _megdnn_tensor_in doup, _megdnn_tensor_in mask, _megdnn_tensor_out dinp, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes( + const TensorLayout&, const TensorLayout&, const TensorLayout&) override { + return 0; + } +}; +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/handle.cpp b/dnn/src/cuda/handle.cpp index 39bd56b3..f00006ea 100644 --- a/dnn/src/cuda/handle.cpp +++ b/dnn/src/cuda/handle.cpp @@ -52,7 +52,9 @@ HandleImpl::HandleImpl(megcoreComputingHandle_t comp_handle) // Get stream from MegCore computing handle. megdnn_assert( CUDNN_VERSION == cudnnGetVersion(), - "cudnn version mismatch: compiled with %d; detected %zu at runtime", + "cudnn version mismatch: compiled with %d; detected %zu at runtime; this " + "may be caused by a customized environment, for example LD_LIBRARY_PATH " + "on Linux or PATH on Windows!", CUDNN_VERSION, cudnnGetVersion()); #if CUDA_VERSION >= 10010 megdnn_assert( diff --git a/dnn/src/cuda/handle_create.cpp b/dnn/src/cuda/handle_create.cpp index 03858f5f..df3c5ccb 100644 --- a/dnn/src/cuda/handle_create.cpp +++ b/dnn/src/cuda/handle_create.cpp @@ -34,6 +34,7 @@ #include "src/cuda/deformable_conv/opr_impl.h" #include "src/cuda/deformable_ps_roi_pooling/opr_impl.h" #include "src/cuda/dot/opr_impl.h" +#include "src/cuda/dropout/opr_impl.h" #include "src/cuda/elemwise/opr_impl.h" #include "src/cuda/elemwise_multi_type/opr_impl.h" #include "src/cuda/eye/opr_impl.h" @@ -45,6 +46,7 @@ #include "src/cuda/images2neibs/opr_impl.h" #include "src/cuda/indexing_multi_axis_vec/opr_impl.h" #include "src/cuda/indexing_one_hot/opr_impl.h" +#include "src/cuda/layer_norm/opr_impl.h" #include "src/cuda/linspace/opr_impl.h" #include "src/cuda/local/opr_impl.h" #include "src/cuda/local_share/opr_impl.h" diff --git a/dnn/src/cuda/layer_norm/layer_norm_cuda.cu b/dnn/src/cuda/layer_norm/layer_norm_cuda.cu new file mode 100644 index 00000000..2cca694a --- /dev/null +++ b/dnn/src/cuda/layer_norm/layer_norm_cuda.cu @@ -0,0 +1,664 @@ +/** + * \file dnn/src/cuda/layer_norm/layer_norm_cuda.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied.
+ */ +#include +#include +#include +#include "megdnn/arch.h" +#include "megdnn/dtype.h" +#include "src/cuda/cuda_shfl_compat.cuh" +#include "src/cuda/layer_norm/layer_norm_cuda.cuh" +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace layer_norm { + +constexpr int kCUDANumThreads = 256; +constexpr int vec_size = 4; + +// warp size may be used as array length, or used in host function, +// so we define WARP_SIZE rather than using warpSize +#define WARP_SIZE 32 + +#if defined(__clang__) +#define __ubsan_ignore_float_divide_by_zero__ \ + __attribute__((no_sanitize("float-divide-by-zero"))) +#else +#define __ubsan_ignore_float_divide_by_zero__ +#endif + +struct WelfordStat { + float mean; + float sigma2; + float count; + MEGDNN_HOST MEGDNN_DEVICE WelfordStat() : mean(0.f), sigma2(0.f), count(0.f) {} + MEGDNN_HOST MEGDNN_DEVICE WelfordStat(float mean, float sigma2, float count) + : mean(mean), sigma2(sigma2), count(count) {} +}; + +template +struct WelfordData { + T mean; + T sigma2; + combine_t count; + + MEGDNN_HOST MEGDNN_DEVICE WelfordData() : mean(0), sigma2(0), count(0) {} + + MEGDNN_HOST MEGDNN_DEVICE WelfordData(T mean, T sigma2, combine_t count) + : mean(mean), sigma2(sigma2), count(count) {} +}; + +template +struct WelfordOps { +public: + using WelfordData_T = WelfordData; + inline MEGDNN_DEVICE WelfordData_T reduce(WelfordData_T acc, T data) const { + T delta = data - acc.mean; + T new_mean = static_cast(acc.mean + delta / (acc.count + 1)); + T new_delta = static_cast(data - new_mean); + return { + new_mean, + acc.sigma2 + delta * new_delta, + combine_t(acc.count + 1), + }; + } + inline MEGDNN_DEVICE WelfordData_T + combine(WelfordData_T lhs, WelfordData_T rhs) const { + if (lhs.count != 0 && rhs.count != 0) { + T delta = rhs.mean - lhs.mean; + combine_t new_count = lhs.count + rhs.count; + T nb_over_n = rhs.count / new_count; + return {lhs.mean + delta * nb_over_n, + lhs.sigma2 + rhs.sigma2 + delta * delta * lhs.count * nb_over_n, + new_count}; + } else { + return (lhs.count != 0) ? 
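+        // The branch above is Chan et al.'s parallel Welford merge: with
+        //   delta = mean_B - mean_A  and  n = n_A + n_B,
+        //   mean = mean_A + delta * n_B / n
+        //   M2   = M2_A + M2_B + delta^2 * n_A * n_B / n
+        // e.g. merging (mean=1, M2=0, n=2) with (mean=3, M2=0, n=2) gives
+        // (mean=2, M2=4), i.e. variance 1 over the four samples.
+        // When either side is empty, the other partial result is returned: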
lhs : rhs; + } + } + inline MEGDNN_DEVICE res_t + project(WelfordData_T acc) const __ubsan_ignore_float_divide_by_zero__ { + const auto mean = static_cast(acc.mean); + const combine_t divisor = static_cast(acc.count); + const auto var = acc.sigma2 / divisor; + res_t results(var, mean); + return results; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline MEGDNN_DEVICE WelfordData_T + warp_shfl_down(WelfordData_T acc, int offset) const { + return {__shfl_down(acc.mean, offset, warpSize), + __shfl_down(acc.sigma2, offset, warpSize), + __shfl_down(acc.count, offset, warpSize)}; + } +#endif + MEGDNN_HOST MEGDNN_DEVICE WelfordOps() {} +}; + +template +struct alignas(sizeof(T) * vec_size) aligned_vector { + T val[vec_size]; +}; + +template +using acc_type = T; + +template +MEGDNN_DEVICE WelfordStat +update_welford_stat_online(const U val, const WelfordStat& curr_sum) { + U delta = static_cast(val - curr_sum.mean); + U new_count = static_cast(curr_sum.count + 1.f); + U new_mean = static_cast(curr_sum.mean + delta * (1.f / new_count)); + return {new_mean, curr_sum.sigma2 + delta * (val - new_mean), new_count}; +} + +MEGDNN_DEVICE WelfordStat +combine_welford_stat(const WelfordStat lhs, const WelfordStat rhs) { + using U = decltype(lhs.count); + U delta = lhs.mean - rhs.mean; + U count = rhs.count + lhs.count; + U mean, sigma2; + if (count > decltype(lhs.count){0}) { + auto coef = 1.f / count; + auto nA = rhs.count * coef; + auto nB = lhs.count * coef; + mean = nA * rhs.mean + nB * lhs.mean; + sigma2 = rhs.sigma2 + lhs.sigma2 + delta * delta * rhs.count * nB; + } else { + mean = U(0); + sigma2 = U(0); + } + return {mean, sigma2, count}; +} + +template +MEGDNN_DEVICE WelfordStat +compute_stats(const T* __restrict__ X, const int slice_len, float* buf) { + using vec_t = aligned_vector; + using acc_t = acc_type; + const vec_t* X_vec = reinterpret_cast(X); + const int numx = blockDim.x * blockDim.y; + const int thrx = threadIdx.x + threadIdx.y * blockDim.x; + const int n_vec_to_read = slice_len / vec_size; + WelfordStat w_stat(0.f, 0.f, 0.f); + for (int i = thrx; i < n_vec_to_read; i += numx) { + vec_t data = X_vec[i]; +#pragma unroll + for (int ii = 0; ii < vec_size; ii++) { + w_stat = update_welford_stat_online( + static_cast(data.val[ii]), w_stat); + } + } + // intra-warp reduction +#pragma unroll + for (int offset = (warpSize >> 1); offset > 0; offset >>= 1) { + WelfordStat w_tmp{ + __shfl_down(w_stat.mean, offset, warpSize), + __shfl_down(w_stat.sigma2, offset, warpSize), + __shfl_down(w_stat.count, offset, warpSize)}; + w_stat = combine_welford_stat(w_stat, w_tmp); + } + + // threadIdx.x == 0 has correct values for each warp + // inter-warp reductions + if (blockDim.y > 1) { + float* mean_sigma_buf = buf; + float* count_buf = buf + blockDim.y; + for (int offset = blockDim.y / 2; offset > 0; offset /= 2) { + // upper half of warps write to shared + if (threadIdx.x == 0 && threadIdx.y >= offset && threadIdx.y < 2 * offset) { + const int wrt_y = threadIdx.y - offset; + mean_sigma_buf[2 * wrt_y] = w_stat.mean; + mean_sigma_buf[2 * wrt_y + 1] = w_stat.sigma2; + count_buf[wrt_y] = w_stat.count; + } + __syncthreads(); + + // lower half merges + if (threadIdx.x == 0 && threadIdx.y < offset) { + WelfordStat w_tmp{ + mean_sigma_buf[2 * threadIdx.y], + mean_sigma_buf[2 * threadIdx.y + 1], count_buf[threadIdx.y]}; + w_stat = combine_welford_stat(w_stat, w_tmp); + } + __syncthreads(); + } + if (threadIdx.x == 0 && threadIdx.y == 0) { + mean_sigma_buf[0] = w_stat.mean; + mean_sigma_buf[1] = 
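+            // Store M2 normalized by the slice length, i.e. the (biased)
+            // variance, so callers can form rstd = rsqrt(variance + eps)
+            // without another pass over the data: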
w_stat.sigma2 / float(slice_len); + } + __syncthreads(); + return WelfordStat{mean_sigma_buf[0], mean_sigma_buf[1], 0.f}; + + } else { + return WelfordStat{ + __shfl(w_stat.mean, 0, warpSize), + __shfl(w_stat.sigma2, 0, warpSize) / float(slice_len), 0.f}; + } +} + +template +__global__ void vectorized_layer_norm_forward_affine_kernel( + const int slice_len, T_ACC eps, const T* __restrict__ X, const T* weight, + const T* bias, T_ACC* mean, T_ACC* rstd, T* Y) { + // if we made smem WelfordStat type, there would be bank conflicts, + // as one thread would have to write 3 consecutive floats + extern __shared__ float s_data[]; + + auto slice_id = blockIdx.x; + const T* slice = X + slice_id * slice_len; + WelfordStat slice_w_stat = compute_stats(slice, slice_len, s_data); + using vec_t = aligned_vector; + const vec_t* X_vec = reinterpret_cast(slice); + vec_t* Y_vec = reinterpret_cast(Y + slice_id * slice_len); + const int numx = blockDim.x * blockDim.y; + const int thrx = threadIdx.x + threadIdx.y * blockDim.x; + const int n_vec_to_read = slice_len / vec_size; + T_ACC rstd_val = static_cast(rsqrt(slice_w_stat.sigma2 + eps)); + + for (int i = thrx; i < n_vec_to_read; i += numx) { + vec_t data = X_vec[i]; + vec_t out; + // computation is performed in T_ACC, X is cast to T_ACC and result is + // implicitly cast to T + +#pragma unroll + for (int ii = 0; ii < vec_size; ii++) { + out.val[ii] = static_cast(weight[i * vec_size + ii]) * + (rstd_val * (static_cast(data.val[ii]) - + slice_w_stat.mean)) + + static_cast(bias[i * vec_size + ii]); + } + Y_vec[i] = out; + } + if (thrx == 0) { + mean[slice_id] = slice_w_stat.mean; + rstd[slice_id] = rstd_val; + } +} + +template +__global__ void vectorized_layer_norm_forward_kernel( + const int slice_len, T_ACC eps, const T* __restrict__ X, const T* weight, + const T* bias, T_ACC* mean, T_ACC* rstd, T* Y) { + extern __shared__ float s_data[]; + + auto slice_id = blockIdx.x; + const T* slice = X + slice_id * slice_len; + WelfordStat slice_w_stat = compute_stats(slice, slice_len, s_data); + using vec_t = aligned_vector; + const vec_t* X_vec = reinterpret_cast(slice); + vec_t* Y_vec = reinterpret_cast(Y + slice_id * slice_len); + const int numx = blockDim.x * blockDim.y; + const int thrx = threadIdx.x + threadIdx.y * blockDim.x; + const int n_vec_to_read = slice_len / vec_size; + T_ACC rstd_val = static_cast(rsqrt(slice_w_stat.sigma2 + eps)); + + for (int i = thrx; i < n_vec_to_read; i += numx) { + vec_t data = X_vec[i]; + vec_t out; + +#pragma unroll + for (int ii = 0; ii < vec_size; ii++) { + out.val[ii] = + rstd_val * (static_cast(data.val[ii]) - slice_w_stat.mean); + } + Y_vec[i] = out; + } + if (thrx == 0) { + mean[slice_id] = slice_w_stat.mean; + rstd[slice_id] = rstd_val; + } +} + +template +void launch_vectorized_layer_norm_forward_kernel( + int64_t slice_len, int64_t slice_num, T_ACC eps, const T* X_data, + const T* weight_data, const T* bias_data, T* Y_data, T_ACC* mean_data, + T_ACC* rstd_data, cudaStream_t stream) { + const int num_threads = 128; + const dim3 threads(WARP_SIZE, num_threads / WARP_SIZE, 1); + const dim3 blocks(slice_num); + int nshared = threads.y > 1 ? 
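+    // Shared memory for compute_stats' inter-warp reduction: blockDim.y floats
+    // for the interleaved (mean, sigma2) pairs plus blockDim.y / 2 floats for
+    // the counts written by the upper warps, i.e. 3/2 * threads.y values: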
threads.y * 3 / 2 * sizeof(T_ACC) : 0; + + if (weight_data == nullptr && bias_data == nullptr) { + vectorized_layer_norm_forward_kernel<<>>( + slice_len, eps, X_data, weight_data, bias_data, mean_data, rstd_data, + Y_data); + } else { + vectorized_layer_norm_forward_affine_kernel<<< + blocks, threads, nshared, stream>>>( + slice_len, eps, X_data, weight_data, bias_data, mean_data, rstd_data, + Y_data); + } + after_kernel_launch(); +} + +template +__inline__ MEGDNN_DEVICE T welford_warp_reduce(T val, const ReduceOp& op) { +#pragma unroll + for (int offset = (warpSize >> 1); offset > 0; offset >>= 1) { + val = op.combine(val, op.warp_shfl_down(val, offset)); + } + return val; +} + +template +__inline__ MEGDNN_DEVICE T +welford_block_reduce(T val, const ReduceOp& op, const T& identity_element, T* shared) { + const int lid = threadIdx.x % warpSize; + const int wid = threadIdx.x / warpSize; + val = welford_warp_reduce(val, op); + __syncthreads(); + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (threadIdx.x < blockDim.x / warpSize) ? shared[lid] : identity_element; + if (wid == 0) { + val = welford_warp_reduce(val, op); + } + return val; +} + +template +__global__ void get_input_mean_and_rstd_kernel( + int64_t slice_len, T_ACC eps, const T* X, T_ACC* mean, T_ACC* rstd) { + using WelfordType = WelfordData; + using WelfordOp = WelfordOps>; + + __shared__ typename std::aligned_storage< + sizeof(WelfordType), alignof(WelfordType)>::type val_shared[WARP_SIZE]; + WelfordType* val_shared_ptr = reinterpret_cast(val_shared); + + const int64_t i = blockIdx.x; + WelfordOp welford_op; + WelfordType val( + static_cast(0), static_cast(0), static_cast(0)); + + for (int64_t j = threadIdx.x; j < slice_len; j += blockDim.x) { + const int64_t index = i * slice_len + j; + val = welford_op.reduce(val, static_cast(X[index])); + } + val = welford_block_reduce( + val, welford_op, + WelfordType( + static_cast(0), static_cast(0), + static_cast(0)), + val_shared_ptr); + + if (threadIdx.x == 0) { + T_ACC slice_mean; + T_ACC slice_sigma2; + thrust::tie(slice_sigma2, slice_mean) = welford_op.project(val); + mean[i] = slice_mean; + rstd[i] = rsqrt(slice_sigma2 + eps); + } +} + +template +__global__ void layer_norm_forward_kernel( + int64_t slice_len, const T* X, const T_ACC* mean, const T_ACC* rstd, + const T* weight, const T* bias, T* Y) { + const int64_t i = blockIdx.x; + for (int64_t j = threadIdx.x; j < slice_len; j += blockDim.x) { + const int64_t index = i * slice_len + j; + const T_ACC weight_v = + weight == nullptr ? T_ACC(1) : static_cast(weight[j]); + const T_ACC bias_v = bias == nullptr ? 
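+        // Non-vectorized fallback: Y = (X - mean) * rstd * weight + bias,
+        // with weight and bias defaulting to 1 and 0 when no affine transform
+        // is supplied: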
T_ACC(0) : static_cast(bias[j]); + Y[index] = (static_cast(X[index]) - static_cast(mean[i])) * + static_cast(rstd[i]) * weight_v + + bias_v; + } +} + +template +void forward( + T* X, T* weight, T* bias, int64_t slice_num, int64_t slice_len, T_ACC eps, T* Y, + T_ACC* mean, T_ACC* rstd, cudaStream_t stream) { + auto can_vectorize = [&](const T* ptr, int alignment) { + uint64_t addr = reinterpret_cast(ptr); + return addr % alignment == 0; + }; + constexpr int num_vec_elems = vec_size; + constexpr int alignment = num_vec_elems * sizeof(T); + if ((std::is_same::value || std::is_same::value || + std::is_same::value) && + slice_len <= static_cast(1ULL << std::numeric_limits::digits) && + slice_len % num_vec_elems == 0 && can_vectorize(X, alignment) && + can_vectorize(Y, alignment)) { + launch_vectorized_layer_norm_forward_kernel( + slice_len, slice_num, static_cast(eps), X, weight, bias, Y, mean, + rstd, stream); + after_kernel_launch(); + } else { + get_input_mean_and_rstd_kernel + <<>>(slice_len, eps, X, mean, rstd); + after_kernel_launch(); + layer_norm_forward_kernel<<>>( + slice_len, X, mean, rstd, weight, bias, Y); + after_kernel_launch(); + } +} + +template +__inline__ MEGDNN_DEVICE T warp_reduce_sum(T val) { +#pragma unroll + for (int offset = (warpSize >> 1); offset > 0; offset >>= 1) { + val += __shfl_down(val, offset, warpSize); + } + return val; +} + +template +__inline__ MEGDNN_DEVICE T block_reduce_sum(T val, T* shared) { + const int lid = threadIdx.x % warpSize; + const int wid = threadIdx.x / warpSize; + val = warp_reduce_sum(val); + __syncthreads(); + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (threadIdx.x < blockDim.x / warpSize) ? shared[lid] : T(0); + if (wid == 0) { + val = warp_reduce_sum(val); + } + return val; +} + +template +__inline__ MEGDNN_DEVICE void layer_norm_grad_input_kernel_impl( + const T* __restrict__ dY, const T* __restrict__ X, + const T_ACC* __restrict__ mean, const T_ACC* __restrict__ rstd, + const T* __restrict__ weight, T* dX, const int slice_len, T_ACC* buf) { + const auto slice_id = blockIdx.x; + const T_ACC mean_val = mean[slice_id]; + const T_ACC rstd_val = rstd[slice_id]; + T_ACC stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + const T* X_i = X + slice_id * slice_len; + const T* dY_i = dY + slice_id * slice_len; + T* dX_i = dX + slice_id * slice_len; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < slice_len; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T_ACC weight_val = + (weight != nullptr) ? static_cast(weight[l + k]) : T_ACC(1); + const T_ACC c_h = static_cast(X_i[l + k]); + const T_ACC c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * weight_val; + stats_x2 += c_loss * weight_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < slice_len; l++) { + T_ACC weight_val = + (weight != nullptr) ? 
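+    // Tail elements not covered by the unrolled loop above. Both loops
+    // accumulate stats_x1 = sum(dy * w) and
+    // stats_x2 = sum(dy * w * (x - mean) * rstd); the final loop combines them
+    // into dX = rstd / N * (N * w * dy - stats_x1 - (x - mean) * rstd * stats_x2).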
static_cast(weight[l]) : T_ACC(1); + const T_ACC c_h = static_cast(X_i[l]); + const T_ACC c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * weight_val; + stats_x2 += c_loss * weight_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = block_reduce_sum(stats_x1, buf); + stats_x2 = block_reduce_sum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T_ACC fH = slice_len; + T_ACC term1 = (T_ACC(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < slice_len; l += blockDim.x) { + const T_ACC x = X_i[l]; + const T_ACC dy = dY_i[l]; + T_ACC weight_val = + (weight != nullptr) ? static_cast(weight[l]) : T_ACC(1); + T_ACC f_grad_input = fH * weight_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + dX_i[l] = f_grad_input; + } +} + +template +__global__ void layer_norm_grad_input_kernel( + const T* __restrict__ dY, const T* __restrict__ X, + const T_ACC* __restrict__ mean, const T_ACC* __restrict__ rstd, + const T* __restrict__ weight, T* dX, const int slice_len) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T_ACC* buf = reinterpret_cast(&s_data1); + + layer_norm_grad_input_kernel_impl(dY, X, mean, rstd, weight, dX, slice_len, buf); +} + +template +__global__ void layer_norm_grad_weight_bias_simple_kernel( + int64_t slice_num, int64_t slice_len, const T* dY, const T* X, + const T_ACC* mean, const T_ACC* rstd, T* dweight, T* dbias) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < slice_len) { + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t i = 0; i < slice_num; ++i) { + const int64_t index = i * slice_len + j; + sum1 += dweight == nullptr ? T_ACC(0) + : static_cast(dY[index]) * + (static_cast(X[index]) - + static_cast(mean[i])) * + static_cast(rstd[i]); + sum2 += dbias == nullptr ? 
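+            // Per-column parameter gradients:
+            //   dweight[j] = sum_i dY[i][j] * (X[i][j] - mean[i]) * rstd[i]
+            //   dbias[j]   = sum_i dY[i][j]
+            // sum1 / sum2 accumulate these over all slices i for column j: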
T_ACC(0) : static_cast(dY[index]); + } + if (dweight != nullptr) { + dweight[j] = sum1; + } + if (dbias != nullptr) { + dbias[j] = sum2; + } + } +} + +template +__global__ void layer_norm_grad_weight_bias_kernel( + int64_t slice_num, int64_t slice_len, const T* dY, const T* X, + const T_ACC* mean, const T_ACC* rstd, T* dweight, T* dbias) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T_ACC* s_data_typed = reinterpret_cast(&s_data1); + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + constexpr int unroll = 8; + T dYs[unroll]; + T Xs[unroll]; + T_ACC* means = s_data_typed; + T_ACC* rstds = s_data_typed + unroll * blockDim.y; + T_ACC dg_sum = 0; + T_ACC db_sum = 0; + if (j < slice_len) { + int bcounter; + for (bcounter = 0; bcounter < slice_num / (blockDim.y * unroll); bcounter++) { + int offset = (bcounter * blockDim.y + threadIdx.y) * unroll; +#pragma unroll + for (int ii = 0; ii < unroll; ii++) { + if (threadIdx.x == 0) { + means[ii * blockDim.y + threadIdx.y] = mean[offset + ii]; + rstds[ii * blockDim.y + threadIdx.y] = rstd[offset + ii]; + } + dYs[ii] = dY[(offset + ii) * slice_len + j]; + Xs[ii] = X[(offset + ii) * slice_len + j]; + } + __syncthreads(); +#pragma unroll + for (int ii = 0; ii < unroll; ii++) { + dg_sum += dYs[ii] * (Xs[ii] - means[ii * blockDim.y + threadIdx.y]) * + rstds[ii * blockDim.y + threadIdx.y]; + db_sum += dYs[ii]; + } + __syncthreads(); + } + int offset = (bcounter * blockDim.y + threadIdx.y) * unroll; + for (int ii = 0; ii < 8; ii++) { + T_ACC mean_val, rstd_val; // we don't use smem in the tail to avoid awkward + // synchronizations, perf penalty is negligible + if ((offset + ii) < slice_num) { + mean_val = mean[offset + ii]; + rstd_val = rstd[offset + ii]; + dYs[0] = dY[(offset + ii) * slice_len + j]; + Xs[0] = X[(offset + ii) * slice_len + j]; + dg_sum += dYs[0] * (Xs[0] - mean_val) * rstd_val; + db_sum += dYs[0]; + } + } + s_data_typed[threadIdx.y * blockDim.x + threadIdx.x] = dg_sum; + s_data_typed[blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x] = + db_sum; + __syncthreads(); + for (int offset = blockDim.y / 2; offset >= 1; offset /= 2) { + if (threadIdx.y < offset) { + s_data_typed[threadIdx.y * blockDim.x + threadIdx.x] += + s_data_typed[(threadIdx.y + offset) * blockDim.x + threadIdx.x]; + s_data_typed + [blockDim.x * blockDim.y + threadIdx.y * blockDim.x + + threadIdx.x] += s_data_typed + [blockDim.x * blockDim.y + + (threadIdx.y + offset) * blockDim.x + threadIdx.x]; + } + __syncthreads(); + } + if (threadIdx.y == 0) { + if (dweight) { + dweight[j] = s_data_typed[threadIdx.x]; + } + if (dbias) { + dbias[j] = s_data_typed[threadIdx.x + blockDim.x * blockDim.y]; + } + } + } +} + +template +void backward( + const T* dY_data, const T* X_data, const T_ACC* mean_data, + const T_ACC* rstd_data, const T* weight_data, int64_t slice_num, + int64_t slice_len, T* dX_data, T* dweight_data, T* dbias_data, + cudaStream_t stream) { + if (dX_data != nullptr) { + const int num_threads = 128; + const dim3 blocks(slice_num); + int nshared = (num_threads / WARP_SIZE) * sizeof(T_ACC); + layer_norm_grad_input_kernel<<>>( + dY_data, X_data, mean_data, rstd_data, weight_data, dX_data, slice_len); + after_kernel_launch(); + } + if (dweight_data || dbias_data) { + if (slice_num < 512) { + const int64_t B = (slice_len + kCUDANumThreads - 1) / kCUDANumThreads; + layer_norm_grad_weight_bias_simple_kernel + <<>>( + slice_num, slice_len, dY_data, X_data, mean_data, rstd_data, + dweight_data, dbias_data); + after_kernel_launch(); + } else 
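+        // With many slices, the per-column loop in the simple kernel becomes
+        // the bottleneck, so a tiled kernel is used instead: 16x32 thread
+        // blocks stage dY/X in registers, accumulate partial dweight/dbias
+        // sums over slices, and reduce them in shared memory along blockDim.y.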
{ + dim3 threads{16, 32}; + int blocks = (slice_len + threads.x - 1) / threads.x; + layer_norm_grad_weight_bias_kernel + <<>>( + slice_num, slice_len, dY_data, X_data, mean_data, rstd_data, + dweight_data, dbias_data); + after_kernel_launch(); + } + } +} + +#define INST(T, T_ACC) \ + template void forward( \ + T*, T*, T*, int64_t, int64_t, T_ACC, T*, T_ACC*, T_ACC*, cudaStream_t); \ + template void backward( \ + const T*, const T*, const T_ACC*, const T_ACC*, const T*, int64_t, \ + int64_t, T*, T*, T*, cudaStream_t); + +INST(dt_float32, dt_float32) +INST(dt_float16, dt_float32) +INST(dt_bfloat16, dt_float32) +#undef INST + +} // namespace layer_norm +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/layer_norm/layer_norm_cuda.cuh b/dnn/src/cuda/layer_norm/layer_norm_cuda.cuh new file mode 100644 index 00000000..8e14de34 --- /dev/null +++ b/dnn/src/cuda/layer_norm/layer_norm_cuda.cuh @@ -0,0 +1,34 @@ +/** + * \file dnn/src/cuda/layer_norm/layer_norm.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#pragma once +#include + +namespace megdnn { +namespace cuda { +namespace layer_norm { + +template +void forward( + T* X, T* gamma, T* beta, int64_t M, int64_t N, T_ACC eps, T* Y, T_ACC* mean, + T_ACC* rstd, cudaStream_t stream); + +template +void backward( + const T* dY_data, const T* X_data, const T_ACC* mean_data, + const T_ACC* rstd_data, const T* gamma_data, int64_t M, int64_t N, T* dX_data, + T* dgamma_data, T* dbeta_data, cudaStream_t stream); + +} // namespace layer_norm +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/layer_norm/opr_impl.cpp b/dnn/src/cuda/layer_norm/opr_impl.cpp new file mode 100644 index 00000000..426de527 --- /dev/null +++ b/dnn/src/cuda/layer_norm/opr_impl.cpp @@ -0,0 +1,94 @@ +/** + * \file dnn/src/cuda/layer_norm/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ + +#include "src/cuda/layer_norm/opr_impl.h" +#include "src/cuda/layer_norm/layer_norm_cuda.cuh" +#include "src/cuda/utils.h" + +namespace megdnn { +namespace cuda { + +void LayerNormForwardImpl::exec( + _megdnn_tensor_in data, _megdnn_tensor_in weight, _megdnn_tensor_in bias, + _megdnn_tensor_out dst, _megdnn_tensor_out mean, _megdnn_tensor_out rstd, + _megdnn_workspace workspace) { + check_exec( + data.layout, weight.layout, bias.layout, dst.layout, mean.layout, + rstd.layout, workspace.size); + + auto p = param(); + float eps = p.eps; + bool affine = p.affine; + uint64_t slice_length = p.normalized_size; + uint64_t slice_dim = p.normalized_dim; + uint64_t n_slices = 1; + for (size_t i = 0; i < data.layout.ndim - slice_dim; ++i) { + n_slices = n_slices * data.layout.shape[i]; + } + + auto stream = cuda_stream(handle()); + using namespace ::megdnn::cuda::layer_norm; + +#define cb(DType) \ + if (data.layout.dtype == DType()) { \ + using T = typename DTypeTrait::ctype; \ + using T_ACC = float; \ + forward( \ + data.ptr(), affine ? weight.ptr() : nullptr, \ + affine ? bias.ptr() : nullptr, static_cast(n_slices), \ + static_cast(slice_length), static_cast(eps), \ + dst.ptr(), mean.ptr(), rstd.ptr(), stream); \ + return; \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) +#undef cb + megdnn_throw("bad dtype"); +} + +void LayerNormBackwardImpl::exec( + _megdnn_tensor_in diff, _megdnn_tensor_in data, _megdnn_tensor_in weight, + _megdnn_tensor_in mean, _megdnn_tensor_in rstd, _megdnn_tensor_out ddata, + _megdnn_tensor_out dweight, _megdnn_tensor_out dbias, + _megdnn_workspace workspace) { + check_exec( + diff.layout, data.layout, weight.layout, mean.layout, rstd.layout, + ddata.layout, dweight.layout, dbias.layout, workspace.size); + auto p = param(); + bool affine = p.affine; + uint64_t slice_length = p.normalized_size; + uint64_t slice_dim = p.normalized_dim; + uint64_t n_slices = 1; + for (size_t i = 0; i < data.layout.ndim - slice_dim; ++i) { + n_slices = n_slices * data.layout.shape[i]; + } + + auto stream = cuda_stream(handle()); + using namespace ::megdnn::cuda::layer_norm; +#define cb(DType) \ + if (data.layout.dtype == DType()) { \ + using T = typename DTypeTrait::ctype; \ + using T_ACC = float; \ + backward( \ + diff.ptr(), data.ptr(), mean.ptr(), rstd.ptr(), \ + affine ? weight.ptr() : nullptr, n_slices, slice_length, \ + ddata.ptr(), affine ? dweight.ptr() : nullptr, \ + affine ? dbias.ptr() : nullptr, stream); \ + return; \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) +#undef cb + megdnn_throw("bad dtype"); +} + +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/layer_norm/opr_impl.h b/dnn/src/cuda/layer_norm/opr_impl.h new file mode 100644 index 00000000..8bca6a75 --- /dev/null +++ b/dnn/src/cuda/layer_norm/opr_impl.h @@ -0,0 +1,53 @@ +/** + * \file dnn/src/cuda/layer_norm/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#pragma once +#include "megdnn/oprs.h" + +#include "src/cuda/cudnn_wrapper.h" + +namespace megdnn { +namespace cuda { + +class LayerNormForwardImpl final : public LayerNormForward { +public: + using LayerNormForward::LayerNormForward; + void exec( + _megdnn_tensor_in data, _megdnn_tensor_in weight, _megdnn_tensor_in bias, + _megdnn_tensor_out dst, _megdnn_tensor_out mean, _megdnn_tensor_out rstd, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes( + const TensorLayout&, const TensorLayout&, const TensorLayout&, + const TensorLayout&, const TensorLayout&, const TensorLayout&) override { + return 0; + } +}; + +class LayerNormBackwardImpl final : public LayerNormBackward { +public: + using LayerNormBackward::LayerNormBackward; + void exec( + _megdnn_tensor_in diff, _megdnn_tensor_in data, _megdnn_tensor_in weight, + _megdnn_tensor_in mean, _megdnn_tensor_in rstd, _megdnn_tensor_out ddata, + _megdnn_tensor_out dweight, _megdnn_tensor_out dbias, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes( + const TensorLayout&, const TensorLayout&, const TensorLayout&, + const TensorLayout&, const TensorLayout&, const TensorLayout&, + const TensorLayout&, const TensorLayout&) override { + return 0; + } +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/fallback/elemwise/opr_binary_impl.cpp b/dnn/src/fallback/elemwise/opr_binary_impl.cpp new file mode 100644 index 00000000..9acda94e --- /dev/null +++ b/dnn/src/fallback/elemwise/opr_binary_impl.cpp @@ -0,0 +1,275 @@ +/** + * \file dnn/src/fallback/elemwise/opr_binary_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "./opr_impl.h" + +#include "src/common/elemwise/kern_defs.cuh" +#include "src/common/utils.h" +#include "src/naive/handle.h" + +#include "midout.h" + +MIDOUT_DECL(megdnn_fallback_elemwise_binary) + +namespace megdnn { +namespace fallback { + +template +void ElemwiseImpl::binary_kern(const ElemwiseOpParamN<2>& param) { + using ctype = typename DTypeTrait::ctype; + using Kern = ElemwiseKern; + + MIDOUT_BEGIN(megdnn_fallback_elemwise_binary, ctype, midout_iv(mode)) { + if (param.max_ndim == 1) { + MIDOUT_BEGIN( + megdnn_fallback_elemwise_binary, ctype, midout_iv(mode), + midout_iv(1)) { + auto tot = param.size; + auto as = param[0].layout.stride[0], bs = param[1].layout.stride[0]; + auto src0 = param[0]; + auto src1 = param[1]; + auto dst_tensor = *m_dst; + + MEGDNN_DISPATCH_CPU_KERN_OPR({ + ctype* __restrict a = static_cast(src0.raw_ptr()); + ctype* __restrict b = static_cast(src1.raw_ptr()); + ctype* __restrict dst = static_cast(dst_tensor.raw_ptr()); + for (size_t i = 0; i < tot; ++i) { + dst[i] = Kern::apply(a[i * as], b[i * bs]); + } + }); + return; + } + MIDOUT_END(); + } + + if (std::min(param[0].layout.ndim, param[1].layout.ndim) > 1) { + return naive::ElemwiseForwardImpl::exec(*m_src, *m_dst); + } + + if (param.max_ndim == 2) { + if (param[0].layout.ndim == 1) { + MIDOUT_BEGIN( + megdnn_fallback_elemwise_binary, ctype, midout_iv(mode), + midout_iv(21)) { + auto as = param[0].layout.stride[0], + bs0 = param[1].layout.stride[0], + bs1 = param[1].layout.stride[1]; + auto n0 = param[1].layout.shape[0], n1 = param[1].layout.shape[1]; + auto src0 = param[0]; + auto src1 = param[1]; + auto dst_tensor = *m_dst; + + MEGDNN_DISPATCH_CPU_KERN_OPR({ + ctype* __restrict a = static_cast(src0.raw_ptr()); + ctype* __restrict b = static_cast(src1.raw_ptr()); + ctype* __restrict dst = + static_cast(dst_tensor.raw_ptr()); + ptrdiff_t toff = 0; + for (size_t i = 0; i < n0; ++i) { + for (size_t j = 0; j < n1; ++j) { + dst[toff] = + Kern::apply(a[as * toff], b[bs0 * i + bs1 * j]); + ++toff; + } + } + }); + return; + } + MIDOUT_END(); + } + + MIDOUT_BEGIN( + megdnn_fallback_elemwise_binary, ctype, midout_iv(mode), + midout_iv(22)) { + megdnn_assert(param[1].layout.ndim == 1); + auto bs = param[1].layout.stride[0], as0 = param[0].layout.stride[0], + as1 = param[0].layout.stride[1]; + auto n0 = param[0].layout.shape[0], n1 = param[0].layout.shape[1]; + auto src0 = param[0]; + auto src1 = param[1]; + auto dst_tensor = *m_dst; + + MEGDNN_DISPATCH_CPU_KERN_OPR({ + ctype* __restrict a = static_cast(src0.raw_ptr()); + ctype* __restrict b = static_cast(src1.raw_ptr()); + ctype* __restrict dst = static_cast(dst_tensor.raw_ptr()); + ptrdiff_t toff = 0; + for (size_t i = 0; i < n0; ++i) { + for (size_t j = 0; j < n1; ++j) { + dst[toff] = Kern::apply(a[as0 * i + as1 * j], b[toff * bs]); + ++toff; + } + } + }); + return; + } + MIDOUT_END(); + } + + if (param.max_ndim == 3) { + auto brd_101 = [](const TensorND& t) { + auto&& l = t.layout; + return l.ndim == 3 && l.stride[0] == 0 && l.stride[2] == 0; + }; + if (param[0].layout.ndim == 1 && brd_101(param[1])) { + MIDOUT_BEGIN( + megdnn_fallback_elemwise_binary, ctype, midout_iv(mode), + midout_iv(31)) { + auto as = param[0].layout.stride[0], bs = param[1].layout.stride[1]; + auto n0 = param[1].layout.shape[0], n1 = param[1].layout.shape[1], + n2 = param[1].layout.shape[2]; + auto src0 = param[0]; + auto src1 = param[1]; + auto dst_tensor = *m_dst; + + MEGDNN_DISPATCH_CPU_KERN_OPR({ + ctype* __restrict a = static_cast(src0.raw_ptr()); + ctype* __restrict b = 
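+                    // brd_101 marks a (1, n1, 1)-broadcast operand: a 3-dim
+                    // layout whose strides on axes 0 and 2 are zero, e.g. a
+                    // per-channel bias combined with an (N, C, spatial) tensor;
+                    // only b[bs * j] varies with the middle index below.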
static_cast(src1.raw_ptr()); + ctype* __restrict dst = + static_cast(dst_tensor.raw_ptr()); + size_t toff = 0; + for (size_t i = 0; i < n0; ++i) { + for (size_t j = 0; j < n1; ++j) { + for (size_t k = 0; k < n2; ++k) { + dst[toff] = Kern::apply(a[as * toff], b[bs * j]); + ++toff; + } + } + } + }); + return; + } + MIDOUT_END(); + } + if (param[1].layout.ndim == 1 && brd_101(param[0])) { + MIDOUT_BEGIN( + megdnn_fallback_elemwise_binary, ctype, midout_iv(mode), + midout_iv(32)) { + auto as = param[0].layout.stride[1], bs = param[1].layout.stride[0]; + auto n0 = param[0].layout.shape[0], n1 = param[0].layout.shape[1], + n2 = param[0].layout.shape[2]; + auto src0 = param[0]; + auto src1 = param[1]; + auto dst_tensor = *m_dst; + MEGDNN_DISPATCH_CPU_KERN_OPR({ + ctype* __restrict a = static_cast(src0.raw_ptr()); + ctype* __restrict b = static_cast(src1.raw_ptr()); + ctype* __restrict dst = + static_cast(dst_tensor.raw_ptr()); + size_t toff = 0; + for (size_t i = 0; i < n0; ++i) { + for (size_t j = 0; j < n1; ++j) { + for (size_t k = 0; k < n2; ++k) { + dst[toff] = Kern::apply(a[as * j], b[bs * toff]); + ++toff; + } + } + } + }); + return; + } + MIDOUT_END(); + } + } + + naive::ElemwiseForwardImpl::exec(*m_src, *m_dst); + } + MIDOUT_END(); +} + +#define SWITCH_DTYPE(_cat, _cb) \ + switch (m_dst->layout.dtype.enumv()) { \ + MEGDNN_FOREACH_COMPUTING_DTYPE_##_cat(_cb) default \ + : megdnn_throw("bad dtype"); \ + } + +template +void ElemwiseImpl::exec_BINARY_INT() { + auto param = make_elemwise_op_param<2>(); +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: \ + return binary_kern<_dt, mode>(param); + + SWITCH_DTYPE(INT, cb) + +#undef cb +} + +template +void ElemwiseImpl::exec_BINARY_FLOAT() { + auto param = make_elemwise_op_param<2>(); +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: \ + return binary_kern<_dt, mode>(param); + + SWITCH_DTYPE(FLOAT, cb) + +#undef cb +} + +#undef SWITCH_DTYPE + +#undef SWITCH_DTYPE +using Mode = param_enumv::Elemwise::Mode; +#define INST(mode) template void megdnn::fallback::ElemwiseImpl::exec_BINARY_INT() +INST(Mode::ABS_GRAD); +INST(Mode::ADD); +INST(Mode::FLOOR_DIV); +INST(Mode::MAX); +INST(Mode::MIN); +INST(Mode::MOD); +INST(Mode::MUL); +INST(Mode::SIGMOID_GRAD); +INST(Mode::SUB); +INST(Mode::SWITCH_GT0); +INST(Mode::TANH_GRAD); +INST(Mode::LT); +INST(Mode::LEQ); +INST(Mode::EQ); +INST(Mode::SHL); +INST(Mode::SHR); +INST(Mode::FUSE_ADD_RELU); +INST(Mode::RMULH); +#undef INST + +#define INST(mode) \ + template void megdnn::fallback::ElemwiseImpl::exec_BINARY_FLOAT() +INST(Mode::ABS_GRAD); +INST(Mode::ADD); +INST(Mode::FLOOR_DIV); +INST(Mode::MAX); +INST(Mode::MIN); +INST(Mode::MOD); +INST(Mode::MUL); +INST(Mode::POW); +INST(Mode::SIGMOID_GRAD); +INST(Mode::SUB); +INST(Mode::SWITCH_GT0); +INST(Mode::TANH_GRAD); +INST(Mode::TRUE_DIV); +INST(Mode::LOG_SUM_EXP); +INST(Mode::LT); +INST(Mode::LEQ); +INST(Mode::EQ); +INST(Mode::FUSE_ADD_RELU); +INST(Mode::FUSE_ADD_SIGMOID); +INST(Mode::FUSE_ADD_TANH); +INST(Mode::FAST_TANH_GRAD); +INST(Mode::ATAN2); +INST(Mode::H_SWISH_GRAD); +INST(Mode::FUSE_ADD_H_SWISH); +INST(Mode::SILU_GRAD); +INST(Mode::GELU_GRAD); +#undef INST +} // namespace fallback +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/fallback/elemwise/opr_impl.cpp b/dnn/src/fallback/elemwise/opr_impl.cpp index a9fd7815..eb4b2d9c 100644 --- a/dnn/src/fallback/elemwise/opr_impl.cpp +++ b/dnn/src/fallback/elemwise/opr_impl.cpp @@ -16,8 +16,6 @@ #include "midout.h" -MIDOUT_DECL(megdnn_fallback_elemwise_unary) 
-MIDOUT_DECL(megdnn_fallback_elemwise_binary) MIDOUT_DECL(megdnn_fallback_elemwise_exec_UNARY_INT) MIDOUT_DECL(megdnn_fallback_elemwise_exec_UNARY_FLOAT) MIDOUT_DECL(megdnn_fallback_elemwise_exec_BINARY_INT) @@ -26,200 +24,6 @@ MIDOUT_DECL(megdnn_fallback_elemwise_exec_BINARY_FLOAT) namespace megdnn { namespace fallback { -template -void ElemwiseImpl::unary_kern(const ElemwiseOpParamN<1>& param) { - using ctype = typename DTypeTrait::ctype; - using Kern = ElemwiseKern; - MIDOUT_BEGIN(megdnn_fallback_elemwise_unary, ctype, midout_iv(mode)) { - // only specialize for the most common 1-dim case - auto tot = param.size; - auto stride = param[0].layout.stride[0]; - auto src0 = param[0]; - auto dst_tensor = *m_dst; - if (param.max_ndim == 1) { - MIDOUT_BEGIN( - megdnn_fallback_elemwise_unary, ctype, midout_iv(mode), - midout_iv(1)) { - MEGDNN_DISPATCH_CPU_KERN_OPR({ - ctype* __restrict src = static_cast(src0.raw_ptr()); - ctype* __restrict dst = static_cast(dst_tensor.raw_ptr()); - for (size_t i = 0; i < tot; ++i) { - dst[i] = Kern::apply(src[i * stride]); - } - }); - return; - } - MIDOUT_END(); - } - naive::ElemwiseForwardImpl::exec(*m_src, *m_dst); - } - MIDOUT_END(); -} - -template -void ElemwiseImpl::binary_kern(const ElemwiseOpParamN<2>& param) { - using ctype = typename DTypeTrait::ctype; - using Kern = ElemwiseKern; - - MIDOUT_BEGIN(megdnn_fallback_elemwise_binary, ctype, midout_iv(mode)) { - if (param.max_ndim == 1) { - MIDOUT_BEGIN( - megdnn_fallback_elemwise_binary, ctype, midout_iv(mode), - midout_iv(1)) { - auto tot = param.size; - auto as = param[0].layout.stride[0], bs = param[1].layout.stride[0]; - auto src0 = param[0]; - auto src1 = param[1]; - auto dst_tensor = *m_dst; - - MEGDNN_DISPATCH_CPU_KERN_OPR({ - ctype* __restrict a = static_cast(src0.raw_ptr()); - ctype* __restrict b = static_cast(src1.raw_ptr()); - ctype* __restrict dst = static_cast(dst_tensor.raw_ptr()); - for (size_t i = 0; i < tot; ++i) { - dst[i] = Kern::apply(a[i * as], b[i * bs]); - } - }); - return; - } - MIDOUT_END(); - } - - if (std::min(param[0].layout.ndim, param[1].layout.ndim) > 1) { - return naive::ElemwiseForwardImpl::exec(*m_src, *m_dst); - } - - if (param.max_ndim == 2) { - if (param[0].layout.ndim == 1) { - MIDOUT_BEGIN( - megdnn_fallback_elemwise_binary, ctype, midout_iv(mode), - midout_iv(21)) { - auto as = param[0].layout.stride[0], - bs0 = param[1].layout.stride[0], - bs1 = param[1].layout.stride[1]; - auto n0 = param[1].layout.shape[0], n1 = param[1].layout.shape[1]; - auto src0 = param[0]; - auto src1 = param[1]; - auto dst_tensor = *m_dst; - - MEGDNN_DISPATCH_CPU_KERN_OPR({ - ctype* __restrict a = static_cast(src0.raw_ptr()); - ctype* __restrict b = static_cast(src1.raw_ptr()); - ctype* __restrict dst = - static_cast(dst_tensor.raw_ptr()); - ptrdiff_t toff = 0; - for (size_t i = 0; i < n0; ++i) { - for (size_t j = 0; j < n1; ++j) { - dst[toff] = - Kern::apply(a[as * toff], b[bs0 * i + bs1 * j]); - ++toff; - } - } - }); - return; - } - MIDOUT_END(); - } - - MIDOUT_BEGIN( - megdnn_fallback_elemwise_binary, ctype, midout_iv(mode), - midout_iv(22)) { - megdnn_assert(param[1].layout.ndim == 1); - auto bs = param[1].layout.stride[0], as0 = param[0].layout.stride[0], - as1 = param[0].layout.stride[1]; - auto n0 = param[0].layout.shape[0], n1 = param[0].layout.shape[1]; - auto src0 = param[0]; - auto src1 = param[1]; - auto dst_tensor = *m_dst; - - MEGDNN_DISPATCH_CPU_KERN_OPR({ - ctype* __restrict a = static_cast(src0.raw_ptr()); - ctype* __restrict b = static_cast(src1.raw_ptr()); - ctype* 
__restrict dst = static_cast(dst_tensor.raw_ptr()); - ptrdiff_t toff = 0; - for (size_t i = 0; i < n0; ++i) { - for (size_t j = 0; j < n1; ++j) { - dst[toff] = Kern::apply(a[as0 * i + as1 * j], b[toff * bs]); - ++toff; - } - } - }); - return; - } - MIDOUT_END(); - } - - if (param.max_ndim == 3) { - auto brd_101 = [](const TensorND& t) { - auto&& l = t.layout; - return l.ndim == 3 && l.stride[0] == 0 && l.stride[2] == 0; - }; - if (param[0].layout.ndim == 1 && brd_101(param[1])) { - MIDOUT_BEGIN( - megdnn_fallback_elemwise_binary, ctype, midout_iv(mode), - midout_iv(31)) { - auto as = param[0].layout.stride[0], bs = param[1].layout.stride[1]; - auto n0 = param[1].layout.shape[0], n1 = param[1].layout.shape[1], - n2 = param[1].layout.shape[2]; - auto src0 = param[0]; - auto src1 = param[1]; - auto dst_tensor = *m_dst; - - MEGDNN_DISPATCH_CPU_KERN_OPR({ - ctype* __restrict a = static_cast(src0.raw_ptr()); - ctype* __restrict b = static_cast(src1.raw_ptr()); - ctype* __restrict dst = - static_cast(dst_tensor.raw_ptr()); - size_t toff = 0; - for (size_t i = 0; i < n0; ++i) { - for (size_t j = 0; j < n1; ++j) { - for (size_t k = 0; k < n2; ++k) { - dst[toff] = Kern::apply(a[as * toff], b[bs * j]); - ++toff; - } - } - } - }); - return; - } - MIDOUT_END(); - } - if (param[1].layout.ndim == 1 && brd_101(param[0])) { - MIDOUT_BEGIN( - megdnn_fallback_elemwise_binary, ctype, midout_iv(mode), - midout_iv(32)) { - auto as = param[0].layout.stride[1], bs = param[1].layout.stride[0]; - auto n0 = param[0].layout.shape[0], n1 = param[0].layout.shape[1], - n2 = param[0].layout.shape[2]; - auto src0 = param[0]; - auto src1 = param[1]; - auto dst_tensor = *m_dst; - MEGDNN_DISPATCH_CPU_KERN_OPR({ - ctype* __restrict a = static_cast(src0.raw_ptr()); - ctype* __restrict b = static_cast(src1.raw_ptr()); - ctype* __restrict dst = - static_cast(dst_tensor.raw_ptr()); - size_t toff = 0; - for (size_t i = 0; i < n0; ++i) { - for (size_t j = 0; j < n1; ++j) { - for (size_t k = 0; k < n2; ++k) { - dst[toff] = Kern::apply(a[as * j], b[bs * toff]); - ++toff; - } - } - } - }); - return; - } - MIDOUT_END(); - } - } - - naive::ElemwiseForwardImpl::exec(*m_src, *m_dst); - } - MIDOUT_END(); -} - void ElemwiseImpl::exec(const TensorNDArray& srcs, _megdnn_tensor_out dst) { if (!dst.layout.is_contiguous()) { return naive::ElemwiseForwardImpl::exec(srcs, dst); @@ -278,62 +82,6 @@ void ElemwiseImpl::exec(const TensorNDArray& srcs, _megdnn_tensor_out dst) { naive::ElemwiseForwardImpl::exec(srcs, dst); } -#define SWITCH_DTYPE(_cat, _cb) \ - switch (m_dst->layout.dtype.enumv()) { \ - MEGDNN_FOREACH_COMPUTING_DTYPE_##_cat(_cb) default \ - : megdnn_throw("bad dtype"); \ - } - -template -void ElemwiseImpl::exec_UNARY_INT() { - auto param = make_elemwise_op_param<1>(); -#define cb(_dt) \ - case DTypeTrait<_dt>::enumv: \ - return unary_kern<_dt, mode>(param); - - SWITCH_DTYPE(INT, cb) - -#undef cb -} - -template -void ElemwiseImpl::exec_UNARY_FLOAT() { - auto param = make_elemwise_op_param<1>(); -#define cb(_dt) \ - case DTypeTrait<_dt>::enumv: \ - return unary_kern<_dt, mode>(param); - - SWITCH_DTYPE(FLOAT, cb) - -#undef cb -} - -template -void ElemwiseImpl::exec_BINARY_INT() { - auto param = make_elemwise_op_param<2>(); -#define cb(_dt) \ - case DTypeTrait<_dt>::enumv: \ - return binary_kern<_dt, mode>(param); - - SWITCH_DTYPE(INT, cb) - -#undef cb -} - -template -void ElemwiseImpl::exec_BINARY_FLOAT() { - auto param = make_elemwise_op_param<2>(); -#define cb(_dt) \ - case DTypeTrait<_dt>::enumv: \ - return binary_kern<_dt, 
mode>(param); - - SWITCH_DTYPE(FLOAT, cb) - -#undef cb -} - -#undef SWITCH_DTYPE - } // namespace fallback } // namespace megdnn diff --git a/dnn/src/fallback/elemwise/opr_unary_impl.cpp b/dnn/src/fallback/elemwise/opr_unary_impl.cpp new file mode 100644 index 00000000..af829358 --- /dev/null +++ b/dnn/src/fallback/elemwise/opr_unary_impl.cpp @@ -0,0 +1,122 @@ +/** + * \file dnn/src/fallback/elemwise/opr_unary_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./opr_impl.h" + +#include "src/common/elemwise/kern_defs.cuh" +#include "src/common/utils.h" +#include "src/naive/handle.h" + +#include "midout.h" + +MIDOUT_DECL(megdnn_fallback_elemwise_unary) + +namespace megdnn { +namespace fallback { + +template +void ElemwiseImpl::unary_kern(const ElemwiseOpParamN<1>& param) { + using ctype = typename DTypeTrait::ctype; + using Kern = ElemwiseKern; + MIDOUT_BEGIN(megdnn_fallback_elemwise_unary, ctype, midout_iv(mode)) { + // only specialize for the most common 1-dim case + auto tot = param.size; + auto stride = param[0].layout.stride[0]; + auto src0 = param[0]; + auto dst_tensor = *m_dst; + if (param.max_ndim == 1) { + MIDOUT_BEGIN( + megdnn_fallback_elemwise_unary, ctype, midout_iv(mode), + midout_iv(1)) { + MEGDNN_DISPATCH_CPU_KERN_OPR({ + ctype* __restrict src = static_cast(src0.raw_ptr()); + ctype* __restrict dst = static_cast(dst_tensor.raw_ptr()); + for (size_t i = 0; i < tot; ++i) { + dst[i] = Kern::apply(src[i * stride]); + } + }); + return; + } + MIDOUT_END(); + } + naive::ElemwiseForwardImpl::exec(*m_src, *m_dst); + } + MIDOUT_END(); +} + +#define SWITCH_DTYPE(_cat, _cb) \ + switch (m_dst->layout.dtype.enumv()) { \ + MEGDNN_FOREACH_COMPUTING_DTYPE_##_cat(_cb) default \ + : megdnn_throw("bad dtype"); \ + } + +template +void ElemwiseImpl::exec_UNARY_INT() { + auto param = make_elemwise_op_param<1>(); +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: \ + return unary_kern<_dt, mode>(param); + + SWITCH_DTYPE(INT, cb) + +#undef cb +} + +template +void ElemwiseImpl::exec_UNARY_FLOAT() { + auto param = make_elemwise_op_param<1>(); +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: \ + return unary_kern<_dt, mode>(param); + + SWITCH_DTYPE(FLOAT, cb) + +#undef cb +} + +#undef SWITCH_DTYPE +using Mode = param_enumv::Elemwise::Mode; +#define INST(mode) template void megdnn::fallback::ElemwiseImpl::exec_UNARY_INT(); +INST(Mode::RELU); +INST(Mode::ABS); +INST(Mode::NEGATE); +#undef INST + +#define INST(mode) \ + template void megdnn::fallback::ElemwiseImpl::exec_UNARY_FLOAT(); +INST(Mode::RELU); +INST(Mode::ABS); +INST(Mode::ACOS); +INST(Mode::ASIN); +INST(Mode::CEIL); +INST(Mode::COS); +INST(Mode::EXP); +INST(Mode::EXPM1); +INST(Mode::FLOOR); +INST(Mode::LOG); +INST(Mode::LOG1P); +INST(Mode::NEGATE); +INST(Mode::SIGMOID); +INST(Mode::SIN); +INST(Mode::TANH); +INST(Mode::FAST_TANH); +INST(Mode::ROUND); +INST(Mode::ERF); +INST(Mode::ERFINV); +INST(Mode::ERFC); +INST(Mode::ERFCINV); +INST(Mode::H_SWISH); +INST(Mode::SILU); +INST(Mode::GELU); +#undef INST +} // namespace fallback +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/naive/dropout/opr_impl.cpp b/dnn/src/naive/dropout/opr_impl.cpp new file mode 100644 index 00000000..64b359d4 --- 
/dev/null +++ b/dnn/src/naive/dropout/opr_impl.cpp @@ -0,0 +1,110 @@ +/** + * \file dnn/src/naive/dropout/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/naive/dropout/opr_impl.h" +#include +#include +#include +#include "src/common/utils.h" +#include "src/naive/handle.h" + +using namespace megdnn; +using namespace naive; +using namespace std; +namespace { + +using Param = megdnn::Dropout::Param; + +dt_float32 get_random_number(uint64_t x) { + union { + uint32_t i; + dt_float32 f; + } u; + u.i = (0x7F << 23) | (x >> 41); + return 2 - u.f; +} + +template +void forward( + T* inp, T* oup, void* raw_reserved, size_t len, Xoroshiro128plus& rng, + float drop_prob) { + uint8_t* reserved = reinterpret_cast(raw_reserved); + float scale = 1.0f / (1.0f - drop_prob); + for (size_t i = 0; i < len; ++i) { + float rn = get_random_number(rng()); + reserved[i] = rn < drop_prob ? 0 : 1; + oup[i] = static_cast(reserved[i] ? static_cast(inp[i]) * scale : 0.f); + } +} + +template +void backward(T* doup, T* dinp, void* raw_reserved, size_t len, float drop_prob) { + uint8_t* reserved = reinterpret_cast(raw_reserved); + float scale = 1.0f / (1.0f - drop_prob); + for (size_t i = 0; i < len; ++i) { + dinp[i] = + static_cast(reserved[i] ? static_cast(doup[i]) * scale : 0.f); + } +} + +} // namespace + +namespace megdnn { +namespace naive { + +size_t DropoutForwardImpl::get_mask_size_in_bytes(const TensorLayout& inp) { + return inp.total_nr_elems(); +} + +void DropoutForwardImpl::exec( + _megdnn_tensor_in inp, _megdnn_tensor_out oup, _megdnn_tensor_out mask, + _megdnn_workspace workspace) { + check_exec(inp.layout, oup.layout, mask.layout, workspace.size); + size_t length = inp.layout.total_nr_elems(); + uint64_t seed = param().seed; + + m_rng.ensure_seed(seed); + +#define cb(DType) \ + if (inp.layout.dtype == DType()) { \ + using T = typename DTypeTrait::ctype; \ + MEGDNN_DISPATCH_CPU_KERN_OPR(forward( \ + inp.ptr(), oup.ptr(), mask.raw_ptr(), length, m_rng, \ + param().drop_prob)); \ + return; \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) +#undef cb + megdnn_throw("bad dtype"); +} + +void DropoutBackwardImpl::exec( + _megdnn_tensor_in doup, _megdnn_tensor_in mask, _megdnn_tensor_out dinp, + _megdnn_workspace workspace) { + check_exec(doup.layout, mask.layout, dinp.layout, workspace.size); + size_t length = doup.layout.total_nr_elems(); + +#define cb(DType) \ + if (doup.layout.dtype == DType()) { \ + using T = typename DTypeTrait::ctype; \ + MEGDNN_DISPATCH_CPU_KERN_OPR(backward( \ + doup.ptr(), dinp.ptr(), mask.raw_ptr(), length, \ + param().drop_prob)); \ + return; \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) +#undef cb + megdnn_throw("bad dtype"); +} + +} // namespace naive +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/naive/dropout/opr_impl.h b/dnn/src/naive/dropout/opr_impl.h new file mode 100644 index 00000000..f40de4e2 --- /dev/null +++ b/dnn/src/naive/dropout/opr_impl.h @@ -0,0 +1,49 @@ +/** + * \file dnn/src/naive/dropout/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#pragma once +#include "megdnn/oprs.h" +#include "src/naive/rng/opr_impl.h" + +namespace megdnn { +namespace naive { + +class DropoutForwardImpl final : public DropoutForward { + Xoroshiro128plus m_rng; + +public: + using DropoutForward::DropoutForward; + void exec( + _megdnn_tensor_in inp, _megdnn_tensor_out oup, _megdnn_tensor_out mask, + _megdnn_workspace workspace) override; + size_t get_mask_size_in_bytes(const TensorLayout& inp) override; + size_t get_workspace_in_bytes( + const TensorLayout&, const TensorLayout&, const TensorLayout&) override { + return 0; + } +}; + +class DropoutBackwardImpl final : public DropoutBackward { +public: + using DropoutBackward::DropoutBackward; + void exec( + _megdnn_tensor_in doup, _megdnn_tensor_in mask, _megdnn_tensor_out dinp, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes( + const TensorLayout&, const TensorLayout&, const TensorLayout&) override { + return 0; + } +}; + +} // namespace naive +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/naive/elemwise_multi_type/opr_impl_1.cpp b/dnn/src/naive/elemwise_multi_type/opr_impl_1.cpp new file mode 100644 index 00000000..1c135917 --- /dev/null +++ b/dnn/src/naive/elemwise_multi_type/opr_impl_1.cpp @@ -0,0 +1,138 @@ +/** + * \file dnn/src/naive/elemwise_multi_type/opr_impl_1.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "./opr_impl.h" +#include "megdnn/tensor_iter.h" +#include "src/common/elemwise/kern_defs.cuh" +#include "src/common/elemwise_multi_type/kern_defs.cuh" +#include "src/naive/handle.h" + +using namespace megdnn; +using namespace naive; + +void ElemwiseMultiTypeImpl::on_fuse_mul_add3_int16x32x32x32( + const ElemwiseOpParamN<3>& param, const TensorND& dst) { + auto size = param.size; + auto src0 = param[0]; + auto src1 = param[1]; + auto src2 = param[2]; + auto work = [src0, src1, src2, size, dst]() { + auto i0 = tensor_iter_valonly(src0).begin(); + auto i1 = tensor_iter_valonly(src1).begin(); + auto i2 = tensor_iter_valonly(src2).begin(); + auto dst_ptr = dst.ptr(); + for (size_t i = 0; i < size; ++i) { + dst_ptr[i] = (*i0) * (*i1) + (*i2); + ++i0; + ++i1; + ++i2; + } + }; + MEGDNN_DISPATCH_CPU_KERN_OPR(work()); +} + +void ElemwiseMultiTypeImpl::on_fuse_mul_add3_int16xf32xf32xf32( + const ElemwiseOpParamN<3>& param, const TensorND& dst) { + auto size = param.size; + auto src0 = param[0]; + auto src1 = param[1]; + auto src2 = param[2]; + auto work = [src0, src1, src2, size, dst]() { + auto i0 = tensor_iter_valonly(src0).begin(); + auto i1 = tensor_iter_valonly(src1).begin(); + auto i2 = tensor_iter_valonly(src2).begin(); + auto dst_ptr = dst.ptr(); + for (size_t i = 0; i < size; ++i) { + dst_ptr[i] = (*i0) * (*i1) + (*i2); + ++i0; + ++i1; + ++i2; + } + }; + MEGDNN_DISPATCH_CPU_KERN_OPR(work()); +} + +void ElemwiseMultiTypeImpl::on_fuse_mul_add3_uint8xf32xf32xf32( + const ElemwiseOpParamN<3>& param, const TensorND& dst) { + auto size = param.size; + auto src0 = param[0]; + auto src1 = param[1]; + auto src2 = param[2]; + auto work = [src0, src1, src2, size, dst]() { + auto i0 = tensor_iter_valonly(src0).begin(); + auto i1 = tensor_iter_valonly(src1).begin(); + auto i2 = tensor_iter_valonly(src2).begin(); + auto dst_ptr = dst.ptr(); + for (size_t i = 0; i < size; ++i) { + dst_ptr[i] = (*i0) * (*i1) + (*i2); + ++i0; + ++i1; + ++i2; + } + }; + MEGDNN_DISPATCH_CPU_KERN_OPR(work()); +} + +void ElemwiseMultiTypeImpl::on_mul_int16xf32xf32( + const ElemwiseOpParamN<2>& param, const TensorND& dst) { + auto size = param.size; + auto src0 = param[0]; + auto src1 = param[1]; + auto work = [src0, src1, size, dst]() { + auto i0 = tensor_iter_valonly(src0).begin(); + auto i1 = tensor_iter_valonly(src1).begin(); + auto dst_ptr = dst.ptr(); + for (size_t i = 0; i < size; ++i) { + dst_ptr[i] = (*i0) * (*i1); + ++i0; + ++i1; + } + }; + MEGDNN_DISPATCH_CPU_KERN_OPR(work()); +} + +void ElemwiseMultiTypeImpl::on_fuse_mul_add3_iXxf32xf32xi8( + const ElemwiseOpParamN<3>& param, const TensorND& dst) { + switch (param[0].layout.dtype.enumv()) { +#define cb(t) \ + case DTypeTrait::enumv: \ + return dispatch_fma3_iXxf32xf32xi8::ctype>(param, dst); + MEGDNN_FOREACH_COMPUTING_DTYPE_INT(cb) +#undef cb + default: + megdnn_throw("unsupported src dtype"); + } +} + +template +void ElemwiseMultiTypeImpl::dispatch_fma3_iXxf32xf32xi8( + const ElemwiseOpParamN<3>& param, const TensorND& dst) { + auto size = param.size; + auto src0 = param[0]; + auto src1 = param[1]; + auto src2 = param[2]; + auto work = [src0, src1, src2, size, dst]() { + elemwise_multi_type::Fma3iXxf32xf32xiYOp op; + auto i0 = tensor_iter_valonly(src0).begin(); + auto i1 = tensor_iter_valonly(src1).begin(); + auto i2 = tensor_iter_valonly(src2).begin(); + auto dst_ptr = dst.ptr(); + for (size_t i = 0; i < size; ++i) { + dst_ptr[i] = op(*i0, *i1, *i2); + ++i0; + ++i1; + ++i2; + } + }; + MEGDNN_DISPATCH_CPU_KERN_OPR(work()); +} + +// vim: 
syntax=cpp.doxygen diff --git a/dnn/src/naive/elemwise_multi_type/opr_impl_2.cpp b/dnn/src/naive/elemwise_multi_type/opr_impl_2.cpp new file mode 100644 index 00000000..45861c78 --- /dev/null +++ b/dnn/src/naive/elemwise_multi_type/opr_impl_2.cpp @@ -0,0 +1,115 @@ +/** + * \file dnn/src/naive/elemwise_multi_type/opr_impl_2.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./opr_impl.h" +#include "megdnn/tensor_iter.h" +#include "src/common/elemwise/kern_defs.cuh" +#include "src/common/elemwise_multi_type/kern_defs.cuh" +#include "src/naive/handle.h" + +using namespace megdnn; +using namespace naive; + +void ElemwiseMultiTypeImpl::on_round_shr_saturate_iXxi8xi8( + const ElemwiseOpParamN<2>& param, const TensorND& dst) { + switch (param[0].layout.dtype.enumv()) { +#define cb(t) \ + case DTypeTrait::enumv: \ + return dispatch_round_shr_saturate_iXxi8xiX::ctype, dt_int8>( \ + param, dst); + MEGDNN_FOREACH_COMPUTING_DTYPE_INT(cb) +#undef cb + default: + megdnn_throw("unsupported src dtype"); + } +} + +template +void ElemwiseMultiTypeImpl::dispatch_round_shr_saturate_iXxi8xiX( + const ElemwiseOpParamN<2>& param, const TensorND& dst) { + auto src0 = param[0]; + auto src1 = param[1]; + auto size = param.size; + auto work = [src0, src1, size, dst]() { + // This is needed as these iterators are captured as const value. + auto iA = tensor_iter_valonly(src0).begin(); + auto iB = tensor_iter_valonly(src1).begin(); + auto pD = dst.ptr(); + for (size_t i = 0; i < size; i++) { + *pD = elemwise_multi_type::round_shr_saturate(*iA, *iB); + ++iA; + ++iB; + ++pD; + } + }; + MEGDNN_DISPATCH_CPU_KERN_OPR(work()); +} +template +void ElemwiseMultiTypeImpl::dispatch_fuse_add_rmulh_round_shr_saturate( + const ElemwiseOpParamN<6>& param, const TensorND& dst) { + auto size = param.size; + auto src0 = param[0]; + auto src1 = param[1]; + auto src2 = param[2]; + auto src3 = param[3]; + auto src4 = param[4]; + auto src5 = param[5]; + auto work = [size, src0, src1, src2, src3, src4, src5, dst]() { + auto i0 = tensor_iter_valonly(src0).begin(); + auto i1 = tensor_iter_valonly(src1).begin(); + auto i2 = tensor_iter_valonly(src2).begin(); + auto ioff = tensor_iter_valonly(src3).begin(); + auto imin = tensor_iter_valonly(src4).begin(); + auto imax = tensor_iter_valonly(src5).begin(); + auto dst_ptr = dst.ptr(); + for (size_t i = 0; i < size; ++i) { + auto res = elemwise_multi_type::round_shr_saturate( + round_mulh_saturate(*i0 + *i1, *i2), *ioff); + res = std::min(res, *imax); + res = std::max(res, *imin); + dst_ptr[i] = res; + ++i0; + ++i1; + ++i2; + ++ioff; + ++imin; + ++imax; + } + }; + MEGDNN_DISPATCH_CPU_KERN_OPR(work()); +} + +void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8( + const ElemwiseOpParamN<6>& param, const TensorND& dst) { + dispatch_fuse_add_rmulh_round_shr_saturate(param, dst); +} + +void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int32x32x32x8( + const ElemwiseOpParamN<6>& param, const TensorND& dst) { + dispatch_fuse_add_rmulh_round_shr_saturate(param, dst); +} + +void ElemwiseMultiTypeImpl::on_round_shr_saturate_iXxi8xi16( + const ElemwiseOpParamN<2>& param, const TensorND& dst) { + switch (param[0].layout.dtype.enumv()) { 
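The integer kernels above are built on a rounding right shift followed by saturation to the narrower destination type. A rough NumPy sketch of that scalar step (assuming round-half-away-from-zero rounding; megdnn's exact tie handling may differ):

    import numpy as np

    def round_shr_saturate(x, shift, dtype=np.int8):
        # right shift by `shift` with rounding, then clamp into the destination range
        x = np.asarray(x, dtype=np.int64)
        if shift > 0:
            half = np.where(x >= 0, 1, -1) * (1 << (shift - 1))
            x = (x + half) >> shift
        info = np.iinfo(dtype)
        return np.clip(x, info.min, info.max).astype(dtype)
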
+#define cb(t) \ + case DTypeTrait::enumv: \ + return dispatch_round_shr_saturate_iXxi8xiX::ctype, dt_int16>( \ + param, dst); + cb(::megdnn::dtype::Int32); + cb(::megdnn::dtype::Int16); +#undef cb + default: + megdnn_throw("unsupported src dtype"); + } +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/naive/elemwise_multi_type/opr_impl.cpp b/dnn/src/naive/elemwise_multi_type/opr_impl_3.cpp similarity index 56% rename from dnn/src/naive/elemwise_multi_type/opr_impl.cpp rename to dnn/src/naive/elemwise_multi_type/opr_impl_3.cpp index 502cef74..67c21b12 100644 --- a/dnn/src/naive/elemwise_multi_type/opr_impl.cpp +++ b/dnn/src/naive/elemwise_multi_type/opr_impl_3.cpp @@ -1,5 +1,5 @@ /** - * \file dnn/src/naive/elemwise_multi_type/opr_impl.cpp + * \file dnn/src/naive/elemwise_multi_type/opr_impl_3.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -18,218 +18,6 @@ using namespace megdnn; using namespace naive; -void ElemwiseMultiTypeImpl::on_fuse_mul_add3_int16x32x32x32( - const ElemwiseOpParamN<3>& param, const TensorND& dst) { - auto size = param.size; - auto src0 = param[0]; - auto src1 = param[1]; - auto src2 = param[2]; - auto work = [src0, src1, src2, size, dst]() { - auto i0 = tensor_iter_valonly(src0).begin(); - auto i1 = tensor_iter_valonly(src1).begin(); - auto i2 = tensor_iter_valonly(src2).begin(); - auto dst_ptr = dst.ptr(); - for (size_t i = 0; i < size; ++i) { - dst_ptr[i] = (*i0) * (*i1) + (*i2); - ++i0; - ++i1; - ++i2; - } - }; - MEGDNN_DISPATCH_CPU_KERN_OPR(work()); -} - -void ElemwiseMultiTypeImpl::on_fuse_mul_add3_int16xf32xf32xf32( - const ElemwiseOpParamN<3>& param, const TensorND& dst) { - auto size = param.size; - auto src0 = param[0]; - auto src1 = param[1]; - auto src2 = param[2]; - auto work = [src0, src1, src2, size, dst]() { - auto i0 = tensor_iter_valonly(src0).begin(); - auto i1 = tensor_iter_valonly(src1).begin(); - auto i2 = tensor_iter_valonly(src2).begin(); - auto dst_ptr = dst.ptr(); - for (size_t i = 0; i < size; ++i) { - dst_ptr[i] = (*i0) * (*i1) + (*i2); - ++i0; - ++i1; - ++i2; - } - }; - MEGDNN_DISPATCH_CPU_KERN_OPR(work()); -} - -void ElemwiseMultiTypeImpl::on_fuse_mul_add3_uint8xf32xf32xf32( - const ElemwiseOpParamN<3>& param, const TensorND& dst) { - auto size = param.size; - auto src0 = param[0]; - auto src1 = param[1]; - auto src2 = param[2]; - auto work = [src0, src1, src2, size, dst]() { - auto i0 = tensor_iter_valonly(src0).begin(); - auto i1 = tensor_iter_valonly(src1).begin(); - auto i2 = tensor_iter_valonly(src2).begin(); - auto dst_ptr = dst.ptr(); - for (size_t i = 0; i < size; ++i) { - dst_ptr[i] = (*i0) * (*i1) + (*i2); - ++i0; - ++i1; - ++i2; - } - }; - MEGDNN_DISPATCH_CPU_KERN_OPR(work()); -} - -void ElemwiseMultiTypeImpl::on_mul_int16xf32xf32( - const ElemwiseOpParamN<2>& param, const TensorND& dst) { - auto size = param.size; - auto src0 = param[0]; - auto src1 = param[1]; - auto work = [src0, src1, size, dst]() { - auto i0 = tensor_iter_valonly(src0).begin(); - auto i1 = tensor_iter_valonly(src1).begin(); - auto dst_ptr = dst.ptr(); - for (size_t i = 0; i < size; ++i) { - dst_ptr[i] = (*i0) * (*i1); - ++i0; - ++i1; - } - }; - MEGDNN_DISPATCH_CPU_KERN_OPR(work()); -} - -void ElemwiseMultiTypeImpl::on_fuse_mul_add3_iXxf32xf32xi8( - const ElemwiseOpParamN<3>& param, const TensorND& dst) { - switch (param[0].layout.dtype.enumv()) { -#define cb(t) \ - case DTypeTrait::enumv: \ - return dispatch_fma3_iXxf32xf32xi8::ctype>(param, dst); - 
MEGDNN_FOREACH_COMPUTING_DTYPE_INT(cb) -#undef cb - default: - megdnn_throw("unsupported src dtype"); - } -} - -template -void ElemwiseMultiTypeImpl::dispatch_fma3_iXxf32xf32xi8( - const ElemwiseOpParamN<3>& param, const TensorND& dst) { - auto size = param.size; - auto src0 = param[0]; - auto src1 = param[1]; - auto src2 = param[2]; - auto work = [src0, src1, src2, size, dst]() { - elemwise_multi_type::Fma3iXxf32xf32xiYOp op; - auto i0 = tensor_iter_valonly(src0).begin(); - auto i1 = tensor_iter_valonly(src1).begin(); - auto i2 = tensor_iter_valonly(src2).begin(); - auto dst_ptr = dst.ptr(); - for (size_t i = 0; i < size; ++i) { - dst_ptr[i] = op(*i0, *i1, *i2); - ++i0; - ++i1; - ++i2; - } - }; - MEGDNN_DISPATCH_CPU_KERN_OPR(work()); -} - -void ElemwiseMultiTypeImpl::on_round_shr_saturate_iXxi8xi8( - const ElemwiseOpParamN<2>& param, const TensorND& dst) { - switch (param[0].layout.dtype.enumv()) { -#define cb(t) \ - case DTypeTrait::enumv: \ - return dispatch_round_shr_saturate_iXxi8xiX::ctype, dt_int8>( \ - param, dst); - MEGDNN_FOREACH_COMPUTING_DTYPE_INT(cb) -#undef cb - default: - megdnn_throw("unsupported src dtype"); - } -} - -template -void ElemwiseMultiTypeImpl::dispatch_round_shr_saturate_iXxi8xiX( - const ElemwiseOpParamN<2>& param, const TensorND& dst) { - auto src0 = param[0]; - auto src1 = param[1]; - auto size = param.size; - auto work = [src0, src1, size, dst]() { - // This is needed as these iterators are captured as const value. - auto iA = tensor_iter_valonly(src0).begin(); - auto iB = tensor_iter_valonly(src1).begin(); - auto pD = dst.ptr(); - for (size_t i = 0; i < size; i++) { - *pD = elemwise_multi_type::round_shr_saturate(*iA, *iB); - ++iA; - ++iB; - ++pD; - } - }; - MEGDNN_DISPATCH_CPU_KERN_OPR(work()); -} - -template -void ElemwiseMultiTypeImpl::dispatch_fuse_add_rmulh_round_shr_saturate( - const ElemwiseOpParamN<6>& param, const TensorND& dst) { - auto size = param.size; - auto src0 = param[0]; - auto src1 = param[1]; - auto src2 = param[2]; - auto src3 = param[3]; - auto src4 = param[4]; - auto src5 = param[5]; - auto work = [size, src0, src1, src2, src3, src4, src5, dst]() { - auto i0 = tensor_iter_valonly(src0).begin(); - auto i1 = tensor_iter_valonly(src1).begin(); - auto i2 = tensor_iter_valonly(src2).begin(); - auto ioff = tensor_iter_valonly(src3).begin(); - auto imin = tensor_iter_valonly(src4).begin(); - auto imax = tensor_iter_valonly(src5).begin(); - auto dst_ptr = dst.ptr(); - for (size_t i = 0; i < size; ++i) { - auto res = elemwise_multi_type::round_shr_saturate( - round_mulh_saturate(*i0 + *i1, *i2), *ioff); - res = std::min(res, *imax); - res = std::max(res, *imin); - dst_ptr[i] = res; - ++i0; - ++i1; - ++i2; - ++ioff; - ++imin; - ++imax; - } - }; - MEGDNN_DISPATCH_CPU_KERN_OPR(work()); -} - -void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8( - const ElemwiseOpParamN<6>& param, const TensorND& dst) { - dispatch_fuse_add_rmulh_round_shr_saturate(param, dst); -} - -void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int32x32x32x8( - const ElemwiseOpParamN<6>& param, const TensorND& dst) { - dispatch_fuse_add_rmulh_round_shr_saturate(param, dst); -} - -void ElemwiseMultiTypeImpl::on_round_shr_saturate_iXxi8xi16( - const ElemwiseOpParamN<2>& param, const TensorND& dst) { - switch (param[0].layout.dtype.enumv()) { -#define cb(t) \ - case DTypeTrait::enumv: \ - return dispatch_round_shr_saturate_iXxi8xiX::ctype, dt_int16>( \ - param, dst); - cb(::megdnn::dtype::Int32); - cb(::megdnn::dtype::Int16); -#undef cb - 
default: - megdnn_throw("unsupported src dtype"); - } -} - template void ElemwiseMultiTypeImpl::dispatch_add_qint_op( const ElemwiseOpParamN<1>& param, const TensorND& dst_tensor) { diff --git a/dnn/src/naive/handle.cpp b/dnn/src/naive/handle.cpp index e38bfead..2a705335 100644 --- a/dnn/src/naive/handle.cpp +++ b/dnn/src/naive/handle.cpp @@ -36,6 +36,7 @@ #include "src/naive/deformable_conv/opr_impl.h" #include "src/naive/deformable_ps_roi_pooling/opr_impl.h" #include "src/naive/dot/opr_impl.h" +#include "src/naive/dropout/opr_impl.h" #include "src/naive/elemwise/opr_impl.h" #include "src/naive/elemwise_multi_type/opr_impl.h" #include "src/naive/eye/opr_impl.h" @@ -47,6 +48,7 @@ #include "src/naive/images2neibs/opr_impl.h" #include "src/naive/indexing_multi_axis_vec/opr_impl.h" #include "src/naive/indexing_one_hot/opr_impl.h" +#include "src/naive/layer_norm/opr_impl.h" #include "src/naive/linspace/opr_impl.h" #include "src/naive/local/opr_impl.h" #include "src/naive/local_share/opr_impl.h" diff --git a/dnn/src/naive/layer_norm/opr_impl.cpp b/dnn/src/naive/layer_norm/opr_impl.cpp new file mode 100644 index 00000000..cc967035 --- /dev/null +++ b/dnn/src/naive/layer_norm/opr_impl.cpp @@ -0,0 +1,170 @@ +/** + * \file dnn/src/naive/layer_norm/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/naive/layer_norm/opr_impl.h" +#include +#include "src/common/utils.h" +#include "src/naive/handle.h" + +using namespace megdnn; +using namespace naive; + +namespace { + +using Param = megdnn::LayerNorm::Param; + +template +void forward( + _megdnn_tensor_in data, _megdnn_tensor_in weight, _megdnn_tensor_in bias, + _megdnn_tensor_out dst, _megdnn_tensor_out mean, _megdnn_tensor_out rstd, + const Param& param) { + float eps = param.eps; + bool affine = param.affine; + uint64_t slice_length = param.normalized_size; + uint64_t slice_dim = param.normalized_dim; + uint64_t n_slices = 1; + for (size_t i = 0; i < data.layout.ndim - slice_dim; ++i) { + n_slices = n_slices * data.layout.shape[i]; + } + + for (size_t i = 0; i < n_slices; i++) { + T_ACC slice_sum = static_cast(0.0f); + for (size_t j = 0; j < slice_length; j++) { + auto value = data.ptr()[i * slice_length + j]; + slice_sum += value; + } + T_ACC slice_mean = static_cast(slice_sum / slice_length); + + T_ACC slice_var = static_cast(0.0f); + for (size_t j = 0; j < slice_length; j++) { + slice_var += (data.ptr()[i * slice_length + j] - slice_mean) * + (data.ptr()[i * slice_length + j] - slice_mean); + } + slice_var = slice_var / slice_length; + + T_ACC slice_std = static_cast(sqrt(slice_var + eps)); + for (size_t j = 0; j < slice_length; j++) { + dst.ptr()[i * slice_length + j] = + (data.ptr()[i * slice_length + j] - slice_mean) / slice_std; + if (affine) { + dst.ptr()[i * slice_length + j] = + dst.ptr()[i * slice_length + j] * weight.ptr()[j] + + bias.ptr()[j]; + } + } + mean.ptr()[i] = static_cast(slice_mean); + rstd.ptr()[i] = static_cast(1.0 / slice_std); + } +} + +template +void backward( + _megdnn_tensor_in diff, _megdnn_tensor_in data, _megdnn_tensor_in weight, + _megdnn_tensor_in mean, _megdnn_tensor_in rstd, _megdnn_tensor_out ddata, + _megdnn_tensor_out dweight, _megdnn_tensor_out dbias, const Param& 
param) { + bool affine = param.affine; + uint64_t slice_length = param.normalized_size; + uint64_t slice_dim = param.normalized_dim; + uint64_t n_slices = 1; + for (size_t i = 0; i < data.layout.ndim - slice_dim; ++i) { + n_slices = n_slices * data.layout.shape[i]; + } + + if (affine) { + for (size_t i = 0; i < slice_length; ++i) { + dweight.ptr()[i] = 0; + dbias.ptr()[i] = 0; + } + + for (size_t i = 0; i < n_slices; ++i) { + for (size_t j = 0; j < slice_length; ++j) { + dweight.ptr()[j] += + (data.ptr()[i * slice_length + j] - mean.ptr()[i]) * + rstd.ptr()[i] * diff.ptr()[i * slice_length + j]; + + dbias.ptr()[j] += diff.ptr()[i * slice_length + j]; + } + } + } + + for (size_t i = 0; i < n_slices; ++i) { + T_ACC ds = static_cast(0.0f); + T_ACC db = static_cast(0.0f); + T_ACC a = static_cast(0.0f); + T_ACC b = static_cast(0.0f); + T_ACC c = static_cast(0.0f); + + for (size_t j = 0; j < slice_length; ++j) { + auto value = data.ptr()[i * slice_length + j]; + auto diff_v = diff.ptr()[i * slice_length + j]; + auto weight_v = affine ? weight.ptr()[j] : static_cast(1.0f); + db += diff_v * weight_v; + ds += diff_v * value * weight_v; + } + + a = rstd.ptr()[i]; + b = (db * mean.ptr()[i] - ds) * a * a * a / slice_length; + c = -b * mean.ptr()[i] - db * a / slice_length; + + for (uint64_t j = 0; j < slice_length; j++) { + auto weight_v = affine ? weight.ptr()[j] : static_cast(1.0f); + ddata.ptr()[i * slice_length + j] = + diff.ptr()[i * slice_length + j] * a * weight_v + + data.ptr()[i * slice_length + j] * b + c; + } + } +} + +} // namespace + +namespace megdnn { +namespace naive { + +void LayerNormForwardImpl::exec( + _megdnn_tensor_in data, _megdnn_tensor_in weight, _megdnn_tensor_in bias, + _megdnn_tensor_out dst, _megdnn_tensor_out mean, _megdnn_tensor_out rstd, + _megdnn_workspace workspace) { + check_exec( + data.layout, weight.layout, bias.layout, dst.layout, mean.layout, + rstd.layout, workspace.size); +#define cb(DType) \ + if (data.layout.dtype == DType()) { \ + MEGDNN_DISPATCH_CPU_KERN_OPR(forward::ctype>( \ + data, weight, bias, dst, mean, rstd, param())); \ + return; \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) +#undef cb + megdnn_throw("bad dtype"); +} + +void LayerNormBackwardImpl::exec( + _megdnn_tensor_in diff, _megdnn_tensor_in data, _megdnn_tensor_in weight, + _megdnn_tensor_in mean, _megdnn_tensor_in rstd, _megdnn_tensor_out ddata, + _megdnn_tensor_out dweight, _megdnn_tensor_out dbias, + _megdnn_workspace workspace) { + check_exec( + diff.layout, data.layout, weight.layout, mean.layout, rstd.layout, + ddata.layout, dweight.layout, dbias.layout, workspace.size); +#define cb(DType) \ + if (data.layout.dtype == DType()) { \ + MEGDNN_DISPATCH_CPU_KERN_OPR(backward::ctype>( \ + diff, data, weight, mean, rstd, ddata, dweight, dbias, param())); \ + return; \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) +#undef cb + megdnn_throw("bad dtype"); +} + +} // namespace naive +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/naive/layer_norm/opr_impl.h b/dnn/src/naive/layer_norm/opr_impl.h new file mode 100644 index 00000000..99d93e79 --- /dev/null +++ b/dnn/src/naive/layer_norm/opr_impl.h @@ -0,0 +1,51 @@ +/** + * \file dnn/src/naive/layer_norm/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
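As a compact reference for the per-slice statistics the naive LayerNorm forward above computes, a NumPy sketch (assuming the input has already been viewed as (n_slices, slice_length); rstd is the reciprocal of sqrt(var + eps), matching the mean/rstd outputs):

    import numpy as np

    def layer_norm_fwd(x, weight=None, bias=None, eps=1e-5):
        # x: (n_slices, slice_length); each slice is normalized with its own mean/variance
        mean = x.mean(axis=1, keepdims=True)
        var = x.var(axis=1, keepdims=True)        # biased variance, as in the kernel
        rstd = 1.0 / np.sqrt(var + eps)
        y = (x - mean) * rstd
        if weight is not None:                    # affine=True path
            y = y * weight + bias
        return y, mean.squeeze(1), rstd.squeeze(1)
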
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#pragma once +#include "megdnn/oprs.h" + +namespace megdnn { +namespace naive { + +class LayerNormForwardImpl final : public LayerNormForward { +public: + using LayerNormForward::LayerNormForward; + void exec( + _megdnn_tensor_in data, _megdnn_tensor_in weight, _megdnn_tensor_in bias, + _megdnn_tensor_out dst, _megdnn_tensor_out mean, _megdnn_tensor_out rstd, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes( + const TensorLayout&, const TensorLayout&, const TensorLayout&, + const TensorLayout&, const TensorLayout&, const TensorLayout&) override { + return 0; + } +}; + +class LayerNormBackwardImpl final : public LayerNormBackward { +public: + using LayerNormBackward::LayerNormBackward; + void exec( + _megdnn_tensor_in diff, _megdnn_tensor_in data, _megdnn_tensor_in weight, + _megdnn_tensor_in mean, _megdnn_tensor_in rstd, _megdnn_tensor_out ddata, + _megdnn_tensor_out dweight, _megdnn_tensor_out dbias, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes( + const TensorLayout&, const TensorLayout&, const TensorLayout&, + const TensorLayout&, const TensorLayout&, const TensorLayout&, + const TensorLayout&, const TensorLayout&) override { + return 0; + } +}; + +} // namespace naive +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/x86/matrix_mul/opr_impl.cpp b/dnn/src/x86/matrix_mul/opr_impl.cpp index 176a428c..605142a0 100644 --- a/dnn/src/x86/matrix_mul/opr_impl.cpp +++ b/dnn/src/x86/matrix_mul/opr_impl.cpp @@ -52,7 +52,6 @@ public: m_all_algos.emplace_back(&algoint8x8x32sse_m4n8k2); m_all_algos.emplace_back(&algoint8x8x16sse_m4n8k2); m_all_algos.emplace_back(&algof32mk8_8x8); - m_all_algos.emplace_back(&algof32_6x16); #if MEGDNN_X86_WITH_MKL_DNN m_all_algos.emplace_back(&algoint8x8x32mkldnn); #endif @@ -60,6 +59,7 @@ public: #if MEGDNN_X86_WITH_MKL && SUPPORT_MKL_PACKED_GEMM m_all_algos.emplace_back(&f32mkl_packa); #endif + m_all_algos.emplace_back(&algof32_6x16); for (auto&& algo : m_all_algos) { m_all_algos_map.emplace(algo->info().desc, algo); diff --git a/dnn/test/CMakeLists.txt b/dnn/test/CMakeLists.txt index 1527dcab..858d1dcc 100644 --- a/dnn/test/CMakeLists.txt +++ b/dnn/test/CMakeLists.txt @@ -5,38 +5,38 @@ file(GLOB SOURCES_ *.cpp) list(APPEND SOURCES ${SOURCES_}) if(NOT ${MGE_ARCH} STREQUAL "naive") - file(GLOB_RECURSE SOURCES_ fallback/*.cpp) + file(GLOB_RECURSE SOURCES_ fallback/*.cpp) + list(APPEND SOURCES ${SOURCES_}) + file(GLOB_RECURSE SOURCES_ cpu/*.cpp) + list(APPEND SOURCES ${SOURCES_}) + if(${MGE_ARCH} STREQUAL "fallback") + message(WARNING "build only with fallback") + elseif(${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386") + file(GLOB_RECURSE SOURCES_ x86/*.cpp) list(APPEND SOURCES ${SOURCES_}) - file(GLOB_RECURSE SOURCES_ cpu/*.cpp) - list(APPEND SOURCES ${SOURCES_}) - if(${MGE_ARCH} STREQUAL "fallback") - message(WARNING "build only with fallback") - elseif(${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386") - file(GLOB_RECURSE SOURCES_ x86/*.cpp) - list(APPEND SOURCES ${SOURCES_}) - endif() + endif() endif() if(MGE_WITH_CUDA) - file(GLOB_RECURSE SOURCES_ cuda/*.cpp) - list(APPEND SOURCES ${SOURCES_}) + file(GLOB_RECURSE SOURCES_ cuda/*.cpp) + list(APPEND SOURCES ${SOURCES_}) - file(GLOB_RECURSE CUSOURCES cuda/*.cu) - list(APPEND SOURCES 
${CUSOURCES}) + file(GLOB_RECURSE CUSOURCES cuda/*.cu) + list(APPEND SOURCES ${CUSOURCES}) endif() if(MGE_WITH_MIDOUT_PROFILE) - list(APPEND SOURCES ${PROJECT_SOURCE_DIR}/third_party/midout/src/midout.cpp) + list(APPEND SOURCES ${PROJECT_SOURCE_DIR}/third_party/midout/src/midout.cpp) endif() if(MGE_WITH_ATLAS) - file(GLOB_RECURSE SOURCES_ atlas/*.cpp) - list(APPEND SOURCES ${SOURCES_}) + file(GLOB_RECURSE SOURCES_ atlas/*.cpp) + list(APPEND SOURCES ${SOURCES_}) endif() -if (MGE_WITH_ROCM) - file (GLOB_RECURSE SOURCES_ rocm/*.cpp) - list (APPEND SOURCES ${SOURCES_}) +if(MGE_WITH_ROCM) + file(GLOB_RECURSE SOURCES_ rocm/*.cpp) + list(APPEND SOURCES ${SOURCES_}) endif() add_executable(megdnn_test ${SOURCES}) @@ -44,37 +44,36 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing") target_link_libraries(megdnn_test gtest) target_link_libraries(megdnn_test megdnn ${MGE_BLAS_LIBS} ${MGE_CUDA_LIBS}) -if (MGE_WITH_CUDA) - target_link_libraries(megdnn_test cutlass) - target_include_directories(megdnn_test PRIVATE ${CUDNN_INCLUDE_DIR}) +if(MGE_WITH_CUDA) + target_link_libraries(megdnn_test cutlass) + target_include_directories(megdnn_test PRIVATE ${CUDNN_INCLUDE_DIR}) endif() if(MGE_WITH_ATLAS) - target_link_libraries(megdnn_test atlas-stub) + target_link_libraries(megdnn_test atlas-stub) endif() target_include_directories(megdnn_test - PRIVATE - ${PROJECT_SOURCE_DIR}/third_party/midout/src -) + PRIVATE ${PROJECT_SOURCE_DIR}/third_party/midout/src) if(APPLE OR ANDROID) - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") else() - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libgcc -static-libstdc++") + set(CMAKE_EXE_LINKER_FLAGS + "${CMAKE_EXE_LINKER_FLAGS} -static-libgcc -static-libstdc++") endif() if(MGE_ENABLE_COVERAGE) - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --coverage") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --coverage") endif() -if (MEG_WITH_ROCM) - target_link_libraries (megdnn_test ${MGE_ROCM_LIBS}) -endif () +if(MEG_WITH_ROCM) + target_link_libraries(megdnn_test ${MGE_ROCM_LIBS}) +endif() if(UNIX) - if(APPLE OR ANDROID) - target_link_libraries(megdnn_test dl) - else() - target_link_libraries(megdnn_test dl rt) - endif() + if(APPLE OR ANDROID) + target_link_libraries(megdnn_test dl) + else() + target_link_libraries(megdnn_test dl rt) + endif() endif() diff --git a/dnn/test/common/deduce_layout_proxy.h b/dnn/test/common/deduce_layout_proxy.h index 17afc1dd..f1067aec 100644 --- a/dnn/test/common/deduce_layout_proxy.h +++ b/dnn/test/common/deduce_layout_proxy.h @@ -58,6 +58,15 @@ struct DeduceLayoutProxy { }; template +struct DeduceLayoutProxy { + static void deduce_layout(Opr* opr, TensorLayoutArray& layouts) { + megdnn_assert(layouts.size() == 6); + opr->deduce_layout( + layouts[0], layouts[1], layouts[2], layouts[3], layouts[4], layouts[5]); + } +}; + +template struct DeduceLayoutProxy { static void deduce_layout(Opr*, TensorLayoutArray&) {} }; diff --git a/dnn/test/cuda/accuracy_shake.cpp b/dnn/test/cuda/accuracy_shake.cpp index 68463633..41b116b2 100644 --- a/dnn/test/cuda/accuracy_shake.cpp +++ b/dnn/test/cuda/accuracy_shake.cpp @@ -97,7 +97,7 @@ TEST_F(CUDA, SHAKE_CONV_BIAS_FORWARD_QS8_NHWC) { TEST_F(CUDA, SHAKE_CONV_BIAS_FORWARD_QS8_NCHWX) { using Format = ConvBias::Param::Format; - require_compute_capability(6, 1); + require_compute_capability(7, 5); AccuracyShakeChecker checker(handle_cuda()); UniformIntRNG int_rng{-5, 5}; UniformFloatRNG float_rng{-50, 50}; diff --git 
a/dnn/test/cuda/layer_norm.cpp b/dnn/test/cuda/layer_norm.cpp new file mode 100644 index 00000000..b4d04204 --- /dev/null +++ b/dnn/test/cuda/layer_norm.cpp @@ -0,0 +1,94 @@ +/** + * \file dnn/test/cuda/layer_norm.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "test/cuda/fixture.h" + +#include "test/common/checker.h" + +namespace megdnn { +namespace test { + +TEST_F(CUDA, LAYERNORM_FORWARD) { + using Param = LayerNormForward::Param; + Param param; + param.affine = true; + param.eps = 1e-6; + param.normalized_dim = 1; + Checker checker(handle_cuda()); + checker.set_epsilon(1e-2); + + auto run = [&](DType d) { + for (size_t n_slices : {10, 30}) + for (size_t slice_len : {10, 30}) { + param.normalized_size = slice_len; + checker.set_param(param) + .set_dtype(0, d) + .set_dtype(1, d) + .set_dtype(2, d) + .set_dtype(3, d) + .set_dtype(4, dtype::Float32()) + .set_dtype(5, dtype::Float32()) + .execs({{n_slices, slice_len}, + {slice_len}, + {slice_len}, + {n_slices, slice_len}, + {n_slices}, + {n_slices}}); + } + }; + + run(dtype::Float32()); + run(dtype::Float16()); + run(dtype::BFloat16()); +} + +TEST_F(CUDA, LAYERNORM_BACKWARD) { + using Param = LayerNormBackward::Param; + Param param; + param.affine = true; + param.eps = 1e-6; + param.normalized_dim = 1; + Checker checker(handle_cuda()); + checker.set_epsilon(1e-1); + + auto run = [&](DType d) { + for (size_t n_slices : {10, 30}) + for (size_t slice_len : {10, 30}) { + param.normalized_size = slice_len; + checker.set_param(param) + .set_dtype(0, d) + .set_dtype(1, d) + .set_dtype(2, d) + .set_dtype(3, dtype::Float32()) + .set_dtype(4, dtype::Float32()) + .set_dtype(5, d) + .set_dtype(6, d) + .set_dtype(7, d) + .execs({{n_slices, slice_len}, + {n_slices, slice_len}, + {slice_len}, + {n_slices}, + {n_slices}, + {n_slices, slice_len}, + {slice_len}, + {slice_len}}); + } + }; + + run(dtype::Float32()); + run(dtype::Float16()); + run(dtype::BFloat16()); +} + +} // namespace test +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/test/cuda/rng.cpp b/dnn/test/cuda/rng.cpp index 0a5a549d..3b0b705d 100644 --- a/dnn/test/cuda/rng.cpp +++ b/dnn/test/cuda/rng.cpp @@ -193,6 +193,70 @@ void run_shuffle(Handle* handle, bool bwd_flag) { run({6, 3}); } +template +void run_dropout(Handle* handle) { + using ctype = typename DTypeTrait::ctype; + auto run = [&](TensorShape shape, float drop_prob) { + auto fwd = handle->create_operator(); + auto bwd = handle->create_operator(); + fwd->param().drop_prob = drop_prob; + bwd->param().drop_prob = drop_prob; + double scale = 1.0 / (1.0 - drop_prob); + + TensorLayout inp_lay{shape, T()}; + TensorLayout oup_lay{shape, T()}; + TensorLayout mask_lay{{fwd->get_mask_size_in_bytes(inp_lay)}, dtype::Byte()}; + TensorLayout doup_lay{shape, T()}; + TensorLayout dinp_lay{shape, T()}; + TensorLayout fwd_ws_lay{ + {fwd->get_workspace_in_bytes(inp_lay, oup_lay, mask_lay)}, + dtype::Byte()}; + TensorLayout bwd_ws_lay{ + {bwd->get_workspace_in_bytes(doup_lay, mask_lay, dinp_lay)}, + dtype::Byte()}; + + SyncedTensor inp(handle, inp_lay); + SyncedTensor oup(handle, oup_lay); + SyncedTensor::ctype> mask(handle, mask_lay); + SyncedTensor doup(handle, doup_lay); + SyncedTensor 
dinp(handle, dinp_lay); + SyncedTensor::ctype> fwd_ws(handle, fwd_ws_lay); + SyncedTensor::ctype> bwd_ws(handle, bwd_ws_lay); + + for (size_t i = 0; i < inp.layout().total_nr_elems(); ++i) { + inp.ptr_mutable_host()[i] = 1; + doup.ptr_mutable_host()[i] = 1; + } + + fwd->exec( + inp.tensornd_dev(), oup.tensornd_dev(), mask.tensornd_dev(), + {fwd_ws.ptr_mutable_dev(), fwd_ws.layout().total_nr_elems()}); + size_t droped_cnt = 0; + for (size_t i = 0; i < inp.layout().total_nr_elems(); ++i) { + ASSERT_TRUE( + oup.ptr_host()[i] == 0 || + oup.ptr_host()[i] == static_cast(scale)); + if (oup.ptr_host()[i] == 0) { + droped_cnt++; + } + } + float real_drop = droped_cnt * 1.0 / inp.layout().total_nr_elems(); + ASSERT_LT(abs(drop_prob - real_drop), 1e-2); + +#if CUDNN_VERSION >= 7000 + bwd->exec( + doup.tensornd_dev(), mask.tensornd_dev(), dinp.tensornd_dev(), + {bwd_ws.ptr_mutable_dev(), bwd_ws.layout().total_nr_elems()}); + for (size_t i = 0; i < inp.layout().total_nr_elems(); ++i) { + ASSERT_TRUE(oup.ptr_host()[i] == dinp.ptr_host()[i]); + } +#endif + }; + + run({32, 32, 32, 32}, 0.2); + run({100000}, 0.3); +} + } // anonymous namespace TEST_F(CUDA, UNIFORM_RNG_F32) { @@ -290,6 +354,14 @@ TEST_F(CUDA, SHUFFLE_RNG_BWD_F16) { run_shuffle(handle_cuda(), true); } +TEST_F(CUDA, DROPOUT_F32) { + run_dropout(handle_cuda()); +} + +TEST_F(CUDA, DROPOUT_F16) { + run_dropout(handle_cuda()); +} + } // namespace test } // namespace megdnn diff --git a/dnn/test/naive/rng.cpp b/dnn/test/naive/rng.cpp index b5a827ad..40f1c520 100644 --- a/dnn/test/naive/rng.cpp +++ b/dnn/test/naive/rng.cpp @@ -231,6 +231,67 @@ void run_shuffle(Handle* handle, bool bwd_flag) { run({10}); run({6, 3}); } + +template +void run_dropout(Handle* handle) { + using ctype = typename DTypeTrait::ctype; + auto run = [&](TensorShape shape, float drop_prob) { + auto fwd = handle->create_operator(); + auto bwd = handle->create_operator(); + fwd->param().drop_prob = drop_prob; + bwd->param().drop_prob = drop_prob; + double scale = 1.0 / (1.0 - drop_prob); + + TensorLayout inp_lay{shape, T()}; + TensorLayout oup_lay{shape, T()}; + TensorLayout mask_lay{{fwd->get_mask_size_in_bytes(inp_lay)}, dtype::Byte()}; + TensorLayout doup_lay{shape, T()}; + TensorLayout dinp_lay{shape, T()}; + TensorLayout fwd_ws_lay{ + {fwd->get_workspace_in_bytes(inp_lay, oup_lay, mask_lay)}, + dtype::Byte()}; + TensorLayout bwd_ws_lay{ + {bwd->get_workspace_in_bytes(doup_lay, mask_lay, dinp_lay)}, + dtype::Byte()}; + + Tensor inp(handle, inp_lay); + Tensor oup(handle, oup_lay); + Tensor::ctype> mask(handle, mask_lay); + Tensor doup(handle, doup_lay); + Tensor dinp(handle, dinp_lay); + Tensor::ctype> fwd_ws(handle, fwd_ws_lay); + Tensor::ctype> bwd_ws(handle, bwd_ws_lay); + + for (size_t i = 0; i < inp.layout().total_nr_elems(); ++i) { + inp.ptr()[i] = 1; + doup.ptr()[i] = 1; + } + + fwd->exec( + inp.tensornd(), oup.tensornd(), mask.tensornd(), + {fwd_ws.ptr(), fwd_ws.layout().total_nr_elems()}); + size_t droped_cnt = 0; + for (size_t i = 0; i < inp.layout().total_nr_elems(); ++i) { + ASSERT_TRUE(oup.ptr()[i] == 0 || oup.ptr()[i] == static_cast(scale)); + if (oup.ptr()[i] == 0) { + droped_cnt++; + } + } + float real_drop = droped_cnt * 1.0 / inp.layout().total_nr_elems(); + ASSERT_LT(abs(drop_prob - real_drop), 1e-2); + + bwd->exec( + doup.tensornd(), mask.tensornd(), dinp.tensornd(), + {bwd_ws.ptr(), bwd_ws.layout().total_nr_elems()}); + for (size_t i = 0; i < inp.layout().total_nr_elems(); ++i) { + ASSERT_TRUE(oup.ptr()[i] == dinp.ptr()[i]); + } + }; + + run({32, 32, 32, 
32}, 0.2); + run({100000}, 0.3); +} + } // namespace TEST_F(NAIVE, UNIFORM_RNG_F32) { @@ -309,6 +370,14 @@ TEST_F(NAIVE, SHUFFLE_RNG_BWD_F16) { run_shuffle(handle(), true); } +TEST_F(NAIVE, DROPOUT_F32) { + run_dropout(handle()); +} + +TEST_F(NAIVE, DROPOUT_F16) { + run_dropout(handle()); +} + } // namespace test } // namespace megdnn diff --git a/imperative/CMakeLists.txt b/imperative/CMakeLists.txt index cf1a8c35..3d122663 100644 --- a/imperative/CMakeLists.txt +++ b/imperative/CMakeLists.txt @@ -1,21 +1,28 @@ find_package(NumPy REQUIRED) set(PACKAGE_NAME megengine) -set(PACKAGE_NAME ${PACKAGE_NAME} PARENT_SCOPE) +set(PACKAGE_NAME + ${PACKAGE_NAME} + PARENT_SCOPE) set(MODULE_NAME _imperative_rt) -set(MODULE_NAME ${MODULE_NAME} PARENT_SCOPE) +set(MODULE_NAME + ${MODULE_NAME} + PARENT_SCOPE) file(GLOB_RECURSE SRCS src/impl/*.cpp src/include/*.h python/src/*.cpp python/src/*.h) set(SRCS ${SRCS} ${CPP_REDIS_SRCS}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMGB_WITH_IMPERATIVE=1") file(GLOB_RECURSE PYTHON_SRCS python/${PACKAGE_NAME}/*.py) -file(GLOB_RECURSE ALL_HEADERS src/cpp/megbrain_pubapi.h - ${PROJECT_SOURCE_DIR}/src/core/include/* - ${PROJECT_SOURCE_DIR}/src/opr/include/* - ${PROJECT_SOURCE_DIR}/src/serialization/include/* - ${PROJECT_SOURCE_DIR}/src/plugin/include/* - ${PROJECT_SOURCE_DIR}/dnn/include/*) +file( + GLOB_RECURSE + ALL_HEADERS + src/cpp/megbrain_pubapi.h + ${PROJECT_SOURCE_DIR}/src/core/include/* + ${PROJECT_SOURCE_DIR}/src/opr/include/* + ${PROJECT_SOURCE_DIR}/src/serialization/include/* + ${PROJECT_SOURCE_DIR}/src/plugin/include/* + ${PROJECT_SOURCE_DIR}/dnn/include/*) set(MEGENGINE_DIR ${CMAKE_CURRENT_BINARY_DIR}/python/) @@ -23,71 +30,106 @@ add_subdirectory(tablegen) add_custom_target(_version_ld SOURCES ${MGE_VERSION_SCRIPT}) -add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/pybind11 ${PROJECT_BINARY_DIR}/third_party/pybind11) +add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/pybind11 + ${PROJECT_BINARY_DIR}/third_party/pybind11) pybind11_add_module(${MODULE_NAME} NO_EXTRAS ${SRCS}) -if (APPLE) - target_link_libraries(${MODULE_NAME} PRIVATE megengine_shared) -elseif (MSVC OR WIN32) - target_link_libraries(${MODULE_NAME} PRIVATE megengine_shared) - message(STATUS "CMAKE_MSVC_RUNTIME_LIBRARY: ${CMAKE_MSVC_RUNTIME_LIBRARY}") - set_target_properties(${MODULE_NAME} PROPERTIES MSVC_RUNTIME_LIBRARY "${CMAKE_MSVC_RUNTIME_LIBRARY}") +if(APPLE) + target_link_libraries(${MODULE_NAME} PRIVATE megengine_shared) +elseif(MSVC OR WIN32) + target_link_libraries(${MODULE_NAME} PRIVATE megengine_shared) + message(STATUS "CMAKE_MSVC_RUNTIME_LIBRARY: ${CMAKE_MSVC_RUNTIME_LIBRARY}") + set_target_properties(${MODULE_NAME} PROPERTIES MSVC_RUNTIME_LIBRARY + "${CMAKE_MSVC_RUNTIME_LIBRARY}") else() - # use to fix runtime crash when build both mgb(MGE_WITH_PYTHON_MODULE) and imperative(MGE_BUILD_IMPERATIVE_RT) - target_link_libraries(${MODULE_NAME} PRIVATE megengine_shared -Wl,--version-script=${MGE_VERSION_SCRIPT}) + # use to fix runtime crash when build both mgb(MGE_WITH_PYTHON_MODULE) and + # imperative(MGE_BUILD_IMPERATIVE_RT) + target_link_libraries( + ${MODULE_NAME} PRIVATE megengine_shared -Wl,--version-script=${MGE_VERSION_SCRIPT}) endif() -add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/range-v3 ${PROJECT_BINARY_DIR}/third_party/range-v3) +add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/range-v3 + ${PROJECT_BINARY_DIR}/third_party/range-v3) target_link_libraries(${MODULE_NAME} PRIVATE range-v3) -add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/Json 
${PROJECT_BINARY_DIR}/third_party/Json) +add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/Json + ${PROJECT_BINARY_DIR}/third_party/Json) target_link_libraries(${MODULE_NAME} PRIVATE nlohmann_json::nlohmann_json) -target_include_directories(${MODULE_NAME} PUBLIC src/include PRIVATE ${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR} ${MGB_OPDEF_OUT_DIR} ${CPP_REDIS_INCLUDES}) +target_include_directories( + ${MODULE_NAME} + PUBLIC src/include + PRIVATE ${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR} ${MGB_OPDEF_OUT_DIR} + ${CPP_REDIS_INCLUDES}) target_compile_definitions(${MODULE_NAME} PRIVATE MODULE_NAME=${MODULE_NAME}) target_compile_options(${MODULE_NAME} PRIVATE -Wno-unused-parameter) if(CXX_SUPPORT_WCLASS_MEMACCESS) - target_compile_options(${MODULE_NAME} PRIVATE "-Wno-class-memaccess") + target_compile_options(${MODULE_NAME} PRIVATE "-Wno-class-memaccess") endif() -set_target_properties(${MODULE_NAME} PROPERTIES - SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX} - LIBRARY_OUTPUT_DIRECTORY ${MEGENGINE_DIR}/${PACKAGE_NAME}/core -) -if (APPLE OR MSVC OR WIN32) - message(VERBOSE "overwriting SUFFIX at macos and windows before config by set_target_properties") - pybind11_extension(${MODULE_NAME}) +set_target_properties( + ${MODULE_NAME} + PROPERTIES SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX} + LIBRARY_OUTPUT_DIRECTORY ${MEGENGINE_DIR}/${PACKAGE_NAME}/core) +if(APPLE + OR MSVC + OR WIN32) + message( + VERBOSE + "overwriting SUFFIX at macos and windows before config by set_target_properties") + pybind11_extension(${MODULE_NAME}) endif() add_dependencies(${MODULE_NAME} mgb_opdef _version_ld) if(MGE_WITH_TEST AND MGE_ENABLE_RTTI) - add_subdirectory(test) + add_subdirectory(test) endif() add_custom_command( - TARGET ${MODULE_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/LICENSE ${PROJECT_SOURCE_DIR}/ACKNOWLEDGMENTS ${PROJECT_BINARY_DIR} - COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_SOURCE_DIR}/python/megengine/core/$ # clean develop - COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_SOURCE_DIR}/python/megengine/version.py # clean develop - COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/python/megengine ${CMAKE_CURRENT_BINARY_DIR}/python/megengine - COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/python/test ${CMAKE_CURRENT_BINARY_DIR}/python/test - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/setup.py ${CMAKE_CURRENT_BINARY_DIR}/python/setup.py - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires.txt ${CMAKE_CURRENT_BINARY_DIR}/python/requires.txt - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires-style.txt ${CMAKE_CURRENT_BINARY_DIR}/python/requires-style.txt - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires-test.txt ${CMAKE_CURRENT_BINARY_DIR}/python/requires-test.txt -) + TARGET ${MODULE_NAME} + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/LICENSE + ${PROJECT_SOURCE_DIR}/ACKNOWLEDGMENTS ${PROJECT_BINARY_DIR} + COMMAND + ${CMAKE_COMMAND} -E remove -f + ${CMAKE_CURRENT_SOURCE_DIR}/python/megengine/core/$ # clean + # develop + COMMAND ${CMAKE_COMMAND} -E remove -f + ${CMAKE_CURRENT_SOURCE_DIR}/python/megengine/version.py # clean develop + COMMAND ${CMAKE_COMMAND} -E remove -f + ${CMAKE_CURRENT_SOURCE_DIR}/python/megengine/core/include # clean develop + COMMAND ${CMAKE_COMMAND} -E remove -f + ${CMAKE_CURRENT_SOURCE_DIR}/python/megengine/core/lib # clean develop + COMMAND + ${CMAKE_COMMAND} -E copy_directory 
${CMAKE_CURRENT_SOURCE_DIR}/python/megengine + ${CMAKE_CURRENT_BINARY_DIR}/python/megengine + COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/python/test + ${CMAKE_CURRENT_BINARY_DIR}/python/test + COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/src/custom/include + ${CMAKE_CURRENT_BINARY_DIR}/python/megengine/core/include + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/setup.py + ${CMAKE_CURRENT_BINARY_DIR}/python/setup.py + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires.txt + ${CMAKE_CURRENT_BINARY_DIR}/python/requires.txt + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires-style.txt + ${CMAKE_CURRENT_BINARY_DIR}/python/requires-style.txt + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires-test.txt + ${CMAKE_CURRENT_BINARY_DIR}/python/requires-test.txt) if(DEFINED MGB_VER_MAJOR) - set(IS_INTERNAL "--internal") + set(IS_INTERNAL "--internal") else() - set(IS_INTERNAL "") + set(IS_INTERNAL "") endif(DEFINED MGB_VER_MAJOR) if(DEFINED MGE_EXTRA_NAME) - set(RC_NAME "--rc=${MGE_EXTRA_NAME}") + set(RC_NAME "--rc=${MGE_EXTRA_NAME}") else() - set(RC_NAME "") + set(RC_NAME "") endif(DEFINED MGE_EXTRA_NAME) add_custom_command( - TARGET ${MODULE_NAME} POST_BUILD - COMMAND "${PYTHON_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/python/gen_version.py --output ${CMAKE_CURRENT_BINARY_DIR}/python/megengine/version.py --major ${MGE_VER_MAJOR} --minor ${MGE_VER_MINOR} --patch ${MGE_VER_PATCH} ${RC_NAME} ${IS_INTERNAL} -) + TARGET ${MODULE_NAME} + POST_BUILD + COMMAND + "${PYTHON_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/python/gen_version.py --output + ${CMAKE_CURRENT_BINARY_DIR}/python/megengine/version.py --major ${MGE_VER_MAJOR} + --minor ${MGE_VER_MINOR} --patch ${MGE_VER_PATCH} ${RC_NAME} ${IS_INTERNAL}) diff --git a/imperative/python/megengine/__init__.py b/imperative/python/megengine/__init__.py index 38291bfd..e248be57 100644 --- a/imperative/python/megengine/__init__.py +++ b/imperative/python/megengine/__init__.py @@ -84,7 +84,7 @@ from .logger import enable_debug_log, get_logger, set_log_file, set_log_level from .serialization import load, save from .tensor import Parameter, Tensor, tensor from .utils import comp_graph_tools as cgtools -from .utils import persistent_cache +from .utils.persistent_cache import PersistentCacheOnServer as _PersistentCacheOnServer from .version import __version__ _set_fork_exec_path_for_timed_func( @@ -92,15 +92,13 @@ _set_fork_exec_path_for_timed_func( os.path.join(os.path.dirname(__file__), "utils", "_timed_func_fork_exec_entry.py"), ) -atexit.register(_close) - del _set_fork_exec_path_for_timed_func _exit_handlers = [] def _run_exit_handlers(): - for handler in _exit_handlers: + for handler in reversed(_exit_handlers): handler() _exit_handlers.clear() @@ -117,6 +115,13 @@ def _atexit(handler): _exit_handlers.append(handler) +_atexit(_close) + +_persistent_cache = _PersistentCacheOnServer() +_persistent_cache.reg() + +_atexit(_persistent_cache.flush) + # subpackages import megengine.amp import megengine.autodiff @@ -132,5 +137,3 @@ import megengine.quantization import megengine.random import megengine.utils import megengine.traced_module - -persistent_cache.get_manager() diff --git a/imperative/python/megengine/core/ops/custom.py b/imperative/python/megengine/core/ops/custom.py index b1a055fd..b60527c3 100644 --- a/imperative/python/megengine/core/ops/custom.py +++ b/imperative/python/megengine/core/ops/custom.py @@ -7,11 +7,14 
@@ # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import os + from .._imperative_rt.ops._custom import ( _get_custom_op_list, _install, _make_custom_op, _uninstall, + get_custom_op_abi_tag, ) __all__ = ["load"] @@ -25,8 +28,16 @@ def _gen_custom_op_maker(custom_op_name): def load(lib_path): - op_in_this_lib = _install(lib_path[0:-3], lib_path) + lib_path = os.path.abspath(lib_path) + lib_name = os.path.splitext(lib_path)[0] + op_in_this_lib = _install(lib_name, lib_path) for op in op_in_this_lib: op_maker = _gen_custom_op_maker(op) globals()[op] = op_maker __all__.append(op) + + +def unload(lib_path): + lib_path = os.path.abspath(lib_path) + lib_name = os.path.splitext(lib_path)[0] + _uninstall(lib_name) diff --git a/imperative/python/megengine/functional/nn.py b/imperative/python/megengine/functional/nn.py index 9ababceb..025b30ea 100644 --- a/imperative/python/megengine/functional/nn.py +++ b/imperative/python/megengine/functional/nn.py @@ -13,10 +13,12 @@ from typing import NamedTuple, Optional, Sequence, Tuple, Union from ..core import _config from ..core._imperative_rt.core2 import apply, dtype_promotion from ..core._imperative_rt.ops import SubgraphBuilder as _SubgraphBuilder +from ..core._imperative_rt.ops import get_global_rng_seed as _get_global_rng_seed from ..core.ops import builtin from ..core.ops.builtin import ( BatchNorm, Dimshuffle, + Dropout, Elemwise, GetVarShape, Identity, @@ -39,7 +41,6 @@ from ..core.tensor.utils import ( from ..device import get_default_device from ..distributed import WORLD, is_distributed from ..jit import exclude_from_trace -from ..random import uniform from ..tensor import Tensor from ..utils.deprecation import deprecated_func from ..utils.tuple_function import _pair, _pair_nonzero, _triple, _triple_nonzero @@ -77,6 +78,7 @@ __all__ = [ "max_pool2d", "one_hot", "prelu", + "pad", "relu", "relu6", "remap", @@ -1066,57 +1068,6 @@ def softmax(inp: Tensor, axis: Optional[int] = None) -> Tensor: return cached / down -@lru_cache(maxsize=None) -def _get_layerNorm(device, dtype, dim, gopt_level=2): - @subgraph("LayerNormAffine", dtype, device, 5, gopt_level=gopt_level) - def layerNormAffine(inputs, f, c): - inp, eps, _flatten_shape, weight, bias = inputs - inp_shape = f(GetVarShape(), inp) - - inp = f(Reshape(axis=dim), inp, _flatten_shape) - mean = f(Reduce(mode="mean", axis=-1), inp) - x2s = f(Reduce(mode="sum_sqr", axis=-1), inp) - reduce_shape = f(GetVarShape(), x2s) - reduce_size = f( - "//", - f(Reduce(mode="product", axis=0), inp_shape), - f(Reduce(mode="product", axis=0), reduce_shape), - ) - reduce_size_f = f(TypeCvt(dtype=dtype), reduce_size) - var = f("-", f("/", x2s, reduce_size_f), f("**", mean, c(2))) - inv_sqrt_var = f("**", f("+", var, eps), c(-0.5)) - oup = f("fma3", inp, inv_sqrt_var, f("*", f("-", mean), inv_sqrt_var)) - affine_oup = f(Reshape(), oup, inp_shape) - affine_oup = f("fma3", affine_oup, weight, bias) - - # NOTE: return oup make backward faster but take more memory - return (affine_oup, oup, mean, x2s), (True, False, False, False) - - @subgraph("LayerNorm", dtype, device, 3, gopt_level=gopt_level) - def layerNorm(inputs, f, c): - inp, eps, _flatten_shape = inputs - inp_shape = f(GetVarShape(), inp) - - inp = f(Reshape(axis=dim), inp, _flatten_shape) - mean = f(Reduce(mode="mean", axis=-1), inp) - x2s = f(Reduce(mode="sum_sqr", axis=-1), inp) - reduce_shape = f(GetVarShape(), x2s) - reduce_size = f( - "//", - 
f(Reduce(mode="product", axis=0), inp_shape), - f(Reduce(mode="product", axis=0), reduce_shape), - ) - reduce_size_f = f(TypeCvt(dtype=dtype), reduce_size) - var = f("-", f("/", x2s, reduce_size_f), f("**", mean, c(2))) - inv_sqrt_var = f("**", f("+", var, eps), c(-0.5)) - oup = f("fma3", inp, inv_sqrt_var, f("*", f("-", mean), inv_sqrt_var)) - oup = f(Reshape(), oup, inp_shape) - - return (oup,), (True,) - - return (layerNorm, layerNormAffine) - - def layer_norm( inp: Tensor, normalized_shape: tuple, @@ -1133,32 +1084,34 @@ def layer_norm( normalized_shape: the shape that you want to be normalizated affine: whether to use weight and bias weight: must not be None when the affine is true - bias: must not be None when the bias is true + bias: must not be None when the affine is true eps: a value added to the denominator for numerical stability. Default: 1e-5 """ - if amp._enabled: inp, weight, bias = cast_tensors(inp, weight, bias, promote=True) - _device = inp.device - _dtype = inp.dtype - _dim = len(inp.shape) - len(normalized_shape) + if isinstance(normalized_shape, int): + normalized_shape = [normalized_shape] - _flatten_shape = concat( - ( - convert_single_value(inp.shape[:_dim], dtype="int32", device=inp.device), - convert_single_value(-1, dtype="int32", device=inp.device), - ) - ) - (layerNorm, layerNormAffine) = _get_layerNorm(_device, _dtype, _dim) + normalized_dim = len(normalized_shape) + assert normalized_dim > 0 - eps = convert_single_value(eps, dtype=inp.dtype, device=inp.device) + normalized_size = 1 + for i in range(normalized_dim): + normalized_size = normalized_size * normalized_shape[i] + + op = builtin.LayerNorm( + affine=affine, + eps=eps, + normalized_dim=normalized_dim, + normalized_size=normalized_size, + ) if affine: - outvar, *_ = apply(layerNormAffine(), inp, eps, _flatten_shape, weight, bias) + assert weight is not None and bias is not None + return apply(op, inp, weight, bias)[0] else: - outvar, *_ = apply(layerNorm(), inp, eps, _flatten_shape) - - return outvar + # assert weight is None and bias is None + return apply(op, inp)[0] def batch_norm( @@ -1552,12 +1505,9 @@ def dropout(inp: Tensor, drop_prob: float, training: bool = True) -> Tensor: return inp # model in training mode, e.g. model.train() - rv = uniform(size=inp.shape) - mask = rv > drop_prob - ret = inp * mask.astype(inp.dtype) - ret *= 1 / (1 - drop_prob) - - return ret + op = Dropout(drop_prob=drop_prob, seed=_get_global_rng_seed(), handle=0) + outputs = apply(op, inp) + return outputs[0] def one_hot(inp: Tensor, num_classes: int) -> Tensor: diff --git a/imperative/python/megengine/functional/tensor.py b/imperative/python/megengine/functional/tensor.py index e8ff2e6a..ac29ae16 100755 --- a/imperative/python/megengine/functional/tensor.py +++ b/imperative/python/megengine/functional/tensor.py @@ -113,7 +113,7 @@ def full( data type must be inferred from ``value``. If the value is an ``int``, the output tensor data type must be the default integer data type. If the value is a ``float``, the output tensor data type must be the default - floating-point data type. If the value is a ``bool``, the output tensor + floating-point data type. If the value is a ``bool``, the output tensor must have boolean data type. Default: ``None``. device: device on which to place the created tensor. Default: ``None``. @@ -195,77 +195,65 @@ def ones( return full(shape, 1.0, dtype=dtype, device=device) -def zeros(shape, dtype="float32", device=None) -> Tensor: - r"""Returns a zero tensor with given shape. 
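A minimal usage sketch for the layer_norm and dropout rewrites above, assuming the signatures introduced in this patch (accessed here through the functional.nn submodule; the exact re-exported name may vary):

    import numpy as np
    import megengine as mge
    import megengine.functional as F

    x = mge.tensor(np.random.randn(4, 16).astype("float32"))
    w = mge.tensor(np.ones(16, dtype="float32"))
    b = mge.tensor(np.zeros(16, dtype="float32"))

    # both calls now dispatch to the built-in LayerNorm / Dropout ops
    y = F.nn.layer_norm(x, (16,), affine=True, weight=w, bias=b, eps=1e-5)
    z = F.nn.dropout(x, drop_prob=0.2, training=True)
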
+def zeros( + shape: Union[int, Tuple[int, ...]], + *, + dtype="float32", + device: Optional[CompNode] = None +) -> Tensor: + r"""Returns a new tensor having a specified shape and filled with zeros. Args: - shape: a list, tuple or integer defining the shape of the output tensor. - dtype: the desired data type of the output tensor. Default: ``float32``. - device: the desired device of the output tensor. Default: if ``None``, - use the default device (see :func:`~.megengine.get_default_device`). + shape (int or sequence of ints): the shape of the output tensor. + + Keyword args: + dtype (:attr:`.Tensor.dtype`): output tensor data type. Default: ``float32``. + device (:attr:`.Tensor.device`): device on which to place the created tensor. Default: ``None``. + + Returns: + a tensor containing zeros. + + Examples: + >>> F.zeros((2, 1)) + Tensor([[0.] + [0.]], device=xpux:0) """ return full(shape, 0.0, dtype=dtype, device=device) def zeros_like(inp: Union[Tensor, SymbolVar]) -> Union[Tensor, SymbolVar]: - r"""Returns a zero tensor with the same shape as input tensor. + r"""Returns a tensor filled with zeros with the same shape and data type as input tensor. Args: - inp: input tensor. + inp (Tensor): input tensor. Return: - output tensor. + a tensor containing zeros. Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - - inp = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3)) - out = F.zeros_like(inp) - print(out.numpy()) - - Outputs: - - .. testoutput:: - - [[0 0 0] - [0 0 0]] - + >>> input = F.arange(9, dtype='int32').reshape(3,3) + >>> F.zeros_like(input) + Tensor([[0 0 0] + [0 0 0] + [0 0 0]], dtype=int32, device=xpux:0) """ return full_like(inp, 0.0) def ones_like(inp: Union[Tensor, SymbolVar]) -> Union[Tensor, SymbolVar]: - r"""Returns a ones tensor with the same shape as input tensor. + r"""Returns a tensor filled with ones with the same shape and data type as input tensor. Args: - inp: input tensor. + inp (Tensor): input tensor. Return: - output tensor. + a tensor containing ones. Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - - inp = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3)) - out = F.ones_like(inp) - print(out.numpy()) - - Outputs: - - .. testoutput:: - - [[1 1 1] - [1 1 1]] + >>> input = F.arange(6, dtype='int32').reshape(2,3) + >>> F.ones_like(input) + Tensor([[1 1 1] + [1 1 1]], dtype=int32, device=xpux:0) """ return full_like(inp, 1.0) @@ -1094,18 +1082,18 @@ def arange( dtype="float32", device: Optional[CompNode] = None, ) -> Tensor: - r"""Returns evenly spaced values within the half-open interval ``[start, stop)`` as a one-dimensional tensor. + r"""Returns evenly spaced values within the half-open interval ``[start, stop)`` as a one-dimensional tensor. Note: - This function cannot guarantee that the interval does not include the stop value in those cases + This function cannot guarantee that the interval does not include the stop value in those cases where step is not an integer and floating-point rounding errors affect the length of the output tensor. Args: - start: if ``stop`` is specified, the start of interval (inclusive); otherwise, - the end of the interval (exclusive). If ``stop`` is not specified, the default starting value is ``0``. + start: if ``stop`` is specified, the start of interval (inclusive); otherwise, + the end of the interval (exclusive). If ``stop`` is not specified, the default starting value is ``0``. 
stop: the end of the interval. Default: ``None``. - step: the distance between two adjacent elements ( ``out[i+1] - out[i]`` ). Must not be 0 ; - may be negative, this results i an empty tensor if stop >= start . Default: 1 . + step: the distance between two adjacent elements ( ``out[i+1] - out[i]`` ). Must not be 0 ; + may be negative, this results i an empty tensor if stop >= start . Default: 1 . Keyword args: dtype( :attr:`.Tensor.dtype` ): output tensor data type. Default: ``float32``. @@ -1114,7 +1102,7 @@ def arange( Returns: A one-dimensional tensor containing evenly spaced values. - The length of the output tensor must be ``ceil((stop-start)/step)`` + The length of the output tensor must be ``ceil((stop-start)/step)`` if ``stop - start`` and ``step`` have the same sign, and length 0 otherwise. Examples: diff --git a/imperative/python/megengine/functional/vision.py b/imperative/python/megengine/functional/vision.py index 7ab6bf24..0fd905ee 100644 --- a/imperative/python/megengine/functional/vision.py +++ b/imperative/python/megengine/functional/vision.py @@ -420,6 +420,7 @@ def warp_affine( Here all available options for params are listed, however it does not mean that you can use all the combinations. On different platforms, different combinations are supported. + ``warp_affine`` only support forward inference, Please refer to ``warp_perspective`` if backward is needed. """ conv_format = _config._get_actual_op_param(format, _config.__conv_format) diff --git a/imperative/python/megengine/jit/tracing.py b/imperative/python/megengine/jit/tracing.py index b93fa027..19f51544 100644 --- a/imperative/python/megengine/jit/tracing.py +++ b/imperative/python/megengine/jit/tracing.py @@ -104,6 +104,7 @@ class TensorInfo: "shape", "is_const", "bound_data", + "bound_data_numpy", # resources for execution "varnode", "data_setter", @@ -119,12 +120,18 @@ class TensorInfo: self.shape_read = None self.value_read = None self.bound_data = None + self.bound_data_numpy = None self.data_setter = None self.shape_reader = None self.value_reader = None self.data_reader = None + def get_numpy(self): + if self.bound_data_numpy is None: + self.bound_data_numpy = self.bound_data.numpy() + return self.bound_data_numpy + _io_op_types = {AssertEqual, CollectiveComm, RemoteSend, RemoteRecv} @@ -292,7 +299,7 @@ class trace: # Const op is represented by a str assert isinstance(op_, str) and op_ == "Const" - expected = self._tinfo[ohandles[0]].bound_data.numpy() + expected = self._tinfo[ohandles[0]].get_numpy() shape = value.shape if shape != expected.shape or dtype != expected.dtype: eq = False @@ -369,6 +376,7 @@ class trace: info.dtype = x.dtype info.shape = x.shape info.bound_data = x + info.bound_data_numpy = None info.is_const = True x._mixin_handle = h x._recording = True @@ -612,9 +620,7 @@ class trace: assert info.external assert info.bound_data info.varnode = graph.make_const( - info.bound_data.numpy(), - info.bound_data.dtype, - info.bound_data.device, + info.get_numpy(), info.bound_data.dtype, info.bound_data.device, ) continue @@ -627,7 +633,7 @@ class trace: if info.bound_data: if getattr(info, "is_const", False): info.varnode = graph.make_const( - info.bound_data.numpy(), + info.get_numpy(), info.bound_data.dtype, info.bound_data.device, ) @@ -1060,7 +1066,8 @@ class trace: resize_input: whether resize input image to fit input var shape. input_transform: a python expression to transform the input data. Example: data / np.std(data) - dump_format: using different dump formats. 
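The TensorInfo.get_numpy change above is a small memoization: the host copy of a bound constant is materialized once and reused across trace/dump passes instead of calling .numpy() repeatedly. In isolation the pattern looks like this sketch (class and attribute names here are illustrative):

    class CachedHostValue:
        """Illustrative stand-in: convert a device tensor to numpy once, then reuse it."""

        def __init__(self, bound_data):
            self.bound_data = bound_data      # device tensor
            self.bound_data_numpy = None      # lazily filled host copy

        def get_numpy(self):
            if self.bound_data_numpy is None:
                self.bound_data_numpy = self.bound_data.numpy()
            return self.bound_data_numpy
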
+ dump_format: using different dump formats. the open source MegEngine defaults to the FBS + format. internal MegEngine have a choice of FBS and internal proprietary formats Keyword Arguments: @@ -1173,7 +1180,7 @@ class trace: assert info.external assert info.bound_data h2v[h] = graph.make_const( - info.bound_data.numpy(), + info.get_numpy(), dtype=info.dtype, device=dumped_device(info), name=info.name, @@ -1186,7 +1193,7 @@ class trace: assert info.external assert info.bound_data h2v[h] = graph.make_const( - info.bound_data.numpy(), + info.get_numpy(), dtype=info.dtype, device=dumped_device(info), name=info.name, diff --git a/imperative/python/megengine/traced_module/__init__.py b/imperative/python/megengine/traced_module/__init__.py index c906b879..6bbdc668 100644 --- a/imperative/python/megengine/traced_module/__init__.py +++ b/imperative/python/megengine/traced_module/__init__.py @@ -9,6 +9,8 @@ from ..core._imperative_rt.core2 import set_cpp_apply_module_trace from . import compat from ._passes import optimize +from .pytree import register_supported_type +from .tm_config import disable_default_checker, enable_expr_checker from .traced_module import ( TracedModule, _register_all_builtin_module, @@ -23,8 +25,11 @@ set_cpp_apply_module_trace(cpp_apply_module_trace) __all__ = [ "register_as_builtin", + "register_supported_type", "trace_module", "wrap", "TracedModule", "optimize", + "enable_expr_checker", + "disable_default_checker", ] diff --git a/imperative/python/megengine/traced_module/_passes/const_pass.py b/imperative/python/megengine/traced_module/_passes/const_pass.py index 143a704c..0ff3571b 100644 --- a/imperative/python/megengine/traced_module/_passes/const_pass.py +++ b/imperative/python/megengine/traced_module/_passes/const_pass.py @@ -12,7 +12,7 @@ from ...core.ops.builtin import GetVarShape from ...logger import get_logger from ...tensor import Tensor from ..expr import Constant, Expr, is_apply_def, is_constant, is_getattr -from ..node import Node, TensorNode +from ..node import Node, NodeMixin, TensorNode from .matcher import PatternMatcher from .pass_base import BackwardPass, ForwardPass, register_pass from .pattern import is_op @@ -21,6 +21,12 @@ from .utils import get_const_value logger = get_logger(__name__) +def _as_const_node(x): + node = Constant.make(x) + NodeMixin.wrap(x, node) + return node + + @register_pass("AttrToConstant") class AttrToConstant(BackwardPass): r"""Convert :class:`~.GetAttr` to :class:`~.Constant` expr.""" @@ -35,10 +41,10 @@ class AttrToConstant(BackwardPass): orig_node = expr.outputs[0] name = orig_node.name with graph.insert_exprs(expr): - const_node = Constant.make(value, name=name) + const_node = _as_const_node(value) graph.replace_node({orig_node: const_node}) graph.compile() - name = orig_node.name + const_node.name = name return const_node.expr @@ -53,7 +59,7 @@ class FixInputShape(BackwardPass): shape = Tensor(expr.inputs[0].shape, dtype="int32") graph = expr.top_graph with graph.insert_exprs(expr): - const_shape = Constant.make(shape) + const_shape = _as_const_node(shape) graph.replace_node({expr.outputs[0]: const_shape}) graph.compile() const_shape.name = expr.outputs[0].name @@ -73,7 +79,7 @@ class FlodConstant(ForwardPass): const_var = expr.interpret(*[get_const_value(n.expr) for n in expr.inputs])[0] graph = expr.top_graph with graph.insert_exprs(expr): - const_node = Constant.make(const_var) + const_node = _as_const_node(const_var) graph.replace_node({expr.outputs[0]: const_node}) graph.compile() const_node.name = 
expr.outputs[0].name diff --git a/imperative/python/megengine/traced_module/checker.py b/imperative/python/megengine/traced_module/checker.py new file mode 100644 index 00000000..31fa0470 --- /dev/null +++ b/imperative/python/megengine/traced_module/checker.py @@ -0,0 +1,142 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2021 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import traceback +from typing import Sequence + +import numpy as np + +from ..core._imperative_rt.core2 import apply +from ..core._imperative_rt.ops import ROIAlign, ROIPooling +from ..core.ops.builtin import Copy +from ..core.tensor.utils import isscalar, setscalar +from ..tensor import Tensor +from .tm_config import _exclude_from_trace + + +class TracedModuleChecker: + def __init__(self, tracer): + self._active_node2values = [] + self.tracer = tracer + + self.node_without_tensor_info = {} + + def push_scope(self): + self._active_node2values.append({}) + + def pop_scope(self): + self._active_node2values.pop() + + def current_node2values(self): + return self._active_node2values[-1] + + def reset_checker(self): + self._active_node2values = [] + + def check_node_not_in_scope(self): + if self.node_without_tensor_info: + for node, info in self.node_without_tensor_info.items(): + for expr in info[0]._exprs: + if node in expr.inputs or node in expr.outputs: + traceback.print_list(info[1]) + raise ValueError( + "node({}) not in the graph:\n{}".format(node, info[0]) + ) + return True + else: + return False + + def check_net_outputs(self, tm_res, gt_res): + if isinstance(tm_res, Tensor): + np.testing.assert_allclose(tm_res.numpy(), gt_res.numpy()) + elif isinstance(tm_res, Sequence): + for i, j in zip(tm_res, gt_res): + np.testing.assert_allclose(i.numpy(), j.numpy()) + else: + for k in tm_res.__dict__.keys(): + np.testing.assert_allclose( + getattr(tm_res, k).numpy(), getattr(gt_res, k).numpy() + ) + + def record_nodemixin(self, node, value): + self.current_node2values()[node] = value + + def record_node2value(self, node, value): + with _exclude_from_trace(): + self.current_node2values()[node] = apply( + Copy(comp_node=value.device), value + )[0] + if isscalar(value): + setscalar(self.current_node2values()[node]) + + def check_apply_special_cases(self, opdef, num_outputs): + indexs = list(range(num_outputs)) + if isinstance(opdef, ROIAlign) and opdef.mode == ROIAlign.Mode.AVERAGE: + indexs.pop(-1) + if isinstance(opdef, ROIPooling) and opdef.mode == ROIPooling.Mode.AVERAGE: + indexs.pop(-1) + return indexs + + def check_expr_results(self, expr_outputs, gt_outputs, indexs=None): + expr_outputs = ( + (expr_outputs,) if not isinstance(expr_outputs, Sequence) else expr_outputs + ) + gt_outputs = ( + (gt_outputs,) if not isinstance(gt_outputs, Sequence) else gt_outputs + ) + if indexs is not None: + for i in indexs: + np.testing.assert_allclose( + expr_outputs[i].numpy(), gt_outputs[i].numpy() + ) + else: + np.testing.assert_allclose(expr_outputs, gt_outputs) + + def get_node2value(self, inputs, start_idx=0): + inp_values = [] + has_node_not_in_scope = False + for i in range(start_idx, len(inputs)): + try: + inp_values.append(self.current_node2values()[inputs[i]]) + except: + has_node_not_in_scope = True + self.node_without_tensor_info[inputs[i]] = [ + 
self.tracer.current_scope(), + traceback.extract_stack(), + ] + return inp_values, has_node_not_in_scope + + def check_expr_interpret(self, expr, gt_outputs): + ori_in, has_node_not_in_scope = self.get_node2value(expr.inputs) + if not has_node_not_in_scope: + expr_res = expr.interpret(*ori_in) + try: + self.check_expr_results(expr_res, gt_outputs) + except: + raise ValueError("Error occurred when checking expr: {}".format(expr)) + + def check_apply(self, expr, gt_outputs, opdef): + ori_in, has_node_not_in_scope = self.get_node2value(expr.inputs) + if not has_node_not_in_scope: + expr_res = expr.interpret(*ori_in) + indexs = self.check_apply_special_cases(opdef, len(gt_outputs)) + try: + self.check_expr_results(expr_res, gt_outputs, indexs=indexs) + except: + raise ValueError("Error occurred when checking expr: {}".format(expr)) + + def check_builtin_module(self, module, expr, gt_outputs): + ori_in, has_node_not_in_scope = self.get_node2value(expr.inputs, start_idx=1) + if not has_node_not_in_scope: + ori_in.insert(0, module) + expr_res = expr.interpret(*ori_in) + try: + self.check_expr_results(expr_res, gt_outputs) + except: + raise ValueError( + "{}, Error occurred when checking expr: {}".format(expr) + ) diff --git a/imperative/python/megengine/traced_module/expr.py b/imperative/python/megengine/traced_module/expr.py index b7e7c077..c22249fc 100644 --- a/imperative/python/megengine/traced_module/expr.py +++ b/imperative/python/megengine/traced_module/expr.py @@ -32,6 +32,7 @@ from .module_tracer import active_module_tracer, module_tracer from .node import ModuleNode, Node, NodeMixin, TensorNode from .pytree import ArgsIndex, TreeDef, _is_const_leaf, _is_leaf, tree_flatten from .serialization import _ModuleState +from .tm_config import _exclude_from_trace, _get_expr_checker from .utils import _check_builtin_module_attr, _check_obj_attr, _convert_kwargs_to_args @@ -611,6 +612,8 @@ class Apply(Expr): inp_nodes = [NodeMixin.get(inputs[0])] for i in inputs[1:]: node = Constant.make(i) + if _get_expr_checker(): + active_module_tracer().checker.record_node2value(node, Tensor(i)) inp_nodes.append(node) apply_node = cls.make(opdef) for n in inp_nodes: @@ -624,11 +627,17 @@ class Apply(Expr): unset_module_tracing() outputs = apply(opdef, *inputs) + outputs = list(map(Tensor, outputs)) set_module_tracing() apply_node.add_outputs(outputs) for n, v in zip(apply_node.outputs, outputs): NodeMixin.wrap_safe(v, n) + + if _get_expr_checker(): + with _exclude_from_trace(): + active_module_tracer().checker.check_apply(apply_node, outputs, opdef) + return list(outputs) @@ -754,6 +763,7 @@ class Constant(Expr): current_graph = active_module_tracer().current_scope() current_graph._namespace.auto_naming_for_outputs(expr) current_graph._insert(expr) + active_module_tracer().current_constant_cache().append(expr.value) return expr.outputs[0] def interpret(self, *inputs): diff --git a/imperative/python/megengine/traced_module/module_tracer.py b/imperative/python/megengine/traced_module/module_tracer.py index db2bf055..70a020f4 100644 --- a/imperative/python/megengine/traced_module/module_tracer.py +++ b/imperative/python/megengine/traced_module/module_tracer.py @@ -12,6 +12,7 @@ from .. 
import functional as F from ..core.tensor.array_method import ArrayMethodMixin from ..module import Module from ..module.qat import QATModule +from .checker import TracedModuleChecker _active_module_tracer = None @@ -92,7 +93,6 @@ BUILTIN_TENSOR_WRAP_METHOD = [ "dtype", "grad", "item", - "name", "ndim", "numpy", "qparams", @@ -129,7 +129,9 @@ class module_tracer: def __init__(self, wrap_fn): self._active_scopes = [] + self.checker = TracedModuleChecker(self) self.patcher = Patcher(wrap_fn) + self._activate_constant_cache = [] @classmethod def register_as_builtin(cls, mod): @@ -143,15 +145,32 @@ class module_tracer: def push_scope(self, scope): self._active_scopes.append(scope) + self.checker.push_scope() + self._activate_constant_cache.append([]) def pop_scope(self): self._active_scopes.pop() + self.checker.pop_scope() + cache = self._activate_constant_cache.pop() + for obj in cache: + if hasattr(obj, "_NodeMixin__node"): + delattr(obj, "_NodeMixin__node") def current_scope(self): if self._active_scopes: return self._active_scopes[-1] return None + def current_constant_cache(self): + if self._activate_constant_cache: + return self._activate_constant_cache[-1] + return None + + def top_scope(self): + if self._active_scopes: + return self._active_scopes[0] + return None + class NotExist: pass diff --git a/imperative/python/megengine/traced_module/node.py b/imperative/python/megengine/traced_module/node.py index 4bfff4af..079ee46e 100644 --- a/imperative/python/megengine/traced_module/node.py +++ b/imperative/python/megengine/traced_module/node.py @@ -18,6 +18,8 @@ from ..core._imperative_rt.core2 import Tensor as RawTensor from ..module import Module from ..quantization.utils import QParams from ..tensor import Tensor +from .module_tracer import active_module_tracer +from .tm_config import _get_expr_checker from .utils import _check_obj_attr logger = get_logger(__name__) @@ -343,6 +345,11 @@ class NodeMixin(abc.ABC): if isinstance(value, NodeMixin): value._record_wrapped_nodes(node) setattr(value, "_NodeMixin__node", node) + if _get_expr_checker(): + if isinstance(value, RawTensor): + active_module_tracer().checker.record_node2value(node, value) + if isinstance(value, NodeMixin): + active_module_tracer().checker.record_nodemixin(node, value) else: assert callable(node) n = node() @@ -352,6 +359,11 @@ class NodeMixin(abc.ABC): if isinstance(value, NodeMixin): value._record_wrapped_nodes(n) setattr(value, "_NodeMixin__node", n) + if _get_expr_checker(): + if isinstance(value, RawTensor): + active_module_tracer().checker.record_node2value(n, value) + if isinstance(value, NodeMixin): + active_module_tracer().checker.record_nodemixin(n, value) @classmethod def wrap_safe(cls, value, node): @@ -359,10 +371,20 @@ class NodeMixin(abc.ABC): if isinstance(value, RawTensor): cls._record_tensornode_property(node, value) setattr(value, "_NodeMixin__node", node) + if _get_expr_checker(): + if isinstance(value, RawTensor): + active_module_tracer().checker.record_node2value(node, value) + if isinstance(value, NodeMixin): + active_module_tracer().checker.record_nodemixin(node, value) if isinstance(value, NodeMixin): value._record_wrapped_nodes(node) @classmethod + def clear_node(cls, value): + if hasattr(value, "_NodeMixin__node"): + delattr(value, "_NodeMixin__node") + + @classmethod def get(cls, value, *default): return getattr(value, "_NodeMixin__node", *default) diff --git a/imperative/python/megengine/traced_module/pytree.py b/imperative/python/megengine/traced_module/pytree.py index 98d19f1e..c4b132fa 
100644 --- a/imperative/python/megengine/traced_module/pytree.py +++ b/imperative/python/megengine/traced_module/pytree.py @@ -10,7 +10,7 @@ import collections from collections import OrderedDict, defaultdict from functools import partial from inspect import FullArgSpec -from typing import Callable, NamedTuple +from typing import Any, Callable, Dict, List, NamedTuple, Tuple import numpy as np @@ -46,6 +46,8 @@ SUPPORTED_LEAF_TYPE = { int, float, bool, + bytes, + bytearray, QuantDtypeMeta, CompNode, Device, @@ -74,18 +76,51 @@ SUPPORTED_LEAF_CLS = [ NodeType = NamedTuple("NodeType", [("flatten", Callable), ("unflatten", Callable)]) -def register_supported_type(type, flatten=None, unflatten=None): +def register_supported_type( + type, + flatten_fn: Callable[[Any], Tuple[List, Any]] = None, + unflatten_fn: Callable[[List, Any], Any] = None, +): + r"""Call this function to register the ``type`` as a built-in type. The registered ``type`` + can be used and serialized correctly in :py:class:`TracedModule`. + + Examples: + .. code-block:: + + def dict_flatten(obj: Dict): + context, values = [], [] + # obj.keys() needs to be sortable + keys = sorted(obj.keys()) + for key in keys: + values.append(obj[key]) + context.append(key) + return values, tuple(context) + + def dict_unflatten(values: List, context: Any): + return dict(zip(context, values)) + + register_supported_type(dict, dict_flatten, dict_unflatten) + + Args: + type: the type that needs to be registered. + flatten_fn: a function that should take an object created from ``type`` and return a + flat list of values. It can also return some context that is used in reconstructing + the object. Default: None + unflatten_fn: a function that should take a flat list of values and some context + (returned by flatten_fn). It returns the object by reconstructing + it from the list and the context. Default: None + """ tp_info = (type.__module__, type.__qualname__) - if flatten and unflatten: + if flatten_fn and unflatten_fn: USER_REGISTERED_CONTAINER_TYPE.append(tp_info) else: USER_REGISTERED_LEAF_TYPE.append(tp_info) - _register_supported_type(type, flatten, unflatten) + _register_supported_type(type, flatten_fn, unflatten_fn) -def _register_supported_type(type, flatten=None, unflatten=None): - if flatten and unflatten: - SUPPORTED_TYPE[type] = NodeType(flatten, unflatten) +def _register_supported_type(type, flatten_fn=None, unflatten_fn=None): + if flatten_fn and unflatten_fn: + SUPPORTED_TYPE[type] = NodeType(flatten_fn, unflatten_fn) else: SUPPORTED_LEAF_CLS.append(type) @@ -131,6 +166,7 @@ _register_supported_type( _register_supported_type( OrderedDict, partial(_dict_flatten, True), partial(_dict_unflatten, OrderedDict) ) + _register_supported_type( slice, lambda x: ([x.start, x.stop, x.step], None), @@ -176,7 +212,11 @@ def tree_flatten( to reconstruct the pytree. 
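Complementing the ``dict`` example in the ``register_supported_type`` docstring above, omitting ``flatten_fn``/``unflatten_fn`` registers the type as a plain leaf; a minimal sketch with a hypothetical user-defined class:

.. code-block:: python

    from megengine.traced_module import register_supported_type

    class PreprocessCfg:                     # hypothetical user-defined type
        def __init__(self, mean, std):
            self.mean, self.std = mean, std

    # no flatten_fn/unflatten_fn: instances are treated as leaf values, so they
    # can appear in traced inputs/attributes and survive TracedModule serialization.
    register_supported_type(PreprocessCfg)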
""" if type(values) not in SUPPORTED_TYPE: - assert is_leaf(values), values + assert is_leaf( + values + ), 'doesn\'t support {} type, MUST use "register_supported_type" method to register self-defined type'.format( + values + ) node = LeafDef(leaf_type(values)) if is_const_leaf(values): node.const_val = values @@ -244,8 +284,43 @@ class TreeDef: and self.children_defs == other.children_defs ) + def _args_kwargs_repr(self): + if ( + len(self.children_defs) == 2 + and issubclass(self.children_defs[0].type, (List, Tuple)) + and issubclass(self.children_defs[1].type, Dict) + ): + args_def = self.children_defs[0] + content = ", ".join(repr(i) for i in args_def.children_defs) + kwargs_def = self.children_defs[1] + if kwargs_def.aux_data: + content += ", " + content += ", ".join( + str(i) + "=" + repr(j) + for i, j in zip(kwargs_def.aux_data, kwargs_def.children_defs) + ) + return content + else: + return repr(self) + def __repr__(self): - return "{}[{}]".format(self.type.__name__, self.children_defs) + format_str = self.type.__name__ + "({})" + aux_data_delimiter = "=" + if issubclass(self.type, List): + format_str = "[{}]" + if issubclass(self.type, Tuple): + format_str = "({})" + if issubclass(self.type, Dict): + format_str = "{{{}}}" + aux_data_delimiter = ":" + if self.aux_data: + content = ", ".join( + repr(i) + aux_data_delimiter + repr(j) + for i, j in zip(self.aux_data, self.children_defs) + ) + else: + content = ", ".join(repr(i) for i in self.children_defs) + return format_str.format(content) class LeafDef(TreeDef): @@ -275,6 +350,9 @@ class LeafDef(TreeDef): return hash(tuple([self.type, self.const_val])) def __repr__(self): - return "Leaf({}[{}])".format( - ", ".join(t.__name__ for t in self.type), self.const_val + + return "{}".format( + self.const_val + if self.const_val is not None or type(None) in self.type + else self.type[0].__name__ ) diff --git a/imperative/python/megengine/traced_module/tm_config.py b/imperative/python/megengine/traced_module/tm_config.py new file mode 100644 index 00000000..6453a05e --- /dev/null +++ b/imperative/python/megengine/traced_module/tm_config.py @@ -0,0 +1,55 @@ +import contextlib + +from ..core._imperative_rt.core2 import ( + is_tracing_module, + set_module_tracing, + unset_module_tracing, +) + +_enable_expr_checker = False +_enable_default_checker = True + + +def _get_expr_checker(): + return _enable_expr_checker + + +def _get_default_checker(): + return _enable_default_checker + + +def enable_expr_checker(): + r"""Call this function to check the result of each expr during tracing.""" + global _enable_expr_checker + _enable_expr_checker = True + _enable_default_checker = False + + +def disable_default_checker(): + r"""Call this function to disable checking the final output of the model after tracing.""" + global _enable_default_checker + _enable_default_checker = False + + +_enable_graph_surgery_mode = False + + +def _graph_surgery_mode(): + return _enable_graph_surgery_mode + + +def _set_graph_surgery_mode(mode: bool): + global _enable_graph_surgery_mode + pre_mode = _enable_graph_surgery_mode + _enable_graph_surgery_mode = mode + return pre_mode + + +@contextlib.contextmanager +def _exclude_from_trace(): + is_tracing = is_tracing_module() + if is_tracing: + unset_module_tracing() + yield + if is_tracing: + set_module_tracing() diff --git a/imperative/python/megengine/traced_module/traced_module.py b/imperative/python/megengine/traced_module/traced_module.py index 8092f6c9..670ab7e9 100644 --- 
a/imperative/python/megengine/traced_module/traced_module.py +++ b/imperative/python/megengine/traced_module/traced_module.py @@ -36,12 +36,16 @@ from .. import get_logger from .. import module as M from ..core._imperative_rt.core2 import Tensor as RawTensor from ..core._imperative_rt.core2 import ( + apply, is_tracing_module, set_module_tracing, unset_module_tracing, ) from ..core._trace_option import set_symbolic_shape +from ..core.ops.builtin import Copy +from ..core.tensor.utils import isscalar, setscalar from ..module import Module +from ..module import external as MExternal from ..module.qat import QATModule from ..quantization.fake_quant import LSQ, TQT, FakeQuantize, _FakeQuantize from ..quantization.observer import ( @@ -54,6 +58,7 @@ from ..quantization.observer import ( SyncMinMaxObserver, ) from ..tensor import Tensor +from ..utils.max_recursion_limit import max_recursion_limit from ..version import __version__ from .expr import ( Apply, @@ -97,6 +102,13 @@ from .serialization import ( load_call_tensor_method_expr, load_functional, ) +from .tm_config import ( + _exclude_from_trace, + _get_default_checker, + _get_expr_checker, + _graph_surgery_mode, + _set_graph_surgery_mode, +) from .utils import ( _check_builtin_module_attr, _check_obj_attr, @@ -116,26 +128,14 @@ def _is_builtin_name(name: str) -> bool: def _is_leaf(node): - assert isinstance(node, RawTensor), "doesn't support {} in return values".format( + assert isinstance( + node, RawTensor + ), 'doesn\'t support {} in return values, MUST use Tensor or use "register_supported_type" method to register self-defined type'.format( type(node) ) return isinstance(node, RawTensor) -_enable_graph_surgery_mode = False - - -def _graph_surgery_mode(): - return _enable_graph_surgery_mode - - -def _set_graph_surgery_mode(mode: bool): - global _enable_graph_surgery_mode - pre_mode = _enable_graph_surgery_mode - _enable_graph_surgery_mode = mode - return pre_mode - - def _node_to_tensor(*args, **kwargs): tensors = [] nodes, tree_def = tree_flatten((args, kwargs)) @@ -179,6 +179,25 @@ def _tensor_to_node(tensors): return nodes +def _name_setter(node: Node, new_name: str): + surgery_mode = _set_graph_surgery_mode(False) + graph = active_module_tracer().current_scope() + + if node.top_graph is not None: + top_graph = active_module_tracer().top_scope() + if node is top_graph._namespace.used_names.get(node._name, None): + graph = top_graph + else: + graph = node.top_graph + + assert ( + graph._namespace.used_names.get(new_name, None) is None + ), "The name(%s) is already in use. Please try a different one again." 
% (new_name) + graph._namespace.unassociate_name_with_obj(node) + node._name = graph._namespace.create_unique_name(new_name, node) + _set_graph_surgery_mode(surgery_mode) + + def _wrap_method_to_tensor_node(): def _any_method(name, func): def _any(*args, **kwargs): @@ -207,10 +226,15 @@ def _wrap_method_to_tensor_node(): for method in get_tensor_wrapable_method(): patch = PatchedFn(TensorNode, method) if type(getattr(Tensor, method)) == property: + # Only support property.getter patch.set_func(property(_any_method(method, patch.origin_fn))) else: patch.set_func(_any_method(method, patch.origin_fn)) tensor_method_patch.append(patch) + + patch = PatchedFn(Node, "name") + patch.set_func(property(patch.origin_fn.fget, _name_setter)) + tensor_method_patch.append(patch) return tensor_method_patch @@ -351,14 +375,14 @@ class _InsertExprs: assert ( node.top_graph == self.graph ), "The input node ({}) is not in the graph ({})".format(node, self.graph) - if isinstance(node, TensorNode) and node.expr in self.graph._exprs: + if node.expr in self.graph._exprs: max_inp_expr_idx = max( max_inp_expr_idx, self.graph._exprs.index(node.expr) ) max_inp_expr_idx += 1 insert_index = -1 - if self.expr is not None: + if self.expr in self.graph._exprs: insert_index = self.graph._exprs.index(self.expr) insert_index += 1 @@ -1224,17 +1248,18 @@ class InternalGraph: return result def __deepcopy__(self, memo): - if id(self) in memo: - return memo[id(self)] - cls = self.__class__ - result = cls.__new__(cls) - state = {} - memo[id(self)] = result - for k, v in self.__dict__.items(): - if not isinstance(v, weakref.ReferenceType): - state[k] = copy.deepcopy(v, memo) - result.__dict__.update(state) - return result + with max_recursion_limit(): + if id(self) in memo: + return memo[id(self)] + cls = self.__class__ + result = cls.__new__(cls) + state = {} + memo[id(self)] = result + for k, v in self.__dict__.items(): + if not isinstance(v, weakref.ReferenceType): + state[k] = copy.deepcopy(v, memo) + result.__dict__.update(state) + return result def _get_meth_name(obj, func): @@ -1270,7 +1295,12 @@ def _wrapped_function(orig_func): return orig_func(*args, **kwargs) if isinstance(args[1], RawTensor): node = NodeMixin.get(inputs[1]) - inputs[1] = copy.copy(inputs[1]) + is_scalar = isscalar(inputs[1]) + inputs[1] = apply( + Copy(comp_node=inputs[1].device), Tensor(inputs[1]) + )[0] + if is_scalar: + setscalar(inputs[1]) # copy inputs[1] to avoid tensor and Tensor(tensor) share same m_tensor, # which will cause they have same _NodeMixin__node in tracing. 
NodeMixin.wrap_safe(inputs[1], node) @@ -1294,6 +1324,13 @@ def _wrapped_function(orig_func): else: outputs = None call_node.add_outputs(outputs) + + if _get_expr_checker(): + with _exclude_from_trace(): + active_module_tracer().checker.check_expr_interpret( + call_node, outputs + ) + set_module_tracing() return rst return orig_func(*args, **kwargs) @@ -1475,6 +1512,12 @@ class TracedModuleBuilder(NodeMixin): unset_module_tracing() rst = self._mod(*args, **kwargs) outputs, out_def = tree_flatten(rst, is_leaf=_is_leaf) + if _get_expr_checker(): + with _exclude_from_trace(): + tmp = self.build() + active_module_tracer().checker.check_builtin_module( + tmp, callnode, outputs + ) set_module_tracing() if self._is_builtin: self._body = None @@ -1649,7 +1692,9 @@ class TracedModuleBuilder(NodeMixin): if not isinstance(mod_attr, (List, Dict, QATModule)): assert mod_attr is wrapped._mod else: - assert mod_attr is wrapped + assert ( + mod_attr is wrapped + ), "TracedModule do not support modify attributes, please check your code." if isinstance(wrapped, (NodeMixin, RawTensor)): NodeMixin.wrap( @@ -1934,7 +1979,15 @@ class TracedModule(Module): if hasattr(self, "argspec") and self.argspec is not None: args, kwargs = _convert_kwargs_to_args(self.argspec, args, kwargs, True) inputs, treedef = tree_flatten(((self, *args), kwargs)) - assert treedef in self.argdef_graph_map + assert ( + treedef in self.argdef_graph_map + ), "support input args kwargs format: \n{}, but get: \n{}".format( + "\n ".join( + "forward({})".format(i._args_kwargs_repr()) + for i in self.argdef_graph_map.keys() + ), + treedef._args_kwargs_repr(), + ) inputs = filter( lambda i: isinstance(i, (Module, TracedModuleBuilder, RawTensor)), inputs ) # allow TracedModuleBuilder for retrace. @@ -2070,7 +2123,8 @@ class TracedModule(Module): for inp_def, graph in self.argdef_graph_map.items(): if top_graph is not None: graph._top_graph = weakref.ref(top_graph) - for n in graph._inputs + graph.outputs: + for n in graph._inputs + graph._outputs: + n.expr._top_graph = weakref.ref(graph) n._top_graph = weakref.ref(graph) graph._inputs[0]._owner = weakref.ref(self) for i, n in enumerate(graph._inputs): @@ -2307,16 +2361,17 @@ class TracedModule(Module): return result def __deepcopy__(self, memo): - cls = self.__class__ - result = cls.__new__(cls) - state = {} - memo[id(self)] = result - for k, v in self.__dict__.items(): - if not isinstance(v, weakref.ReferenceType): - state[k] = copy.deepcopy(v, memo) - result.__dict__.update(state) - result._update_ref() - return result + with max_recursion_limit(): + cls = self.__class__ + result = cls.__new__(cls) + state = {} + memo[id(self)] = result + for k, v in self.__dict__.items(): + if not isinstance(v, weakref.ReferenceType): + state[k] = copy.deepcopy(v, memo) + result.__dict__.update(state) + result._update_ref() + return result def cpp_apply_module_trace(opdef, *args): @@ -2375,7 +2430,7 @@ def wrap(func: Callable): def _register_all_builtin_module(): - for sub_mod in [M, M.qat, M.quantized]: + for sub_mod in [M, M.qat, M.quantized, MExternal]: for m in getmembers(sub_mod): if ( isclass(m[1]) @@ -2443,13 +2498,29 @@ def trace_module( qualname="{}.[{}]".format(net_name, "arg_{}".format(_)), ), ) - builder(*args, **kwargs) + rst = builder(*copy.deepcopy(args), **copy.deepcopy(kwargs)) active_module_tracer().pop_scope() traced_mod = builder.build() traced_mod.argspec = forward_argspec traced_mod.graph._reset_ids() + + has_expr_not_check = False + if _get_expr_checker(): + has_expr_not_check = ( + 
active_module_tracer().checker.check_node_not_in_scope() + ) + if _get_default_checker() or has_expr_not_check: + with _exclude_from_trace(): + tm_res = traced_mod(*args, **kwargs) + tm_res, _ = tree_flatten(tm_res, is_leaf=_is_leaf) + rst, _ = tree_flatten(rst, is_leaf=_is_leaf) + active_module_tracer().checker.check_net_outputs(tm_res, rst) return traced_mod finally: set_symbolic_shape(use_sym_shape) set_active_module_tracer(None) unset_module_tracing() + for t in mod.tensors(recursive=True): + NodeMixin.clear_node(t) + for t in inputs: + NodeMixin.clear_node(t) diff --git a/imperative/python/megengine/traced_module/utils.py b/imperative/python/megengine/traced_module/utils.py index 21ccb35c..d93b658f 100644 --- a/imperative/python/megengine/traced_module/utils.py +++ b/imperative/python/megengine/traced_module/utils.py @@ -5,16 +5,15 @@ # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import collections import copy import inspect from collections.abc import MutableMapping, MutableSequence from inspect import FullArgSpec -from typing import Callable, Dict, Iterable, List, Optional, Sequence, Type, Union +from typing import Callable, Dict, Iterable, List, Optional, Sequence, Union from .. import get_logger from ..module import Module -from ..tensor import Parameter, Tensor +from ..tensor import Tensor logger = get_logger(__name__) @@ -126,10 +125,12 @@ def _check_obj_attr(obj): for _, v in obj.items(): leafs, _ = tree_flatten(v, is_leaf=lambda _: True) for leaf in leafs: - assert _check_leaf_type( - leaf - ), "Type {} is not supported by traced module".format( - leaf if isinstance(leaf, type) else type(leaf) + assert _check_leaf_type(leaf), ( + "Type {} is not supported in TracedModule serialization by default. " + "If you want to save this object to file, please call tm.register_supported_type({}) " + "before saving.".format( + leaf if isinstance(leaf, type) else type(leaf), type(leaf).__name__ + ) ) diff --git a/imperative/python/megengine/utils/custom_op_tools.py b/imperative/python/megengine/utils/custom_op_tools.py new file mode 100644 index 00000000..d9150fef --- /dev/null +++ b/imperative/python/megengine/utils/custom_op_tools.py @@ -0,0 +1,909 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2021 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import collections +import ctypes +import glob +import os +import re +import subprocess +import sys +import time +from typing import List, Optional, Union + +from ..core.ops.custom import load +from ..logger import get_logger + + +def _get_win_folder_with_ctypes(csidl_name): + csidl_const = { + "CSIDL_APPDATA": 26, + "CSIDL_COMMON_APPDATA": 35, + "CSIDL_LOCAL_APPDATA": 28, + }[csidl_name] + + buf = ctypes.create_unicode_buffer(1024) + ctypes.windll.shell32.SHGetFolderPathW(None, csidl_const, None, 0, buf) + + # Downgrade to short path name if have highbit chars. See + # . 
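To tie together the ``tm_config`` switches and the ``TracedModuleChecker`` wiring introduced above, a minimal usage sketch (the module definition and input shape are placeholders, not part of the patch):

.. code-block:: python

    import numpy as np
    import megengine
    import megengine.functional as F
    import megengine.module as M
    from megengine.traced_module import trace_module, enable_expr_checker

    class Net(M.Module):
        def __init__(self):
            super().__init__()
            self.conv = M.Conv2d(3, 8, 3, padding=1)

        def forward(self, x):
            return F.relu(self.conv(x))

    enable_expr_checker()   # re-run each traced expr eagerly and compare the results
    data = megengine.Tensor(np.random.rand(1, 3, 16, 16).astype("float32"))
    traced_net = trace_module(Net(), data)
    # unless disable_default_checker() was called first, the traced module's final
    # outputs are additionally compared with the eager outputs after tracing.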
+ has_high_char = False + for c in buf: + if ord(c) > 255: + has_high_char = True + break + if has_high_char: + buf2 = ctypes.create_unicode_buffer(1024) + if ctypes.windll.kernel32.GetShortPathNameW(buf.value, buf2, 1024): + buf = buf2 + + return buf.value + + +system = sys.platform +if system == "win32": + _get_win_folder = _get_win_folder_with_ctypes + +PLAT_TO_VCVARS = { + "win-amd64": "x86_amd64", +} + +logger = get_logger() + +# environment varible +ev_custom_op_root_dir = "MGE_CUSTOM_OP_DIR" +ev_cuda_root_dir = "CUDA_ROOT_DIR" +ev_cudnn_root_dir = "CUDNN_ROOT_DIR" + +# operating system +IS_WINDOWS = system == "win32" +IS_LINUX = system == "linux" +IS_MACOS = system == "darwin" + +MGE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +MGE_INC_PATH = os.path.join(MGE_PATH, "core", "include") +MGE_LIB_PATH = os.path.join(MGE_PATH, "core", "lib") +MGE_ABI_VER = 0 + + +# compile version +MINIMUM_GCC_VERSION = (5, 0, 0) +MINIMUM_CLANG_CL_VERSION = (12, 0, 1) + +# compile flags +COMMON_MSVC_FLAGS = [ + "/MD", + "/wd4002", + "/wd4819", + "/EHsc", +] + +MSVC_IGNORE_CUDAFE_WARNINGS = [ + "field_without_dll_interface", +] + +COMMON_NVCC_FLAGS = [] + +# Finds the CUDA install path +def _find_cuda_root_dir() -> Optional[str]: + cuda_root_dir = os.environ.get(ev_cuda_root_dir) + if cuda_root_dir is None: + try: + which = "where" if IS_WINDOWS else "which" + with open(os.devnull, "w") as devnull: + nvcc = ( + subprocess.check_output([which, "nvcc"], stderr=devnull) + .decode() + .rstrip("\r\n") + ) + cuda_root_dir = os.path.dirname(os.path.dirname(nvcc)) + except Exception: + if IS_WINDOWS: + cuda_root_dir = os.environ.get("CUDA_PATH", None) + if cuda_root_dir == None: + cuda_root_dirs = glob.glob( + "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v*.*" + ) + if len(cuda_root_dirs) == 0: + cuda_root_dir = "" + else: + cuda_root_dir = cuda_root_dirs[0] + else: + cuda_root_dir = "/usr/local/cuda" + if not os.path.exists(cuda_root_dir): + cuda_root_dir = None + return cuda_root_dir + + +def _find_cudnn_root_dir() -> Optional[str]: + cudnn_root_dir = os.environ.get(ev_cudnn_root_dir) + return cudnn_root_dir + + +CUDA_ROOT_DIR = _find_cuda_root_dir() +CUDNN_ROOT_DIR = _find_cudnn_root_dir() + +##################################################################### +# Phase 1 +##################################################################### + + +def _is_cuda_file(path: str) -> bool: + valid_ext = [".cu", ".cuh"] + return os.path.splitext(path)[1] in valid_ext + + +# Return full path to the user-specific cache dir for this application. 
+# Typical user cache directories are: +# Mac OS X: ~/Library/Caches/ +# Unix: ~/.cache/ (XDG default) +# Windows: C:\Users\\AppData\Local\\\Cache +def _get_user_cache_dir(appname=None, appauthor=None, version=None, opinion=True): + if system == "win32": + appauthor = appname if appauthor is None else appauthor + path = os.path.normpath(_get_win_folder("CSIDL_LOCAL_APPDATA")) + if appname: + if appauthor is not False: + path = os.path.join(path, appauthor) + else: + path = os.path.join(path, appname) + if opinion: + path = os.path.join(path, "Cache") + elif system == "darwin": + path = os.path.expanduser("~/Library/Caches") + if appname: + path = os.path.join(path, appname) + else: + path = os.getenv("XDG_CACHE_HOME", os.path.expanduser("~/.cache")) + if appname: + path = os.path.join(path, appname) + if appname and version: + path = os.path.join(path, version) + return path + + +# Returns the path to the root folder under which custom op will built. +def _get_default_build_root() -> str: + return os.path.realpath(_get_user_cache_dir(appname="mge_custom_op")) + + +def _get_build_dir(name: str) -> str: + custom_op_root_dir = os.environ.get(ev_custom_op_root_dir) + if custom_op_root_dir is None: + custom_op_root_dir = _get_default_build_root() + + build_dir = os.path.join(custom_op_root_dir, name) + return build_dir + + +##################################################################### +# Phase 2 +##################################################################### + + +def update_hash(seed, value): + # using boost::hash_combine + # https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html + return seed ^ (hash(value) + 0x9E3779B9 + (seed << 6) + (seed >> 2)) + + +def hash_source_files(hash_value, source_files): + for filename in source_files: + with open(filename) as file: + hash_value = update_hash(hash_value, file.read()) + return hash_value + + +def hash_build_args(hash_value, build_args): + for group in build_args: + for arg in group: + hash_value = update_hash(hash_value, arg) + return hash_value + + +Entry = collections.namedtuple("Entry", "version, hash") + + +class Versioner(object): + def __init__(self): + self.entries = {} + + def get_version(self, name): + entry = self.entries.get(name) + return None if entry is None else entry.version + + def bump_version_if_changed( + self, name, sources, build_args, build_dir, with_cuda, with_cudnn, abi_tag + ): + hash_value = 0 + hash_value = hash_source_files(hash_value, sources) + hash_value = hash_build_args(hash_value, build_args) + hash_value = update_hash(hash_value, build_dir) + hash_value = update_hash(hash_value, with_cuda) + hash_value = update_hash(hash_value, with_cudnn) + hash_value = update_hash(hash_value, abi_tag) + + entry = self.entries.get(name) + if entry is None: + self.entries[name] = entry = Entry(0, hash_value) + elif hash_value != entry.hash: + self.entries[name] = entry = Entry(entry.version + 1, hash_value) + + return entry.version + + +custom_op_versioner = Versioner() + + +def version_check( + name, sources, build_args, build_dir, with_cuda, with_cudnn, abi_tag, +): + old_version = custom_op_versioner.get_version(name) + version = custom_op_versioner.bump_version_if_changed( + name, sources, build_args, build_dir, with_cuda, with_cudnn, abi_tag, + ) + return version, old_version + + +##################################################################### +# Phase 3 +##################################################################### + + +def _check_ninja_availability(): + try: + 
subprocess.check_output("ninja --version".split()) + except Exception: + raise RuntimeError( + "Ninja is required to build custom op, please install ninja and update your PATH" + ) + + +def _mge_is_built_from_src(): + file_path = os.path.abspath(__file__) + if "site-packages" in file_path: + return False + else: + return True + + +def _accepted_compilers_for_platform(): + if IS_WINDOWS: + return ["clang-cl"] + if IS_MACOS: + return ["clang++", "clang"] + if IS_LINUX: + return ["g++", "gcc", "gnu-c++", "gnu-cc"] + + +# Verifies that the compiler is the expected one for the current platform. +def _check_compiler_existed_for_platform(compiler: str) -> bool: + # there is no suitable cmd like `which` on windows, so we assume the compiler is always true on windows + if IS_WINDOWS: + try: + version_string = subprocess.check_output( + ["clang-cl", "--version"], stderr=subprocess.STDOUT + ).decode() + return True + except Exception: + return False + + # use os.path.realpath to resolve any symlinks, in particular from "c++" to e.g. "g++". + which = subprocess.check_output(["which", compiler], stderr=subprocess.STDOUT) + compiler_path = os.path.realpath(which.decode().strip()) + if any(name in compiler_path for name in _accepted_compilers_for_platform()): + return True + + version_string = subprocess.check_output( + [compiler, "-v"], stderr=subprocess.STDOUT + ).decode() + if sys.platform.startswith("linux"): + pattern = re.compile("^COLLECT_GCC=(.*)$", re.MULTILINE) + results = re.findall(pattern, version_string) + if len(results) != 1: + return False + compiler_path = os.path.realpath(results[0].strip()) + return any(name in compiler_path for name in _accepted_compilers_for_platform()) + + if sys.platform.startswith("darwin"): + return version_string.startswith("Apple clang") + + return False + + +# Verifies that the given compiler is ABI-compatible with MegEngine. +def _check_compiler_abi_compatibility(compiler: str): + # we think if the megengine is built from source, the user will use the same compiler to compile the custom op + if _mge_is_built_from_src() or os.environ.get("MGE_CHECK_ABI", "1") == "0": + return True + + # [TODO] There is no particular minimum version we need for clang, so we"re good here. 
+ if sys.platform.startswith("darwin"): + return True + + try: + if sys.platform.startswith("linux"): + minimum_required_version = MINIMUM_GCC_VERSION + versionstr = subprocess.check_output( + [compiler, "-dumpfullversion", "-dumpversion"] + ) + version = versionstr.decode().strip().split(".") + else: + minimum_required_version = MINIMUM_CLANG_CL_VERSION + compiler_info = subprocess.check_output( + [compiler, "--version"], stderr=subprocess.STDOUT + ) + match = re.search(r"(\d+)\.(\d+)\.(\d+)", compiler_info.decode().strip()) + version = (0, 0, 0) if match is None else match.groups() + except Exception: + _, error, _ = sys.exc_info() + logger.warning( + "Error checking compiler version for {}: {}".format(compiler, error) + ) + return False + + if tuple(map(int, version)) >= minimum_required_version: + return True + + return False + + +def _check_compiler_comatibility(): + # we use clang-cl on windows, refer: https://clang.llvm.org/docs/UsersManual.html#clang-cl + compiler = ( + os.environ.get("CXX", "clang-cl") + if IS_WINDOWS + else os.environ.get("CXX", "c++") + ) + + existed = _check_compiler_existed_for_platform(compiler) + if existed == False: + log_str = ( + "Cannot find compiler which is compatible with the compiler " + "MegEngine was built with for this platform, which is {mge_compiler} on " + "{platform}. Please use {mge_compiler} to to compile your extension. " + "Alternatively, you may compile MegEngine from source using " + "{user_compiler}, and then you can also use {user_compiler} to compile " + "your extension." + ).format( + user_compiler=compiler, + mge_compiler=_accepted_compilers_for_platform()[0], + platform=sys.platform, + ) + + logger.warning(log_str) + return False + + compatible = _check_compiler_abi_compatibility(compiler) + if compatible == False: + log_str = ( + "Your compiler version may be ABI-incompatible with MegEngine! " + "Please use a compiler that is ABI-compatible with GCC 5.0 on Linux " + "and LLVM/Clang 12.0 on Windows ." + ) + logger.warning(log_str) + return True + + +##################################################################### +# Phase 4 +##################################################################### + + +# Quote command-line arguments for DOS/Windows conventions. 
+def _nt_quote_args(args: Optional[List[str]]) -> List[str]: + # Cover None-type + if not args: + return [] + return ['"{}"'.format(arg) if " " in arg else arg for arg in args] + + +# Now we need user to specify the arch of GPU +def _get_cuda_arch_flags(cflags: Optional[List[str]] = None) -> List[str]: + return [] + + +def _setup_sys_includes(with_cuda: bool, with_cudnn: bool): + includes = [os.path.join(MGE_INC_PATH)] + if with_cuda: + includes.append(os.path.join(CUDA_ROOT_DIR, "include")) + if with_cudnn: + includes.append(os.path.join(CUDNN_ROOT_DIR, "include")) + return includes + + +def _setup_includes(extra_include_paths: List[str], with_cuda: bool, with_cudnn: bool): + user_includes = [os.path.abspath(path) for path in extra_include_paths] + system_includes = _setup_sys_includes(with_cuda, with_cudnn) + if IS_WINDOWS: + user_includes += system_includes + system_includes.clear() + return user_includes, system_includes + + +def _setup_common_cflags(user_includes: List[str], system_includes: List[str]): + common_cflags = [] + common_cflags += ["-I{}".format(include) for include in user_includes] + common_cflags += ["-isystem {}".format(include) for include in system_includes] + if not IS_WINDOWS: + common_cflags += ["-D_GLIBCXX_USE_CXX11_ABI={}".format(MGE_ABI_VER)] + return common_cflags + + +def _setup_cuda_cflags(cflags: List[str], extra_cuda_cflags: List[str]): + cuda_flags = cflags + COMMON_NVCC_FLAGS + _get_cuda_arch_flags() + if IS_WINDOWS: + for flag in COMMON_MSVC_FLAGS: + cuda_flags = ["-Xcompiler", flag] + cuda_flags + for ignore_warning in MSVC_IGNORE_CUDAFE_WARNINGS: + cuda_flags = ["-Xcudafe", "--diag_suppress=" + ignore_warning] + cuda_flags + cuda_flags = _nt_quote_args(cuda_flags) + cuda_flags += _nt_quote_args(extra_cuda_cflags) + else: + cuda_flags += ["--compiler-options", '"-fPIC"'] + cuda_flags += extra_cuda_cflags + if not any(flag.startswith("-std=") for flag in cuda_flags): + cuda_flags.append("-std=c++14") + if os.getenv("CC") is not None: + cuda_flags = ["-ccbin", os.getenv("CC")] + cuda_flags + return cuda_flags + + +def _setup_ldflags( + extra_ldflags: List[str], with_cuda: bool, with_cudnn: bool +) -> List[str]: + ldflags = extra_ldflags + if IS_WINDOWS: + ldflags.append(os.path.join(MGE_LIB_PATH, "megengine_shared.lib")) + if with_cuda: + ldflags.append(os.path.join(CUDA_ROOT_DIR, "lib", "x64", "cudart.lib")) + if with_cudnn: + ldflags.append(os.path.join(CUDNN_ROOT_DIR, "lib", "x64", "cudnn.lib")) + + else: + ldflags.append("-lmegengine_shared -L{}".format(MGE_LIB_PATH)) + ldflags.append("-Wl,-rpath,{}".format(MGE_LIB_PATH)) + if with_cuda: + ldflags.append("-lcudart") + ldflags.append("-L{}".format(os.path.join(CUDA_ROOT_DIR, "lib64"))) + ldflags.append("-Wl,-rpath,{}".format(os.path.join(CUDA_ROOT_DIR, "lib64"))) + if with_cudnn: + ldflags.append("-L{}".format(os.path.join(CUDNN_ROOT_DIR, "lib64"))) + ldflags.append( + "-Wl,-rpath,{}".format(os.path.join(CUDNN_ROOT_DIR, "lib64")) + ) + + return ldflags + + +def _add_shared_flag(ldflags: List[str]): + ldflags += ["/LD" if IS_WINDOWS else "-shared"] + return ldflags + + +##################################################################### +# Phase 5 +##################################################################### + + +def _obj_file_path(src_file_path: str): + file_name = os.path.splitext(os.path.basename(src_file_path))[0] + if _is_cuda_file(src_file_path): + target = "{}.cuda.o".format(file_name) + else: + target = "{}.o".format(file_name) + return target + + +def _dump_ninja_file( + path, + 
cflags, + post_cflags, + cuda_cflags, + cuda_post_cflags, + sources, + objects, + ldflags, + library_target, + with_cuda, +): + def sanitize_flags(flags): + return [] if flags is None else [flag.strip() for flag in flags] + + cflags = sanitize_flags(cflags) + post_cflags = sanitize_flags(post_cflags) + cuda_cflags = sanitize_flags(cuda_cflags) + cuda_post_cflags = sanitize_flags(cuda_post_cflags) + ldflags = sanitize_flags(ldflags) + + assert len(sources) == len(objects) + assert len(sources) > 0 + + if IS_WINDOWS: + compiler = os.environ.get("CXX", "clang-cl") + else: + compiler = os.environ.get("CXX", "c++") + + # Version 1.3 is required for the `deps` directive. + config = ["ninja_required_version = 1.3"] + config.append("cxx = {}".format(compiler)) + if with_cuda: + nvcc = os.path.join(CUDA_ROOT_DIR, "bin", "nvcc") + config.append("nvcc = {}".format(nvcc)) + + flags = ["cflags = {}".format(" ".join(cflags))] + flags.append("post_cflags = {}".format(" ".join(post_cflags))) + if with_cuda: + flags.append("cuda_cflags = {}".format(" ".join(cuda_cflags))) + flags.append("cuda_post_cflags = {}".format(" ".join(cuda_post_cflags))) + flags.append("ldflags = {}".format(" ".join(ldflags))) + + # Turn into absolute paths so we can emit them into the ninja build + # file wherever it is. + sources = [os.path.abspath(file) for file in sources] + + # See https://ninja-build.org/build.ninja.html for reference. + compile_rule = ["rule compile"] + if IS_WINDOWS: + compile_rule.append( + " command = clang-cl /showIncludes $cflags -c $in /Fo$out $post_cflags" + ) + compile_rule.append(" deps = msvc") + else: + compile_rule.append( + " command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags" + ) + compile_rule.append(" depfile = $out.d") + compile_rule.append(" deps = gcc") + + if with_cuda: + cuda_compile_rule = ["rule cuda_compile"] + nvcc_gendeps = "" + cuda_compile_rule.append( + " command = $nvcc {} $cuda_cflags -c $in -o $out $cuda_post_cflags".format( + nvcc_gendeps + ) + ) + + # Emit one build rule per source to enable incremental build. + build = [] + for source_file, object_file in zip(sources, objects): + is_cuda_source = _is_cuda_file(source_file) and with_cuda + rule = "cuda_compile" if is_cuda_source else "compile" + if IS_WINDOWS: + source_file = source_file.replace(":", "$:") + object_file = object_file.replace(":", "$:") + source_file = source_file.replace(" ", "$ ") + object_file = object_file.replace(" ", "$ ") + build.append("build {}: {} {}".format(object_file, rule, source_file)) + + if library_target is not None: + link_rule = ["rule link"] + if IS_WINDOWS: + link_rule.append(" command = clang-cl $in /nologo $ldflags /out:$out") + else: + link_rule.append(" command = $cxx $in $ldflags -o $out") + + link = ["build {}: link {}".format(library_target, " ".join(objects))] + default = ["default {}".format(library_target)] + else: + link_rule, link, default = [], [], [] + + # 'Blocks' should be separated by newlines, for visual benefit. 
+ blocks = [config, flags, compile_rule] + if with_cuda: + blocks.append(cuda_compile_rule) + blocks += [link_rule, build, link, default] + with open(path, "w") as build_file: + for block in blocks: + lines = "\n".join(block) + build_file.write("{}\n\n".format(lines)) + + +class FileBaton: + def __init__(self, lock_file_path, wait_seconds=0.1): + self.lock_file_path = lock_file_path + self.wait_seconds = wait_seconds + self.fd = None + + def try_acquire(self): + try: + self.fd = os.open(self.lock_file_path, os.O_CREAT | os.O_EXCL) + return True + except FileExistsError: + return False + + def wait(self): + while os.path.exists(self.lock_file_path): + time.sleep(self.wait_seconds) + + def release(self): + if self.fd is not None: + os.close(self.fd) + + os.remove(self.lock_file_path) + + +##################################################################### +# Phase 6 +##################################################################### + + +def _build_with_ninja(build_dir: str, verbose: bool, error_prefix: str): + command = ["ninja", "-v"] + env = os.environ.copy() + try: + sys.stdout.flush() + sys.stderr.flush() + stdout_fileno = 1 + subprocess.run( + command, + stdout=stdout_fileno if verbose else subprocess.PIPE, + stderr=subprocess.STDOUT, + cwd=build_dir, + check=True, + env=env, + ) + except subprocess.CalledProcessError as e: + with open(os.path.join(build_dir, "build.ninja")) as f: + lines = f.readlines() + print(lines) + _, error, _ = sys.exc_info() + message = error_prefix + if hasattr(error, "output") and error.output: + message += ": {}".format(error.output.decode()) + raise RuntimeError(message) from e + + +def build( + name: str, + sources: Union[str, List[str]], + extra_cflags: Union[str, List[str]] = [], + extra_cuda_cflags: Union[str, List[str]] = [], + extra_ldflags: Union[str, List[str]] = [], + extra_include_paths: Union[str, List[str]] = [], + with_cuda: Optional[bool] = None, + build_dir: Optional[bool] = None, + verbose: bool = False, + abi_tag: Optional[int] = None, +) -> str: + r"""Build a Custom Op with ninja in the way of just-in-time (JIT). + + To build the custom op, a Ninja build file is emitted, which is used to + compile the given sources into a dynamic library. + + By default, the directory to which the build file is emitted and the + resulting library compiled to is ``/mge_custom_op/``, where + ```` is the temporary folder on the current platform and ```` + the name of the custom op. This location can be overridden in two ways. + First, if the ``MGE_CUSTOM_OP_DIR`` environment variable is set, it + replaces ``/mge_custom_op`` and all custom op will be compiled + into subfolders of this directory. Second, if the ``build_dir`` + argument to this function is supplied, it overrides the entire path, i.e. + the library will be compiled into that folder directly. + + To compile the sources, the default system compiler (``c++``) is used, + which can be overridden by setting the ``CXX`` environment variable. To pass + additional arguments to the compilation process, ``extra_cflags`` or + ``extra_ldflags`` can be provided. For example, to compile your custom op + with optimizations, pass ``extra_cflags=['-O3']``. You can also use + ``extra_cflags`` to pass further include directories. + + CUDA support with mixed compilation is provided. Simply pass CUDA source + files (``.cu`` or ``.cuh``) along with other sources. Such files will be + detected and compiled with nvcc rather than the C++ compiler. 
This includes + passing the CUDA lib64 directory as a library directory, and linking + ``cudart``. You can pass additional flags to nvcc via + ``extra_cuda_cflags``, just like with ``extra_cflags`` for C++. Various + heuristics for finding the CUDA install directory are used, which usually + work fine. If not, setting the ``CUDA_ROOT_DIR`` environment variable is the + safest option. If you use CUDNN, please also setting the ``CUDNN_ROOT_DIR`` + environment variable. + + Args: + name: The name of the custom op to build. + sources: A list of relative or absolute paths to C++ source files. + extra_cflags: optional list of compiler flags to forward to the build. + extra_cuda_cflags: optional list of compiler flags to forward to nvcc + when building CUDA sources. + extra_ldflags: optional list of linker flags to forward to the build. + extra_include_paths: optional list of include directories to forward + to the build. + with_cuda: Determines whether CUDA headers and libraries are added to + the build. If set to ``None`` (default), this value is + automatically determined based on the existence of ``.cu`` or + ``.cuh`` in ``sources``. Set it to `True`` to force CUDA headers + and libraries to be included. + build_dir: optional path to use as build workspace. + verbose: If ``True``, turns on verbose logging of load steps. + abi_tag: Determines the value of MACRO ``_GLIBCXX_USE_CXX11_ABI`` + in gcc compiler, should be ``0`` or ``1``. + + Returns: + the compiled dynamic library path + + """ + + # phase 1: prepare config + if abi_tag != None: + global MGE_ABI_VER + MGE_ABI_VER = abi_tag + + def strlist(args, name): + assert isinstance(args, str) or isinstance( + args, list + ), "{} must be str or list[str]".format(name) + if isinstance(args, str): + return [args] + for arg in args: + assert isinstance(arg, str) + args = [arg.strip() for arg in args] + return args + + sources = strlist(sources, "sources") + extra_cflags = strlist(extra_cflags, "extra_cflags") + extra_cuda_cflags = strlist(extra_cuda_cflags, "extra_cuda_cflags") + extra_ldflags = strlist(extra_ldflags, "extra_ldflags") + extra_include_paths = strlist(extra_include_paths, "extra_include_paths") + + with_cuda = any(map(_is_cuda_file, sources)) if with_cuda is None else with_cuda + with_cudnn = any(["cudnn" in f for f in extra_ldflags]) + + if CUDA_ROOT_DIR == None and with_cuda: + print( + "No CUDA runtime is found, using {}=/path/to/your/cuda_root_dir".format( + ev_cuda_root_dir + ) + ) + if CUDNN_ROOT_DIR == None and with_cudnn: + print( + "Cannot find the root directory of cudnn, using {}=/path/to/your/cudnn_root_dir".format( + ev_cudnn_root_dir + ) + ) + + build_dir = os.path.abspath( + _get_build_dir(name) if build_dir is None else build_dir + ) + if not os.path.exists(build_dir): + os.makedirs(build_dir, exist_ok=True) + + if verbose: + print("Using {} to build megengine custom op".format(build_dir)) + + # phase 2: version check + version, old_version = version_check( + name, + sources, + [extra_cflags, extra_cuda_cflags, extra_ldflags, extra_include_paths], + build_dir, + with_cuda, + with_cudnn, + abi_tag, + ) + if verbose: + if version != old_version and old_version != None: + print( + "Input conditions of custom op {} have changed, bumping to version {}".format( + name, version + ) + ) + print("Building custom op {} with version {}".format(name, version)) + if version == old_version: + if verbose: + print( + "No modifications detected for {}, skipping build step...".format(name) + ) + return + name = "{}_v{}".format(name, 
version) + + # phase 3: compiler and ninja check + _check_ninja_availability() + _check_compiler_comatibility() + + # phase 4: setup the compile flags + user_includes, system_includes = _setup_includes( + extra_include_paths, with_cuda, with_cudnn + ) + common_cflags = _setup_common_cflags(user_includes, system_includes) + cuda_cflags = ( + _setup_cuda_cflags(common_cflags, extra_cuda_cflags) if with_cuda else None + ) + ldflags = _setup_ldflags(extra_ldflags, with_cuda, with_cudnn) + + if IS_WINDOWS: + cflags = common_cflags + COMMON_MSVC_FLAGS + extra_cflags + cflags = _nt_quote_args(cflags) + else: + cflags = common_cflags + ["-fPIC", "-std=c++14"] + extra_cflags + + ldflags = _add_shared_flag(ldflags) + if sys.platform.startswith("darwin"): + ldflags.append("-undefined dynamic_lookup") + elif IS_WINDOWS: + ldflags += ["/link"] + ldflags = _nt_quote_args(ldflags) + + baton = FileBaton(os.path.join(build_dir, "lock")) + if baton.try_acquire(): + try: + # phase 5: generate ninja build file + objs = [_obj_file_path(src) for src in sources] + name += ".dll" if IS_WINDOWS else ".so" + + build_file_path = os.path.join(build_dir, "build.ninja") + if verbose: + print("Emitting ninja build file {}".format(build_file_path)) + _dump_ninja_file( + path=build_file_path, + cflags=cflags, + post_cflags=None, + cuda_cflags=cuda_cflags, + cuda_post_cflags=None, + sources=sources, + objects=objs, + ldflags=ldflags, + library_target=name, + with_cuda=with_cuda, + ) + + # phase 6: build with ninja + if verbose: + print( + "Compiling and linking your custom op {}".format( + os.path.join(build_dir, name) + ) + ) + _build_with_ninja(build_dir, verbose, "compiling error") + finally: + baton.release() + else: + baton.wait() + + return os.path.join(build_dir, name) + + +def build_and_load( + name: str, + sources: Union[str, List[str]], + extra_cflags: Union[str, List[str]] = [], + extra_cuda_cflags: Union[str, List[str]] = [], + extra_ldflags: Union[str, List[str]] = [], + extra_include_paths: Union[str, List[str]] = [], + with_cuda: Optional[bool] = None, + build_dir: Optional[bool] = None, + verbose: bool = False, + abi_tag: Optional[int] = None, +) -> str: + r"""Build and Load a Custom Op with ninja in the way of just-in-time (JIT). + Same as the function ``build()`` but load the built dynamic library. + + Args: + same as ``build()`` + + Returns: + the compiled dynamic library path + + """ + + lib_path = build( + name, + sources, + extra_cflags, + extra_cuda_cflags, + extra_ldflags, + extra_include_paths, + with_cuda, + build_dir, + verbose, + abi_tag, + ) + if verbose: + print("Load the compiled custom op {}".format(lib_path)) + load(lib_path) + return lib_path diff --git a/imperative/python/megengine/utils/persistent_cache.py b/imperative/python/megengine/utils/persistent_cache.py index 3b0f7ae2..b6aadf8a 100644 --- a/imperative/python/megengine/utils/persistent_cache.py +++ b/imperative/python/megengine/utils/persistent_cache.py @@ -8,87 +8,115 @@ # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
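Putting the custom-op helpers above together, a hedged usage sketch of the JIT build path (the op name and source file are hypothetical):

.. code-block:: python

    import os
    from megengine.utils.custom_op_tools import build_and_load

    # optional override of the build workspace root picked up by the helpers;
    # CUDA_ROOT_DIR / CUDNN_ROOT_DIR can likewise be set to locate CUDA and cuDNN.
    os.environ.setdefault("MGE_CUSTOM_OP_DIR", "/tmp/mge_custom_op")

    lib_path = build_and_load(
        "my_op",                   # library is emitted as my_op_v<version>.so (or .dll)
        sources=["my_op.cpp"],     # hypothetical C++ source implementing the op
        extra_cflags=["-O3"],
        verbose=True,
    )
    print(lib_path)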
import argparse +import contextlib import getpass import os import sys import urllib.parse -from ..core._imperative_rt import PersistentCacheManager as _PersistentCacheManager +import filelock + +from ..core._imperative_rt import PersistentCache as _PersistentCache from ..logger import get_logger from ..version import __version__, git_version -class PersistentCacheManager(_PersistentCacheManager): +class PersistentCacheOnServer(_PersistentCache): def __init__(self): super().__init__() - if os.getenv("MGE_FASTRUN_CACHE_TYPE") == "MEMORY": - get_logger().info("fastrun use in-memory cache") - self.open_memory() - elif os.getenv("MGE_FASTRUN_CACHE_TYPE") == "FILE": - self.open_file() - else: - self.open_redis() - - def open_memory(self): - pass + cache_type = os.getenv("MGE_FASTRUN_CACHE_TYPE") + if cache_type not in ("FILE", "MEMORY"): + try: + redis_config = self.get_redis_config() + except Exception as exc: + get_logger().error( + "failed to connect to cache server {!r}; try fallback to " + "in-file cache".format(exc) + ) + else: + if redis_config is not None: + self.add_config( + "redis", + redis_config, + "fastrun use redis cache", + "failed to connect to cache server", + ) + if cache_type != "MEMORY": + path = self.get_cache_file(self.get_cache_dir()) + self.add_config( + "in-file", + {"path": path}, + "fastrun use in-file cache in {}".format(path), + "failed to create cache file in {}".format(path), + ) + self.add_config( + "in-memory", + {}, + "fastrun use in-memory cache", + "failed to create in-memory cache", + ) - def open_file(self): + def get_cache_dir(self): cache_dir = os.getenv("MGE_FASTRUN_CACHE_DIR") - try: - if not cache_dir: - from ..hub.hub import _get_megengine_home + if not cache_dir: + from ..hub.hub import _get_megengine_home - cache_dir = os.path.expanduser( - os.path.join(_get_megengine_home(), "persistent_cache.bin") - ) - os.makedirs(cache_dir, exist_ok=True) - cache_file = os.path.join(cache_dir, "cache") - with open(cache_file, "a"): - pass - assert self.try_open_file(cache_file), "cannot create file" - get_logger().info("fastrun use in-file cache in {}".format(cache_dir)) - except Exception as exc: - get_logger().error( - "failed to create cache file in {} {!r}; fallback to " - "in-memory cache".format(cache_dir, exc) + cache_dir = os.path.expanduser( + os.path.join(_get_megengine_home(), "persistent_cache") ) - self.open_memory() - - def open_redis(self): + os.makedirs(cache_dir, exist_ok=True) + return cache_dir + + def get_cache_file(self, cache_dir): + cache_file = os.path.join(cache_dir, "cache.bin") + with open(cache_file, "a"): + pass + return cache_file + + @contextlib.contextmanager + def lock_cache_file(self, cache_dir): + lock_file = os.path.join(cache_dir, "cache.lock") + with filelock.FileLock(lock_file): + yield + + def get_redis_config(self): + url = os.getenv("MGE_FASTRUN_CACHE_URL") + if url is None: + return None + assert sys.platform != "win32", "redis cache on windows not tested" prefix = "mgbcache:{}:MGB{}:GIT:{}".format( getpass.getuser(), __version__, git_version ) - url = os.getenv("MGE_FASTRUN_CACHE_URL") - if url is None: - self.open_file() - try: - assert sys.platform != "win32", "redis cache on windows not tested" - parse_result = urllib.parse.urlparse(url, scheme="redis") - assert parse_result.scheme == "redis", "unsupported scheme" - assert not parse_result.username, "redis conn with username unsupported" - assert self.try_open_redis( - parse_result.hostname, parse_result.port, parse_result.password, prefix - ), "connect failed" - 
except Exception as exc: - get_logger().error( - "failed to connect to cache server {!r}; try fallback to " - "in-file cache".format(exc) - ) - self.open_file() - - -_manager = None - + parse_result = urllib.parse.urlparse(url) + assert not parse_result.username, "redis conn with username unsupported" + if parse_result.scheme == "redis": + assert parse_result.hostname and parse_result.port, "invalid url" + assert not parse_result.path + config = { + "hostname": parse_result.hostname, + "port": str(parse_result.port), + } + elif parse_result.scheme == "redis+socket": + assert not (parse_result.hostname or parse_result.port) + assert parse_result.path + config = { + "unixsocket": parse_result.path, + } + else: + assert False, "unsupported scheme" + if parse_result.password is not None: + config["password"] = parse_result.password + config["prefix"] = prefix + return config -def get_manager(): - global _manager - if _manager is None: - _manager = PersistentCacheManager() - return _manager + def flush(self): + if self.config is not None and self.config.type == "in-file": + with self.lock_cache_file(self.get_cache_dir()): + super().flush() def _clean(): - nr_del = get_manager().clean() + nr_del = PersistentCacheOnServer().clean() if nr_del is not None: print("{} cache entries deleted".format(nr_del)) diff --git a/imperative/python/requires-test.txt b/imperative/python/requires-test.txt index 05643464..7b33b1ce 100644 --- a/imperative/python/requires-test.txt +++ b/imperative/python/requires-test.txt @@ -2,3 +2,4 @@ pytest==5.3.0 pytest-sphinx==0.3.1 tensorboardX==2.4 six==1.16.0 +redislite ; platform_system == "Linux" or platform_system == "Darwin" diff --git a/imperative/python/requires.txt b/imperative/python/requires.txt index 58a806c0..894b332a 100644 --- a/imperative/python/requires.txt +++ b/imperative/python/requires.txt @@ -4,8 +4,8 @@ pyarrow requests tabulate tqdm -redispy deprecated mprop wheel -megfile>=0.0.10 \ No newline at end of file +megfile>=0.0.10 +filelock diff --git a/imperative/python/src/ops.cpp b/imperative/python/src/ops.cpp index 30f61a0f..be184cff 100644 --- a/imperative/python/src/ops.cpp +++ b/imperative/python/src/ops.cpp @@ -567,7 +567,15 @@ void init_ops(py::module m) { rng::delete_handle(handle); }, py::call_guard()); - m.def("set_global_rng_seed", &rng::set_global_rng_seed); + m.def("set_global_rng_seed", [](uint64_t seed) -> void { + mgb_assert( + python::interpreter_for_py->check_available(), + "set global random seed failed since imperative interpreter has been " + "destroyed"); + python::interpreter_for_py->sync(); + mgb::CompNode::sync_all(); + rng::set_global_rng_seed(seed); + }); m.def("get_global_rng_seed", &rng::get_global_rng_seed); m.def("get_rng_handle_compnode", &rng::get_rng_handle_compnode); @@ -766,6 +774,13 @@ void init_custom(pybind11::module m) { m.def("_install", &install_custom); m.def("_uninstall", &uninstall_custom); m.def("_get_custom_op_list", &get_custom_op_list); + m.def("get_custom_op_abi_tag", [](void) -> int { + int ret = 0; +#ifdef _GLIBCXX_USE_CXX11_ABI + ret = _GLIBCXX_USE_CXX11_ABI; +#endif + return ret; + }); static PyMethodDef method_def = { #ifdef METH_FASTCALL diff --git a/imperative/python/src/tensor.cpp b/imperative/python/src/tensor.cpp index 87f2459d..f415f572 100644 --- a/imperative/python/src/tensor.cpp +++ b/imperative/python/src/tensor.cpp @@ -1074,6 +1074,10 @@ void init_tensor(py::module m) { []() { interpreter_for_py->sync(); CompNode::sync_all(); + CompNode::foreach ([](CompNode cn) { + auto err = 
cn.check_async_error(); + mgb_assert(!err, "%s", err->what()); + }); sync_py_task_q(); }, py::call_guard()); diff --git a/imperative/python/src/utils.cpp b/imperative/python/src/utils.cpp index 5162f488..260c7aa2 100644 --- a/imperative/python/src/utils.cpp +++ b/imperative/python/src/utils.cpp @@ -210,7 +210,7 @@ void init_utils(py::module m) { .def("disable", [](TensorSanityCheck& checker) { checker.disable(); }); #if MGB_ENABLE_OPR_MM - m.def("create_mm_server", &create_zmqrpc_server, py::arg("addr"), + m.def("create_mm_server", &mgb::opr::create_zmqrpc_server, py::arg("addr"), py::arg("port") = 0); #else m.def("create_mm_server", []() {}); @@ -234,51 +234,108 @@ void init_utils(py::module m) { using ExtendedPersistentCache = mgb::imperative::persistent_cache::ExtendedPersistentCache; - struct PersistentCacheManager { - std::shared_ptr instance; + struct ConfigurablePersistentCache : mgb::PersistentCache { + struct Config { + std::string type; + std::unordered_map args; + std::string on_success; + std::string on_fail; + }; - bool try_reg(std::shared_ptr cache) { - if (cache) { - instance = cache; - PersistentCache::set_impl(cache); - return true; - } - return false; - } - bool open_redis( - std::string ip, size_t port, std::string password, std::string prefix) { - return try_reg(mgb::imperative::persistent_cache::make_redis( - ip, port, password, prefix)); + std::shared_ptr impl; + std::optional impl_config; + std::vector configs; + + void add_config( + std::string type, std::unordered_map args, + std::string on_success, std::string on_fail) { + configs.push_back({type, args, on_success, on_fail}); } - bool open_file(std::string path) { - return try_reg(mgb::imperative::persistent_cache::make_in_file(path)); + + std::optional clean() { return get_impl()->clear(); } + + void load_config() { + std::optional err_msg; + for (size_t i = 0; i < configs.size(); ++i) { + auto& config = configs[i]; + if (err_msg) { + mgb_log_warn("try fallback to %s cache", config.type.c_str()); + } else { + err_msg.emplace(); + } + auto cache = ExtendedPersistentCache::make_from_config( + config.type, config.args, *err_msg); + if (!cache) { + mgb_log_warn("%s %s", config.on_fail.c_str(), err_msg->c_str()); + } else { + impl = cache; + impl_config = config; + break; + } + } + mgb_assert(impl_config.has_value(), "not valid config"); } - std::optional clean() { - if (instance) { - return instance->clear(); + + std::shared_ptr get_impl() { + if (!impl) { + load_config(); } - return {}; + return impl; } - void put(std::string category, std::string key, std::string value) { - PersistentCache::inst().put( - category, {key.data(), key.size()}, {value.data(), value.size()}); + + virtual mgb::Maybe get(const std::string& category, const Blob& key) { + return get_impl()->get(category, key); + } + + virtual void put( + const std::string& category, const Blob& key, const Blob& value) { + return get_impl()->put(category, key, value); } - py::object get(std::string category, std::string key) { - auto value = - PersistentCache::inst().get(category, {key.data(), key.size()}); + + virtual bool support_dump_cache() { return get_impl()->support_dump_cache(); } + + py::object py_get(std::string category, std::string key) { + auto value = get_impl()->get(category, {key.data(), key.size()}); if (value.valid()) { return py::bytes(std::string((const char*)value->ptr, value->size)); } else { return py::none(); } } + + void py_put(std::string category, std::string key, std::string value) { + get_impl()->put( + category, {key.data(), 
key.size()}, {value.data(), value.size()}); + } + + void flush() { + if (impl) { + impl->flush(); + } + } }; - py::class_(m, "PersistentCacheManager") - .def(py::init<>()) - .def("try_open_redis", &PersistentCacheManager::open_redis) - .def("try_open_file", &PersistentCacheManager::open_file) - .def("clean", &PersistentCacheManager::clean) - .def("put", &PersistentCacheManager::put) - .def("get", &PersistentCacheManager::get); + auto PyConfigurablePersistentCache = + py::class_< + ConfigurablePersistentCache, + std::shared_ptr>(m, "PersistentCache") + .def(py::init<>()) + .def("add_config", &ConfigurablePersistentCache::add_config) + .def("reg", + [](std::shared_ptr inst) { + PersistentCache::set_impl(inst); + }) + .def("clean", &ConfigurablePersistentCache::clean) + .def("get", &ConfigurablePersistentCache::py_get) + .def("put", &ConfigurablePersistentCache::py_put) + .def_readonly("config", &ConfigurablePersistentCache::impl_config) + .def("flush", &ConfigurablePersistentCache::flush); + + py::class_( + PyConfigurablePersistentCache, "Config") + .def_readwrite("type", &ConfigurablePersistentCache::Config::type) + .def_readwrite("args", &ConfigurablePersistentCache::Config::args) + .def_readwrite("on_fail", &ConfigurablePersistentCache::Config::on_fail) + .def_readwrite( + "on_success", &ConfigurablePersistentCache::Config::on_success); } diff --git a/imperative/python/test/unit/core/custom_opsrc/elem_add.cpp b/imperative/python/test/unit/core/custom_opsrc/elem_add.cpp new file mode 100644 index 00000000..d8f0299d --- /dev/null +++ b/imperative/python/test/unit/core/custom_opsrc/elem_add.cpp @@ -0,0 +1,140 @@ +/** + * \file imperative/python/test/unit/core/custom_opsrc/elem_add.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
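For orientation before the test sources, a rough Python-side sketch of the `PersistentCache` binding defined above and its ordered fallback chain; the redis address is a placeholder and is expected to fail over to the in-memory backend when no server is reachable:

    from megengine.core._imperative_rt import PersistentCache

    pc = PersistentCache()
    # Configs are tried in order; the first backend that opens successfully is kept.
    pc.add_config(
        "redis",
        {"hostname": "127.0.0.1", "port": "6379", "prefix": "mgbcache:demo"},
        "fastrun use redis cache",
        "failed to connect to cache server",
    )
    pc.add_config(
        "in-memory", {}, "fastrun use in-memory cache", "failed to create in-memory cache"
    )
    pc.put("category", "key", "value")   # lazily resolves the first usable backend
    print(pc.get("category", "key"))     # bytes value, or None when missing
    print(pc.config.type)                # e.g. "in-memory" when redis is unavailable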
+ */ + +#include "megbrain/custom/custom.h" + +CUSTOM_OP_REG_BEGIN(ElemAddSmooth) + +void forward_device_infer( + const std::vector& inputs, const Param& params, + std::vector& outputs) { + outputs[0] = inputs[0]; +} + +void forward_shape_infer( + const std::vector& inputs, const Param& params, + std::vector& outputs) { + outputs[0] = inputs[0]; +} + +void forward_dtype_infer( + const std::vector& inputs, const Param& params, + std::vector& outputs) { + outputs[0] = inputs[0]; +} + +void forward_format_infer( + const std::vector& inputs, const Param& params, + std::vector& outputs) { + outputs[0] = inputs[0]; +} + +template +void forward_kernel( + const scalar_t* input0, const scalar_t* input1, scalar_t* output, size_t len, + float smooth) { + for (size_t i = 0; i < len; ++i) { + output[i] = input0[i] + input1[i]; + if (output[i] < 0) + output[i] += smooth; + else + output[i] -= smooth; + } +} + +void forward_compute( + const std::vector& inputs, const Param& params, + std::vector& outputs) { + DISPATCH_SIGN_INT_AND_FLOAT_TYPES( + outputs[0].dtype(), "forward_compute", ([&]() { + forward_kernel( + inputs[0].data(), inputs[1].data(), + outputs[0].data(), outputs[0].size(), + params["smooth"].as()); + })); +} + +CUSTOM_OP_REG(ElemAddSmoothForward) + .set_description( + "Custom ElemAdd Operator With a Smooth Parameter, " + "which is used to verify the CPU kernel") + .add_input("lhs") + .add_input("rhs") + .add_output("output") + .add_param("smooth", 0.f) + .set_device_infer(forward_device_infer) + .set_shape_infer(forward_shape_infer) + .set_dtype_infer(forward_dtype_infer) + .set_format_infer(forward_format_infer) + .set_compute(forward_compute); + +void backward_device_infer( + const std::vector& ograds, const Param& params, + std::vector& igrads) { + igrads[0] = ograds[0]; + igrads[1] = ograds[0]; +} + +void backward_shape_infer( + const std::vector& ograds, const Param& params, + std::vector& igrads) { + igrads[0] = ograds[0]; + igrads[1] = ograds[0]; +} + +void backward_dtype_infer( + const std::vector& ograds, const Param& params, + std::vector& igrads) { + igrads[0] = ograds[0]; + igrads[1] = ograds[0]; +} + +void backward_format_infer( + const std::vector& ograds, const Param& params, + std::vector& igrads) { + igrads[0] = ograds[0]; + igrads[1] = ograds[0]; +} + +template +void backward_kernel( + const scalar_t* ograd, scalar_t* igrad0, scalar_t* igrad1, size_t len) { + for (size_t i = 0; i < len; ++i) { + igrad0[i] = ograd[i]; + igrad1[i] = ograd[i]; + } +} + +void backward_compute( + const std::vector& ograds, const Param& params, + std::vector& igrads) { + DISPATCH_SIGN_INT_AND_FLOAT_TYPES( + igrads[0].dtype(), "backward_compute", ([&]() { + backward_kernel( + ograds[0].data(), igrads[0].data(), + igrads[1].data(), igrads[0].size()); + })); +} + +CUSTOM_OP_REG(ElemAddSmoothBackward) + .set_description( + "Custom ElemAdd Operator With a Smooth Parameter, " + "which is used to verify the CPU kernel") + .add_input("ograd") + .add_output("igrad_lhs") + .add_output("igrad_rhs") + .set_device_infer(backward_device_infer) + .set_shape_infer(backward_shape_infer) + .set_dtype_infer(backward_dtype_infer) + .set_format_infer(backward_format_infer) + .set_compute(backward_compute); + +CUSTOM_OP_REG_END(ElemAddSmooth) diff --git a/imperative/python/test/unit/core/custom_opsrc/matmul_scale.cpp b/imperative/python/test/unit/core/custom_opsrc/matmul_scale.cpp new file mode 100644 index 00000000..31998dd9 --- /dev/null +++ b/imperative/python/test/unit/core/custom_opsrc/matmul_scale.cpp @@ -0,0 
+1,65 @@ +/** + * \file imperative/python/test/unit/core/custom_opsrc/matmul_scale.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./matmul_scale.h" +#include "megbrain/custom/custom.h" + +CUSTOM_OP_REG_BEGIN(MatMulScale) + +void forward_shape_infer( + const std::vector& inputs, const Param& params, + std::vector& outputs) { + outputs[0] = {inputs[0][0], inputs[1][1]}; +} + +void forward_compute( + const std::vector& inputs, const Param& params, + std::vector& outputs) { + matmul_forward_helper( + inputs[0], inputs[1], outputs[0], inputs[0].shape()[0], + inputs[0].shape()[1], inputs[1].shape()[1], params["scale"].as()); +} + +CUSTOM_OP_REG(MatMulScaleForward) + .add_inputs(2) + .add_outputs(1) + .add_param("scale", 1.0f) + .set_shape_infer(forward_shape_infer) + .set_compute("cuda", forward_compute); + +void backward_shape_infer( + const std::vector& ograd_and_inputs, const Param& params, + std::vector& outputs) { + outputs[0] = ograd_and_inputs[1]; + outputs[1] = ograd_and_inputs[2]; +} + +void backward_compute( + const std::vector& ograd_and_inputs, const Param& params, + std::vector& igrads) { + matmul_backward_lhs_helper( + ograd_and_inputs[2], ograd_and_inputs[0], igrads[0], + ograd_and_inputs[1].shape()[0], ograd_and_inputs[1].shape()[1], + ograd_and_inputs[2].shape()[1], params["scale"].as()); + matmul_backward_rhs_helper( + ograd_and_inputs[1], ograd_and_inputs[0], igrads[1], + ograd_and_inputs[1].shape()[0], ograd_and_inputs[1].shape()[1], + ograd_and_inputs[2].shape()[1], params["scale"].as()); +} + +CUSTOM_OP_REG(MatMulScaleBackward) + .add_inputs(3) + .add_outputs(2) + .add_param("scale", 1.0f) + .set_shape_infer(backward_shape_infer) + .set_compute("cuda", backward_compute); + +CUSTOM_OP_REG_END(MatMulScale) diff --git a/imperative/python/test/unit/core/custom_opsrc/matmul_scale.cu b/imperative/python/test/unit/core/custom_opsrc/matmul_scale.cu new file mode 100644 index 00000000..9d847d32 --- /dev/null +++ b/imperative/python/test/unit/core/custom_opsrc/matmul_scale.cu @@ -0,0 +1,97 @@ +/** + * \file imperative/python/test/unit/core/custom_opsrc/matmul_scale.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
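This patch registers MatMulScale and ElemAddSmooth but does not show a call site; for what one might look like from Python, the sketch below uses the ElemAddSmooth op added earlier. The calling convention here (constructing the op from the custom namespace with its params as keyword arguments and dispatching through apply) is an assumption, not something this patch demonstrates:

    import numpy as np
    from megengine.core._imperative_rt.core2 import apply
    from megengine.core.ops import custom
    from megengine.tensor import Tensor

    x = Tensor(np.random.randn(4, 8).astype("float32"))
    y = Tensor(np.random.randn(4, 8).astype("float32"))
    op = custom.ElemAddSmoothForward(smooth=0.25)       # param name from elem_add.cpp
    (out,) = apply(op, x, y)                            # assumed dispatch path

    ref = x.numpy() + y.numpy()
    ref = np.where(ref < 0, ref + 0.25, ref - 0.25)     # mirrors forward_kernel
    np.testing.assert_allclose(out.numpy(), ref, rtol=1e-5, atol=1e-5)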
+ */ + +#include +#include +#include +#include "./matmul_scale.h" + +using namespace custom; + +// matmul_forward for Mat_mxk * Mat_k*n +template +__global__ void matmul_forward_naive( + const T* lhs, const T* rhs, T* res, size_t M, size_t K, size_t N, float scale) { + int row = blockIdx.y * blockDim.y + threadIdx.y; + int col = blockIdx.x * blockDim.x + threadIdx.x; + + T acc = 0; + for (int i = 0; i < K; ++i) + acc += lhs[row * K + i] * rhs[i * N + col]; + res[row * N + col] = acc * scale; +} + +// matmul_backward_lhs for Mat_mxk * Mat_k*n = Mat_mxn +// that is Mat_mxn * Mat_nxk +template +__global__ void matmul_backward_lhs_naive( + const T* rhs, const T* ograd, T* lhs_grad, size_t M, size_t K, size_t N, + float scale) { + int row = blockIdx.y * blockDim.y + threadIdx.y; + int col = blockIdx.x * blockDim.x + threadIdx.x; + T acc = 0; + for (int i = 0; i < N; ++i) + acc += ograd[row * N + i] * rhs[col * N + i]; + lhs_grad[row * K + col] = acc / scale; +} + +// matmul_backward_rhs for Mat_mxk * Mat_k*n = Mat_mxn +// that is Mat_kxm * Mat_mxn +template +__global__ void matmul_backward_rhs_naive( + const T* lhs, const T* ograd, T* rhs_grad, size_t M, size_t K, size_t N, + float scale) { + int row = blockIdx.y * blockDim.y + threadIdx.y; + int col = blockIdx.x * blockDim.x + threadIdx.x; + T acc = 0; + for (int i = 0; i < M; ++i) + acc += lhs[i * K + row] * ograd[i * N + col]; + rhs_grad[row * N + col] = acc / scale; +} + +void matmul_forward_helper( + const Tensor& lhs, const Tensor& rhs, Tensor& res, size_t M, size_t K, size_t N, + float scale) { + dim3 block(1, 1); + dim3 grid(N / block.x, M / block.y); + + DISPATCH_INT_AND_FLOAT_TYPES(res.dtype(), "matmul_forward", ([&]() { + matmul_forward_naive<<>>( + lhs.data(), rhs.data(), + res.data(), M, K, N, scale); + })); +} + +void matmul_backward_lhs_helper( + const Tensor& rhs, const Tensor& ograd, Tensor& lhs_grad, size_t M, size_t K, + size_t N, float scale) { + dim3 block(1, 1); + dim3 grid(K / block.x, M / block.y); + DISPATCH_INT_AND_FLOAT_TYPES( + lhs_grad.dtype(), "matmul_backward_lhs", ([&]() { + matmul_backward_lhs_naive<<>>( + rhs.data(), ograd.data(), + lhs_grad.data(), M, K, N, scale); + })); +} + +void matmul_backward_rhs_helper( + const Tensor& lhs, const Tensor& ograd, Tensor& rhs_grad, size_t M, size_t K, + size_t N, float scale) { + dim3 block(1, 1); + dim3 grid(N / block.x, K / block.y); + DISPATCH_INT_AND_FLOAT_TYPES( + rhs_grad.dtype(), "matmul_backward_rhs", ([&]() { + matmul_backward_rhs_naive<<>>( + lhs.data(), ograd.data(), + rhs_grad.data(), M, K, N, scale); + })); +} diff --git a/imperative/python/test/unit/core/custom_opsrc/matmul_scale.h b/imperative/python/test/unit/core/custom_opsrc/matmul_scale.h new file mode 100644 index 00000000..5f7ea8d0 --- /dev/null +++ b/imperative/python/test/unit/core/custom_opsrc/matmul_scale.h @@ -0,0 +1,24 @@ +/** + * \file imperative/python/test/unit/core/custom_opsrc/matmul_scale.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
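As a CPU oracle for the three CUDA kernels above (each launched with one thread per output element: block(1, 1) and a grid covering the output), a NumPy transcription of their math; the function names are illustrative:

    import numpy as np

    def matmul_scale_forward_ref(lhs, rhs, scale):
        # matmul_forward_naive: res = scale * (lhs @ rhs)
        return (lhs @ rhs) * scale

    def matmul_scale_backward_lhs_ref(rhs, ograd, scale):
        # matmul_backward_lhs_naive: lhs_grad = (ograd @ rhs^T) / scale
        return (ograd @ rhs.T) / scale

    def matmul_scale_backward_rhs_ref(lhs, ograd, scale):
        # matmul_backward_rhs_naive: rhs_grad = (lhs^T @ ograd) / scale
        return (lhs.T @ ograd) / scale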
+ */ + +#include "megbrain/custom/custom.h" + +using Tensor = custom::Tensor; + +void matmul_forward_helper( + const Tensor& lhs, const Tensor& rhs, Tensor& res, size_t M, size_t K, size_t N, + float scale); +void matmul_backward_lhs_helper( + const Tensor& rhs, const Tensor& ograd, Tensor& lhs_grad, size_t M, size_t K, + size_t N, float scale); +void matmul_backward_rhs_helper( + const Tensor& lhs, const Tensor& ograd, Tensor& rhs_grad, size_t M, size_t K, + size_t N, float scale); diff --git a/imperative/python/test/unit/core/test_custom_op.py b/imperative/python/test/unit/core/test_custom_op.py new file mode 100644 index 00000000..e2a9e4b2 --- /dev/null +++ b/imperative/python/test/unit/core/test_custom_op.py @@ -0,0 +1,111 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2021 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import os +import platform +import shutil +import sys + +import numpy as np +import pytest + +import megengine +import megengine.functional as F +import megengine.optimizer as optim +from megengine import jit +from megengine.autodiff import Function, GradManager +from megengine.core._imperative_rt.core2 import apply +from megengine.core.ops import custom +from megengine.device import get_device_count +from megengine.module import Conv2d, Linear, Module +from megengine.random import normal +from megengine.tensor import Parameter, Tensor +from megengine.utils import custom_op_tools + + +def compare(ref, real): + if ref.shape != real.shape: + real = real.T + np.testing.assert_allclose(ref, real, rtol=1e-3, atol=1e-5) + + +def build_and_clean(test_func): + def wrapper(): + cur_dir_path = os.path.dirname(os.path.abspath(__file__)) + build_path = os.path.join(cur_dir_path, "custom_opsrc", "build") + mgb_root_path = os.path.dirname( + os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(cur_dir_path))) + ) + ) + extra_include_paths = [os.path.join(mgb_root_path, "src", "custom", "include")] + extra_ld_flags = [] + + if sys.platform != "win32": + ld_path = os.environ.get("LD_LIBRARY_PATH") + if ld_path != None: + ld_dirs = ld_path.split(":") + for ld_dir in ld_dirs: + if os.path.exists(ld_dir) and os.path.isdir(ld_dir): + for lib in os.listdir(ld_dir): + if "megengine_shared" in lib: + extra_ld_flags += [ + "-L{} -Wl,-rpath,{}".format(ld_dir, ld_dir) + ] + break + + if get_device_count("gpu") > 0: + custom_opsrc = [ + os.path.join(cur_dir_path, "custom_opsrc", "matmul_scale.cpp"), + os.path.join(cur_dir_path, "custom_opsrc", "matmul_scale.cu"), + ] + else: + custom_opsrc = [os.path.join(cur_dir_path, "custom_opsrc", "elem_add.cpp")] + + lib_path = custom_op_tools.build_and_load( + "test_op", + custom_opsrc, + extra_include_paths=extra_include_paths, + extra_ldflags=extra_ld_flags, + build_dir=build_path, + verbose=False, + abi_tag=custom.get_custom_op_abi_tag(), + ) + test_func() + + custom.unload(lib_path) + if os.path.exists(build_path): + shutil.rmtree(build_path) + + return wrapper + + +@pytest.mark.skipif( + get_device_count("gpu") > 0, reason="elem_add operator is only supported on CPU" +) +@build_and_clean +def test_custom_op_cpu_build(): + assert "ElemAddSmoothForward" in custom._get_custom_op_list() + assert "ElemAddSmoothBackward" in custom._get_custom_op_list() + assert hasattr(custom, 
"ElemAddSmoothForward") + assert hasattr(custom, "ElemAddSmoothBackward") + + +@pytest.mark.skipif( + platform.system() == "Darwin", + reason="GPU kernel is only support on Linux and Windows", +) +@pytest.mark.skipif( + get_device_count("gpu") < 1, reason="matmul scale operator is only supported on GPU" +) +@build_and_clean +def test_custom_op_gpu_build(): + assert "MatMulScaleForward" in custom._get_custom_op_list() + assert "MatMulScaleBackward" in custom._get_custom_op_list() + assert hasattr(custom, "MatMulScaleForward") + assert hasattr(custom, "MatMulScaleBackward") diff --git a/imperative/python/test/unit/core/test_interpreter.py b/imperative/python/test/unit/core/test_interpreter.py index 3bbb5caf..2513c36b 100644 --- a/imperative/python/test/unit/core/test_interpreter.py +++ b/imperative/python/test/unit/core/test_interpreter.py @@ -96,6 +96,15 @@ def test_regression_2870(): (x + x).numpy() +@pytest.mark.require_ngpu(1) +def test_async_error_check(): + src = mge.tensor([[1.0, 2.0]]) + index = mge.tensor([3]) + val = F.indexing_one_hot(src, index) + with pytest.raises(RuntimeError): + val.numpy() + + # NOTE: DO NOT REMOVE THIS TEST # This is also a compatibility test for # mge.core.set_option('async_level', 0). diff --git a/imperative/python/test/unit/functional/test_functional.py b/imperative/python/test/unit/functional/test_functional.py index 1a82b30d..3176c057 100644 --- a/imperative/python/test/unit/functional/test_functional.py +++ b/imperative/python/test/unit/functional/test_functional.py @@ -59,14 +59,47 @@ def test_where(): def test_dropout(): - # test training mode - data = tensor(np.ones(10000000, dtype=np.float32)) - out = F.nn.dropout(data, 1.0 / 3.0, training=True) - assert not out.numpy().all() - - # test eval mode - out = F.nn.dropout(data, 1.0 / 3.0, training=False) - assert out.numpy().all() + from megengine.autodiff import GradManager + from megengine.core._imperative_rt.ops import set_global_rng_seed + + def test_dropout_with_shape(shape, rate): + data = tensor(np.ones(shape, dtype=np.float32)) + gm = GradManager().attach([data]) + with gm: + out = F.nn.dropout(data, rate, training=True) + gm.backward(out, tensor(np.ones(shape, dtype=np.float32))) + assert not out.numpy().all() + np.testing.assert_allclose(out.numpy(), data.grad.numpy(), 1e-7, 1e-7) + + def test_multiple_dropout(shape, rate): + data = tensor(np.ones(shape, dtype=np.float32)) + gm = GradManager().attach([data]) + with gm: + out1 = F.nn.dropout(data, rate, training=True) + out2 = F.nn.dropout(out1, rate, training=True) + out3 = F.nn.dropout(out2, rate, training=True) + gm.backward(out3, tensor(np.ones(shape, dtype=np.float32))) + np.testing.assert_allclose(out3.numpy(), data.grad.numpy(), 1e-7, 1e-7) + + def test_dropout_seed(shape, rate): + data = tensor(np.random.randn(*shape), dtype="float32") + set_global_rng_seed(111) + out1 = F.nn.dropout(data, rate, training=True) + out2 = F.nn.dropout(data, rate, training=True) + assert not (out1.numpy() == out2.numpy()).all() + + set_global_rng_seed(111) + out3 = F.nn.dropout(data, rate, training=True) + assert (out1.numpy() == out3.numpy()).all() + + set_global_rng_seed(222) + out4 = F.nn.dropout(data, rate, training=True) + assert not (out1.numpy() == out4.numpy()).all() + + test_dropout_with_shape([13, 17, 63, 21], 0.4) + test_dropout_with_shape([16, 32, 64], 0.3) + test_multiple_dropout([1024], 0.2) + test_dropout_seed([16, 32], 0.2) def test_matinv(): @@ -865,61 +898,6 @@ def test_conv1d(): ) -def test_layer_norm(): - def _layer_norm(x, 
normalized_shape, affine, weight=None, bias=None, eps=1e-5): - __layer_norm = LayerNorm(normalized_shape=normalized_shape, affine=affine) - __layer_norm.weight = weight - __layer_norm.bias = bias - return __layer_norm(x) - - def _layer_norm_numpy( - x, normalized_shape, affine, weight=None, bias=None, eps=1e-5 - ): - x_shape = x.shape - dim_delta = len(x_shape) - len(normalized_shape) - non_flatten_shape = x_shape[:dim_delta] - x = x.reshape(*non_flatten_shape, -1) - - mean = x.mean(axis=-1, keepdims=True) - var = (x ** 2).mean(axis=-1, keepdims=True) - mean * mean - - x = (x - mean) / F.sqrt(var + eps) - x = x.reshape(x_shape) - if affine: - x = weight * x + bias - - return x - - normalized_shape = (28, 28) - inp_feat = Tensor(np.random.randn(32, 64, 28, 28), dtype="float32") - weight = Tensor(np.random.randn(28, 28), dtype="float32") - bias = Tensor(np.random.randn(28, 28), dtype="float32") - - inp_feat = inp_feat + 1 - weight = weight + 1 - bias = bias - - affine = False - - outvar = F.nn.layer_norm(inp_feat, normalized_shape, affine, weight, bias) - targetvar = _layer_norm_numpy(inp_feat, normalized_shape, affine, weight, bias) - - assert abs(outvar - targetvar).mean() < 1e-7 - - # no random, affine True - normalized_shape = (28, 28) - inp_feat = Tensor(np.ones((32, 64, 28, 28)), dtype="float32") - weight = Tensor(np.ones((28, 28)), dtype="float32") - bias = Tensor(np.zeros((28, 28)), dtype="float32") - - affine = True - - outvar = F.nn.layer_norm(inp_feat, normalized_shape, affine, weight, bias) - targetvar = _layer_norm(inp_feat, normalized_shape, affine, weight, bias) - assert abs((outvar - targetvar).mean()) < 1e-7 - assert abs(outvar.mean()) < 1e-7 - - def test_batchnorm2d_autocast(): """check amp's result is equal to manually converted result""" amp.enabled = True diff --git a/imperative/python/test/unit/functional/test_loss.py b/imperative/python/test/unit/functional/test_loss.py index d46f40b6..abf4b2fe 100644 --- a/imperative/python/test/unit/functional/test_loss.py +++ b/imperative/python/test/unit/functional/test_loss.py @@ -43,7 +43,7 @@ def test_cross_entropy(): x = softmax(x) l_ref = ref(x, y) l = F.nn.cross_entropy(tensor(x, "float32"), tensor(y, "int32"), with_logits=False) - np.testing.assert_allclose(l.numpy(), l_ref) + np.testing.assert_allclose(l.numpy(), l_ref, 1e-6, 1e-6) def test_cross_entropy_reduction(): diff --git a/imperative/python/test/unit/random/test_rng.py b/imperative/python/test/unit/random/test_rng.py index a33a5840..1083e947 100644 --- a/imperative/python/test/unit/random/test_rng.py +++ b/imperative/python/test/unit/random/test_rng.py @@ -226,7 +226,7 @@ def test_UniformRNG(): out2 = m2.uniform(size=(100,)) out3 = m3.uniform(size=(100,)) - np.testing.assert_equal(out1.numpy(), out2.numpy()) + np.testing.assert_allclose(out1.numpy(), out2.numpy(), atol=1e-6) assert out1.device == "xpu0" and out2.device == "xpu1" assert not (out1.numpy() == out3.numpy()).all() assert not (out1.numpy() == out1_.numpy()).all() @@ -254,7 +254,7 @@ def test_NormalRNG(): out2 = m2.normal(size=(100,)) out3 = m3.normal(size=(100,)) - np.testing.assert_equal(out1.numpy(), out2.numpy()) + np.testing.assert_allclose(out1.numpy(), out2.numpy(), atol=1e-6) assert out1.device == "xpu0" and out2.device == "xpu1" assert not (out1.numpy() == out3.numpy()).all() assert not (out1.numpy() == out1_.numpy()).all() @@ -283,7 +283,7 @@ def test_GammaRNG(): out2 = m2.gamma(2, size=(100,)) out3 = m3.gamma(2, size=(100,)) - np.testing.assert_equal(out1.numpy(), out2.numpy()) + 
np.testing.assert_allclose(out1.numpy(), out2.numpy(), atol=1e-6) assert out1.device == "xpu0" and out2.device == "xpu1" assert not (out1.numpy() == out3.numpy()).all() assert not (out1.numpy() == out1_.numpy()).all() @@ -316,7 +316,7 @@ def test_BetaRNG(): out2 = m2.beta(2, 1, size=(100,)) out3 = m3.beta(2, 1, size=(100,)) - np.testing.assert_equal(out1.numpy(), out2.numpy()) + np.testing.assert_allclose(out1.numpy(), out2.numpy(), atol=1e-6) assert out1.device == "xpu0" and out2.device == "xpu1" assert not (out1.numpy() == out3.numpy()).all() assert not (out1.numpy() == out1_.numpy()).all() @@ -351,7 +351,7 @@ def test_PoissonRNG(): out2 = m2.poisson(lam.to("xpu1"), size=(100,)) out3 = m3.poisson(lam.to("xpu0"), size=(100,)) - np.testing.assert_equal(out1.numpy(), out2.numpy()) + np.testing.assert_allclose(out1.numpy(), out2.numpy(), atol=1e-6) assert out1.device == "xpu0" and out2.device == "xpu1" assert not (out1.numpy() == out3.numpy()).all() @@ -381,7 +381,7 @@ def test_PermutationRNG(symbolic): out2 = m2.permutation(1000) out3 = m3.permutation(1000) - np.testing.assert_equal(out1.numpy(), out2.numpy()) + np.testing.assert_allclose(out1.numpy(), out2.numpy(), atol=1e-6) assert out1.device == "xpu0" and out2.device == "xpu1" assert not (out1.numpy() == out3.numpy()).all() assert not (out1.numpy() == out1_.numpy()).all() @@ -443,7 +443,7 @@ def test_ShuffleRNG(): m2.shuffle(out2) m3.shuffle(out3) - np.testing.assert_equal(out1.numpy(), out2.numpy()) + np.testing.assert_allclose(out1.numpy(), out2.numpy(), atol=1e-6) assert out1.device == "xpu0" and out2.device == "xpu1" assert not (out1.numpy() == out3.numpy()).all() @@ -465,7 +465,7 @@ def test_seed(): set_global_seed(10) out3 = uniform(size=[10, 10]) - np.testing.assert_equal(out1.numpy(), out3.numpy()) + np.testing.assert_allclose(out1.numpy(), out3.numpy(), atol=1e-6) set_global_seed(11) out4 = uniform(size=[10, 10]) diff --git a/imperative/python/test/unit/traced_module/test_modification.py b/imperative/python/test/unit/traced_module/test_modification.py index 1a9c99f9..036924a7 100644 --- a/imperative/python/test/unit/traced_module/test_modification.py +++ b/imperative/python/test/unit/traced_module/test_modification.py @@ -377,6 +377,33 @@ def test_set_node_name(): rename("output") np.testing.assert_equal(str(graph.outputs[0]), "output") + def add_1(x): + x = x + 1 + x.name = "func_add_1" + return x + + class ModuleAdd_3(M.Module): + def forward(self, x): + x = x + 1 + x.name = "module_add_1" + x = x + 2 + return x + + setattr(traced_module, "add_3", ModuleAdd_3()) + + self = graph.inputs[0] + with graph.insert_exprs(): + x = output_node + 1 + x.name = "_add_1" + x = add_1(x) + x = self.add_3(x) + graph.replace_node({output_node: x}) + graph.compile() + + assert "_add_1" in graph._namespace.used_names + assert "func_add_1" in graph._namespace.used_names + assert "module_add_1" in traced_module.add_3.graph._namespace.used_names + def test_set_graph_name(): traced_module, x, expect = _init_module() diff --git a/imperative/python/test/unit/traced_module/test_qat_module.py b/imperative/python/test/unit/traced_module/test_qat_module.py index 6ef8764b..57a94693 100644 --- a/imperative/python/test/unit/traced_module/test_qat_module.py +++ b/imperative/python/test/unit/traced_module/test_qat_module.py @@ -109,6 +109,7 @@ def build_observered_net(net: M.Module, observer_cls): ) Q.enable_observer(qat_net) inp = Tensor(np.random.random(size=(5, 3, 32, 32))) + qat_net.eval() qat_net(inp) Q.disable_observer(qat_net) return qat_net @@ -116,6 
+117,7 @@ def build_observered_net(net: M.Module, observer_cls): def build_fakequanted_net(net: QATModule, fakequant_cls): qat_net = Q.reset_qconfig(net, get_lsq_config(fakequant_cls)) + qat_net.eval() return qat_net @@ -162,6 +164,7 @@ def test_load_param(): def _check_module(build_func: Callable): net = build_func() + net.eval() buffer = io.BytesIO() mge.save(net.state_dict(), buffer) buffer.seek(0) @@ -185,6 +188,7 @@ def test_load_param(): def test_qualname(): def _check_qualname(net): inp = Tensor(np.random.random(size=(5, 3, 32, 32))) + net.eval() traced_net = trace_module(net, inp) base_qualname = traced_net.graph.qualname for node in traced_net.graph.nodes(): diff --git a/imperative/python/test/unit/traced_module/test_trace_module.py b/imperative/python/test/unit/traced_module/test_trace_module.py index e4441c49..d3baf153 100644 --- a/imperative/python/test/unit/traced_module/test_trace_module.py +++ b/imperative/python/test/unit/traced_module/test_trace_module.py @@ -6,7 +6,7 @@ import megengine.functional as F import megengine.module as M from megengine import Tensor from megengine.module.module import Module -from megengine.traced_module import TracedModule, trace_module +from megengine.traced_module import TracedModule, enable_expr_checker, trace_module from megengine.traced_module.expr import CallFunction @@ -58,7 +58,7 @@ class MyModule4(M.Module): def test_trace_module(): - + enable_expr_checker() x = Tensor(1) m1 = MyModule1() tm1 = trace_module(m1, x) diff --git a/imperative/python/test/unit/utils/test_utils.py b/imperative/python/test/unit/utils/test_utils.py index 0ca81072..f32a3d93 100644 --- a/imperative/python/test/unit/utils/test_utils.py +++ b/imperative/python/test/unit/utils/test_utils.py @@ -1,15 +1,69 @@ +import os +import platform + import pytest -from megengine.utils.persistent_cache import _manager +from megengine.utils.persistent_cache import PersistentCacheOnServer + + +@pytest.mark.parametrize("with_flag", [True, False]) +@pytest.mark.skipif( + platform.system() not in {"Linux", "Darwin"}, + reason="redislite not implemented in windows", +) +def test_persistent_cache_redis(monkeypatch, with_flag): + import redislite + + server = redislite.Redis() + monkeypatch.delenv("MGE_FASTRUN_CACHE_TYPE", raising=False) + monkeypatch.setenv( + "MGE_FASTRUN_CACHE_URL", "redis+socket://{}".format(server.socket_file) + ) + if with_flag: + server.set("mgb-cache-flag", 1) + pc = PersistentCacheOnServer() + pc.put("test", "hello", "world") + if with_flag: + pc = PersistentCacheOnServer() + assert pc.get("test", "hello") == b"world" + assert pc.config.type == "redis" + else: + assert pc.config.type == "in-file" + + +def test_persistent_cache_file(monkeypatch, tmp_path): + monkeypatch.setenv("MGE_FASTRUN_CACHE_TYPE", "FILE") + monkeypatch.setenv("MGE_FASTRUN_CACHE_DIR", tmp_path) + pc = PersistentCacheOnServer() + pc.put("test", "store", "this") + assert pc.config.type == "in-file" + del pc + pc = PersistentCacheOnServer() + assert pc.get("test", "store") == b"this" + + +def test_persistent_cache_file_clear(monkeypatch, tmp_path): + monkeypatch.setenv("MGE_FASTRUN_CACHE_TYPE", "FILE") + monkeypatch.setenv("MGE_FASTRUN_CACHE_DIR", tmp_path) + pc = PersistentCacheOnServer() + pc_dummy = PersistentCacheOnServer() + pc.put("test", "drop", "this") + assert pc.config.type == "in-file" + del pc + # this dummy instance shouldn't override cache file + del pc_dummy + os.unlink(os.path.join(tmp_path, "cache.bin")) + pc = PersistentCacheOnServer() + assert pc.get("test", "drop") is None 
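The new tests above drive the cache purely through environment variables; the same knobs apply outside the test suite. A minimal sketch (the directory and URLs are placeholders):

    import os
    from megengine.utils.persistent_cache import PersistentCacheOnServer

    # Select the backend before the cache object is constructed.
    os.environ["MGE_FASTRUN_CACHE_TYPE"] = "FILE"           # or "MEMORY"
    os.environ["MGE_FASTRUN_CACHE_DIR"] = "/tmp/mge-cache"  # placeholder directory
    # Redis backends are selected via a URL instead:
    #   os.environ["MGE_FASTRUN_CACHE_URL"] = "redis://127.0.0.1:6379"
    #   os.environ["MGE_FASTRUN_CACHE_URL"] = "redis+socket:///var/run/redis.sock"

    pc = PersistentCacheOnServer()
    pc.put("test", "hello", "world")
    assert pc.get("test", "hello") == b"world"
    print(pc.config.type)   # "in-file" for the FILE backend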
-def test_persistent_cache(): - pc = _manager - k0 = b"\x00\x00" - k1 = b"\x00\x01" - cat = "test" - pc.put(cat, k0, k1) - pc.put(cat, k1, k0) - assert k1 == pc.get(cat, k0) - assert k0 == pc.get(cat, k1) - assert pc.get("test1", k0) == None +def test_persistent_cache_memory(monkeypatch): + monkeypatch.setenv("MGE_FASTRUN_CACHE_TYPE", "MEMORY") + pc = PersistentCacheOnServer() + assert pc.config is None + pc.put("test", "drop", "this") + assert pc.config.type == "in-memory" + assert pc.get("test", "drop") == b"this" + del pc + pc = PersistentCacheOnServer() + assert pc.get("test", "drop") is None diff --git a/imperative/src/impl/interpreter/interpreter_impl.cpp b/imperative/src/impl/interpreter/interpreter_impl.cpp index bf122982..ecf63e65 100644 --- a/imperative/src/impl/interpreter/interpreter_impl.cpp +++ b/imperative/src/impl/interpreter/interpreter_impl.cpp @@ -156,6 +156,8 @@ TensorInfo* ChannelImpl::put_impl(const HostTensorND& value, bool no_cache) { if (m_async_level == 0) { sync_impl(); info->desc.comp_node.sync(); + auto err = info->desc.comp_node.check_async_error(); + mgb_assert(!err, "%s", err->what()); } return info; } @@ -336,6 +338,8 @@ void ChannelImpl::dispatch_kernel( for (auto&& oup : *outputs) { auto info = reinterpret_cast(oup); info->ptr->comp_node().sync(); + auto err = info->ptr->comp_node().check_async_error(); + mgb_assert(!err, "%s", err->what()); } } } @@ -931,7 +935,8 @@ TensorPtr ChannelImpl::wait_tensor(TensorInfo* info, TensorProp prop) { MGB_RECORD_EVENT(TensorWaitPropEvent, info->id, m_waitee_id, prop); bool require_host = prop == TensorProp::HostValue; auto host_available = [&] { return info->ptr && info->ptr->value_fetched(); }; - if (require_host && !host_available()) { + bool wait_host = !host_available(); + if (require_host && wait_host) { // avoid dead lock lock.unlock(); m_buffer.enqueue(GetValue{info}); @@ -944,6 +949,10 @@ TensorPtr ChannelImpl::wait_tensor(TensorInfo* info, TensorProp prop) { }); MGB_RECORD_EVENT(TensorWaitPropFinishEvent, info->id, m_waitee_id, prop); m_waitee = nullptr; + if (require_host && wait_host) { + auto err = info->ptr->comp_node().check_async_error(); + mgb_assert(!err, "%s", err->what()); + } return info->ptr; } diff --git a/imperative/src/impl/ops/collective_comm.cpp b/imperative/src/impl/ops/collective_comm.cpp index 6c969a22..141f7f62 100644 --- a/imperative/src/impl/ops/collective_comm.cpp +++ b/imperative/src/impl/ops/collective_comm.cpp @@ -27,7 +27,7 @@ namespace imperative { namespace { cg::OperatorNodeBase* apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { auto&& comm = def.cast_final_safe(); - auto group_client = std::make_shared( + auto group_client = std::make_shared( ssprintf("%s:%d", comm.addr.data(), comm.port)); SmallVector> dev_buffer_arr(1, nullptr); auto disable = std::make_shared(); diff --git a/imperative/src/impl/ops/elemwise.cpp b/imperative/src/impl/ops/elemwise.cpp index 232e85ef..809b53ca 100644 --- a/imperative/src/impl/ops/elemwise.cpp +++ b/imperative/src/impl/ops/elemwise.cpp @@ -158,70 +158,71 @@ SmallVector apply_on_physical_tensor( MGB_DEFINE_OPR_CLASS( ForceInplaceElemwise, - cg::SingleCNOperatorNodeBaseT) //{ + cg::SingleCNOperatorNodeBaseT) // { public: -struct Param { - using Mode = megdnn::Elemwise::Param::Mode; - Mode mode; - size_t inplace_index; -}; -using Mode = Param::Mode; -ForceInplaceElemwise( - const VarNodeArray& inputs, Param param, OperatorNodeConfig config = {}) - : Super(inputs[0]->owner_graph(), config, "device_add_update", inputs), - 
m_param{param} { - for (auto* input : inputs) { - add_input({input}); + struct Param { + using Mode = megdnn::Elemwise::Param::Mode; + Mode mode; + size_t inplace_index; + }; + using Mode = Param::Mode; + ForceInplaceElemwise( + const VarNodeArray& inputs, Param param, OperatorNodeConfig config = {}) + : Super(inputs[0]->owner_graph(), config, "device_add_update", inputs), + m_param{param} { + for (auto* input : inputs) { + add_input({input}); + } + add_output(None) + ->set_fwd_in2out_writable_force(input(param.inplace_index)) + .add_flag(VarNode::Flag::NO_MEM_RECLAIM); } - add_output(None) - ->set_fwd_in2out_writable_force(input(param.inplace_index)) - .add_flag(VarNode::Flag::NO_MEM_RECLAIM); -} -static SymbolVar make(const VarNodeArray& inputs, Param param) { - return SymbolVar{inputs[0]}.insert_single_output_opr( - inputs, param); -} -static cg::OperatorNodeBase* shallow_copy( - const serialization::OprShallowCopyContext& ctx, - const cg::OperatorNodeBase& opr_, const VarNodeArray& inputs, - const OperatorNodeConfig& config); + static SymbolVar make(const VarNodeArray& inputs, Param param) { + return SymbolVar{inputs[0]}.insert_single_output_opr( + inputs, param); + } + static cg::OperatorNodeBase* shallow_copy( + const serialization::OprShallowCopyContext& ctx, + const cg::OperatorNodeBase& opr_, const VarNodeArray& inputs, + const OperatorNodeConfig& config); protected: -NodeProp* do_make_node_prop() const override { - auto ret = Super::do_make_node_prop(); - ret->add_flag(NodeProp::Flag::FORCE_UPDATE_INPUT_VAR); - return ret; -} -void create_megdnn_opr() override { - auto opr = DnnOprCaller::create_operator(comp_node()); - opr->param().mode = m_param.mode; - set_megdnn_opr(std::move(opr)); -} -void scn_do_execute() override { - auto to_dnnnd = [&](auto* var) { return var->dev_tensor().as_megdnn(); }; - megdnn::TensorNDArray inputs_dnnnd; - for (auto* input : input()) { - inputs_dnnnd.push_back(to_dnnnd(input)); + NodeProp* do_make_node_prop() const override { + auto ret = Super::do_make_node_prop(); + ret->add_flag(NodeProp::Flag::FORCE_UPDATE_INPUT_VAR); + return ret; } - mgb_assert( - input(m_param.inplace_index)->contain_flag(VarNode::Flag::NO_SYS_MEM_ALLOC), - "ForceInplaceElemwise cannot be applied in internal tensor"); - auto* out_dest = output(0); - auto* opr = static_cast(megdnn_opr()); - opr->exec(std::move(inputs_dnnnd), to_dnnnd(out_dest)); -} -void init_output_static_infer_desc() override { - using namespace cg::static_infer; + void create_megdnn_opr() override { + auto opr = DnnOprCaller::create_operator(comp_node()); + opr->param().mode = m_param.mode; + set_megdnn_opr(std::move(opr)); + } + void scn_do_execute() override { + auto to_dnnnd = [&](auto* var) { return var->dev_tensor().as_megdnn(); }; + megdnn::TensorNDArray inputs_dnnnd; + for (auto* input : input()) { + inputs_dnnnd.push_back(to_dnnnd(input)); + } + mgb_assert( + input(m_param.inplace_index) + ->contain_flag(VarNode::Flag::NO_SYS_MEM_ALLOC), + "ForceInplaceElemwise cannot be applied in internal tensor"); + auto* out_dest = output(0); + auto* opr = static_cast(megdnn_opr()); + opr->exec(std::move(inputs_dnnnd), to_dnnnd(out_dest)); + } + void init_output_static_infer_desc() override { + using namespace cg::static_infer; - owner_graph()->static_infer_manager().register_shape_infer( - output(0), ShapeInferDesc::make_identity(input(m_param.inplace_index))); -} + owner_graph()->static_infer_manager().register_shape_infer( + output(0), ShapeInferDesc::make_identity(input(m_param.inplace_index))); + } private: 
-Param m_param; -void record_execute_deps(ExecDependencyArray& deps) override { - record_megdnn_opr(deps); -} + Param m_param; + void record_execute_deps(ExecDependencyArray& deps) override { + record_megdnn_opr(deps); + } }; MGB_DYN_TYPE_OBJ_FINAL_IMPL(ForceInplaceElemwise); diff --git a/imperative/src/impl/ops/io_remote.cpp b/imperative/src/impl/ops/io_remote.cpp index 29b0316e..03e4d58a 100644 --- a/imperative/src/impl/ops/io_remote.cpp +++ b/imperative/src/impl/ops/io_remote.cpp @@ -28,7 +28,7 @@ namespace { cg::OperatorNodeBase* apply_on_var_node_remote_send( const OpDef& def, const VarNodeArray& inputs) { auto&& send = def.cast_final_safe(); - auto group_client = std::make_shared( + auto group_client = std::make_shared( ssprintf("%s:%d", send.addr.data(), send.port)); auto&& graph = inputs[0]->owner_graph(); @@ -44,7 +44,7 @@ cg::OperatorNodeBase* apply_on_var_node_remote_recv( auto&& recv = def.cast_final_safe(); OperatorNodeConfig config{recv.cn}; config.name(recv.make_name()); - auto group_client = std::make_shared( + auto group_client = std::make_shared( ssprintf("%s:%d", recv.addr.data(), recv.port)); auto&& graph = inputs[0]->owner_graph(); return graph->insert_opr(std::make_unique( diff --git a/imperative/src/impl/ops/rng.cpp b/imperative/src/impl/ops/rng.cpp index 232629f8..311a780a 100644 --- a/imperative/src/impl/ops/rng.cpp +++ b/imperative/src/impl/ops/rng.cpp @@ -282,6 +282,21 @@ struct OpMeth { } }; +template <> +struct OpMeth { + using DnnOp = megdnn::Dropout; + using Param = DnnOp::Param; + using OpNode = mgb::opr::Dropout; + static Param make_param(const Dropout& opdef) { + auto handle_seed = RNGDnnOpManager::get_seed(opdef.handle); + mgb_assert( + handle_seed == opdef.seed, + "inconsistent dropout seed: dropout op: %lu handle: %lu", handle_seed, + opdef.seed); + return {opdef.drop_prob, handle_seed}; + } +}; + template struct _InferLayout; @@ -482,6 +497,26 @@ SmallVector infer_output_attrs( return dests; } +template <> +SmallVector infer_output_attrs( + const OpDef& op, const SmallVector& inputs) { + SmallVector dests(2); + auto&& cn = inputs[0]->comp_node(); + + dests[0].comp_node = cn; + dests[0].layout = TensorLayout(inputs[0]->layout()); + dests[0].layout.dtype = inputs[0]->layout().dtype; + + auto get_mask_size = [&]() -> size_t { + auto dnn_handle = MegDNNHandle::get(CompNodeEnv::from_comp_node(cn)).handle(); + return dnn_handle->create_operator()->get_mask_size_in_bytes( + inputs[0]->layout()); + }; + dests[1].comp_node = cn; + dests[1].layout = TensorLayout(TensorShape({get_mask_size()}), dtype::Byte()); + return dests; +} + template std::tuple, SmallVector> infer_output_mem_desc( const OpDef& def, const SmallVector& inputs_tensors, @@ -559,6 +594,25 @@ std::tuple, bool> infer_output_attrs_fallible< return {dests, true}; } +template <> +std::tuple, bool> infer_output_attrs_fallible( + const OpDef& op, const SmallVector& inputs) { + SmallVector dests(2); + auto cn = inputs[0].comp_node; + dests[0].comp_node = cn; + dests[0].layout = TensorLayout(inputs[0].layout); + dests[0].layout.dtype = inputs[0].layout.dtype; + + auto get_mask_size = [&]() -> size_t { + auto dnn_handle = MegDNNHandle::get(CompNodeEnv::from_comp_node(cn)).handle(); + return dnn_handle->create_operator()->get_mask_size_in_bytes( + inputs[0].layout); + }; + dests[1].comp_node = cn; + dests[1].layout = TensorLayout(TensorShape({get_mask_size()}), dtype::Byte()); + return {dests, true}; +} + } // anonymous namespace Handle new_handle(CompNode comp_node, uint64_t seed) { @@ -599,6 +653,7 @@ 
REG_RNG_OP(PermutationRNG, SymbolVar) REG_RNG_OP(PoissonRNG, SymbolVar) REG_RNG_OP(BetaRNG, SymbolVar) REG_RNG_OP(ShuffleRNG, SymbolVarArray) +REG_RNG_OP(Dropout, SymbolVarArray) #undef REG_RNG_OP } // namespace mgb::imperative::rng diff --git a/imperative/src/impl/ops/specializations.cpp b/imperative/src/impl/ops/specializations.cpp index d153b429..5d3562a2 100644 --- a/imperative/src/impl/ops/specializations.cpp +++ b/imperative/src/impl/ops/specializations.cpp @@ -20,6 +20,7 @@ #include "megbrain/opr/dnn/correlation.h" #include "megbrain/opr/dnn/fake_quant.h" #include "megbrain/opr/dnn/images2neibs.h" +#include "megbrain/opr/dnn/layer_norm.h" #include "megbrain/opr/dnn/local.h" #include "megbrain/opr/dnn/lrn.h" #include "megbrain/opr/dnn/lsq.h" @@ -636,4 +637,29 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { } OP_TRAIT_REG(LRN, LRN).apply_on_var_node(apply_on_var_node).fallback(); } // namespace lrn + +namespace layer_norm { + +cg::OperatorNodeBase* apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { + auto&& op = static_cast(def); + size_t nr_inp = inputs.size(); + auto p = op.param(); + mgb_assert((nr_inp == 3 && p.affine) || (nr_inp == 1 && !p.affine)); + OperatorNodeConfig config{op.make_name()}; + if (nr_inp == 3) { + return opr::LayerNorm::make( + inputs[0], inputs[1], inputs[2], op.param(), config)[0] + .node() + ->owner_opr(); + } else { + return opr::LayerNorm::make(inputs[0], op.param(), config)[0] + .node() + ->owner_opr(); + } +} + +OP_TRAIT_REG(LayerNorm, LayerNorm).apply_on_var_node(apply_on_var_node).fallback(); + +} // namespace layer_norm + } // namespace mgb::imperative diff --git a/imperative/src/impl/persistent_cache.cpp b/imperative/src/impl/persistent_cache.cpp index ba3809ab..5fc291c8 100644 --- a/imperative/src/impl/persistent_cache.cpp +++ b/imperative/src/impl/persistent_cache.cpp @@ -27,8 +27,10 @@ public: m_local = std::make_shared(); } - bool connect(std::string ip, size_t port, std::string password) { - m_client.auth(password); + void connect(std::string ip, size_t port, std::optional password) { + if (password) { + m_client.auth(*password); + } m_client.connect( ip, port, [](const std::string& host, std::size_t port, @@ -40,16 +42,32 @@ public: } }, std::uint32_t(200)); - if (!m_client.is_connected()) { - return false; - } + mgb_assert(m_client.is_connected(), "connect failed"); auto flag = m_client.get("mgb-cache-flag"); sync(); - return flag.get().ok(); + auto is_valid = [](const cpp_redis::reply& reply) { + switch (reply.get_type()) { + case cpp_redis::reply::type::error: + case cpp_redis::reply::type::null: + return false; + case cpp_redis::reply::type::integer: + return reply.as_integer() != 0; + case cpp_redis::reply::type::simple_string: + case cpp_redis::reply::type::bulk_string: + return !reply.as_string().empty(); + case cpp_redis::reply::type::array: + return !reply.as_array().empty(); + default: + mgb_assert(false, "unknown reply type %d", (int)reply.get_type()); + } + }; + mgb_assert(is_valid(flag.get()), "invalid mgb-cache-flag"); } bool valid() const override { return m_client.is_connected(); } + void flush() override {} + mgb::Maybe get(const std::string& category, const Blob& key) override { MGB_LOCK_GUARD(m_mtx); auto mem_result = m_local->get(category, key); @@ -75,7 +93,7 @@ public: MGB_LOCK_GUARD(m_mtx); std::string key_str(static_cast(key.ptr), key.size); std::string redis_key_str; - encode(category + '@' + key_str, redis_key_str); + encode(category + '@' + key_str, redis_key_str, 24); 
std::string value_str(static_cast(value.ptr), value.size); std::string redis_value_str; encode(value_str, redis_value_str); @@ -118,18 +136,16 @@ private: class ExtendedInFilePersistentCache final : public ExtendedPersistentCache { private: - std::string m_path; + std::optional m_path; std::unique_ptr m_impl; public: ExtendedInFilePersistentCache() = default; - bool open(std::string path) { + void open(std::string path) { std::fstream file; file.open(path, std::ios::in | std::ios::binary); - if (!file.is_open()) { - return false; - } + mgb_assert(file.is_open(), "can't open file in %s", path.c_str()); std::vector bytes = { std::istreambuf_iterator(file), std::istreambuf_iterator()}; if (bytes.size()) { @@ -139,14 +155,11 @@ public: m_impl = std::make_unique(); } m_path = path; - return true; } - ~ExtendedInFilePersistentCache() { - if (m_impl) { - m_impl->dump_cache(m_path.c_str()); - } - } + void open() { m_impl = std::make_unique(); } + + ~ExtendedInFilePersistentCache() { flush(); } mgb::Maybe get(const std::string& category, const Blob& key) override { return m_impl->get(category, key); @@ -157,29 +170,64 @@ public: } std::optional clear() override { - m_impl = std::make_unique(); - m_impl->dump_cache(m_path.c_str()); + if (m_impl) { + m_impl = std::make_unique(); + if (m_path) { + m_impl->dump_cache(m_path->c_str()); + } + } return {}; } bool valid() const override { return m_impl != nullptr; } -}; -std::shared_ptr make_redis( - std::string ip, size_t port, std::string password, std::string prefix) { - auto cache = std::make_shared(prefix, 100); - if (!cache->connect(ip, port, password)) { - return nullptr; + void flush() override { + if (m_impl && m_path) { + m_impl->dump_cache(m_path->c_str()); + } } - return cache; -} +}; -std::shared_ptr make_in_file(std::string path) { - auto cache = std::make_shared(); - if (!cache->open(path)) { - return nullptr; +std::shared_ptr ExtendedPersistentCache::make_from_config( + std::string type, std::unordered_map args, + std::string& err_msg) { + try { + if (type == "redis") { + std::string prefix = args.at("prefix"); + std::optional password = args.count("password") + ? args.at("password") + : std::optional(); + auto cache = std::make_shared(prefix, 100); + if (args.count("unixsocket")) { + std::string unixsocket = args.at("unixsocket"); + cache->connect(unixsocket, 0, password); + } else { + std::string ip = args.at("hostname"); + int port = atoi(args.at("port").c_str()); + std::optional password = + args.count("password") ? args.at("password") + : std::optional(); + cache->connect(ip, port, password); + } + return cache; + } else if (type == "in-file") { + std::string path = args.at("path"); + auto cache = std::make_shared(); + cache->open(path); + return cache; + } else if (type == "in-memory") { + auto cache = std::make_shared(); + cache->open(); + return cache; + } else { + mgb_assert(false, "persistent cache type %s unsupported", type.c_str()); + } + } catch (const std::exception& exc) { + err_msg = exc.what(); + } catch (...) 
{ + err_msg = "unknown exception"; } - return cache; + return nullptr; } } // namespace mgb::imperative::persistent_cache diff --git a/imperative/src/include/megbrain/imperative/persistent_cache.h b/imperative/src/include/megbrain/imperative/persistent_cache.h index 59326bbd..4d63eaae 100644 --- a/imperative/src/include/megbrain/imperative/persistent_cache.h +++ b/imperative/src/include/megbrain/imperative/persistent_cache.h @@ -20,12 +20,12 @@ class ExtendedPersistentCache : public mgb::PersistentCache { public: virtual bool valid() const = 0; virtual std::optional clear() = 0; -}; - -std::shared_ptr make_redis( - std::string ip, size_t port, std::string password, std::string prefix); + virtual void flush() = 0; -std::shared_ptr make_in_file(std::string path); + static std::shared_ptr make_from_config( + std::string type, std::unordered_map args, + std::string& err_msg); +}; } // namespace mgb::imperative::persistent_cache // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/imperative/src/test/collective_comm.cpp b/imperative/src/test/collective_comm.cpp index 01c0829d..4a31c54b 100644 --- a/imperative/src/test/collective_comm.cpp +++ b/imperative/src/test/collective_comm.cpp @@ -20,7 +20,7 @@ TEST(TestImperative, AllReduceBasic) { REQUIRE_GPU(2); const char* server_addr = "127.0.0.1"; uint32_t port = 3456; - mgb_assert(create_zmqrpc_server(server_addr, port) > 0); + mgb_assert(opr::create_zmqrpc_server(server_addr, port) > 0); HostTensorGenerator<> gen; CompNode cn0 = CompNode::load("gpu0"), cn1 = CompNode::load("gpu1"); diff --git a/imperative/src/test/io_remote.cpp b/imperative/src/test/io_remote.cpp index 8e32f7ab..97a7b62d 100644 --- a/imperative/src/test/io_remote.cpp +++ b/imperative/src/test/io_remote.cpp @@ -20,7 +20,7 @@ TEST(TestImperative, IORemote) { REQUIRE_GPU(2); const char* server_addr = "127.0.0.1"; uint32_t port = 4567; - mgb_assert(create_zmqrpc_server(server_addr, port) > 0); + mgb_assert(opr::create_zmqrpc_server(server_addr, port) > 0); HostTensorGenerator<> gen; CompNode cn0 = CompNode::load("gpu0"), cn1 = CompNode::load("gpu1"); diff --git a/imperative/tablegen/CMakeLists.txt b/imperative/tablegen/CMakeLists.txt index 7b4a1802..f2d3ed76 100644 --- a/imperative/tablegen/CMakeLists.txt +++ b/imperative/tablegen/CMakeLists.txt @@ -1,6 +1,7 @@ # mgb tablegen executable set(TABLE_TARGET mgb-mlir-autogen) -file(GLOB_RECURSE SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.h ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) +file(GLOB_RECURSE SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.h + ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) add_executable(${TABLE_TARGET} ${SRCS}) target_include_directories(${TABLE_TARGET} PRIVATE ${MLIR_LLVM_INCLUDE_DIR}) target_link_libraries(${TABLE_TARGET} PRIVATE LLVMTableGen MLIRTableGen LLVMSupport) @@ -13,5 +14,8 @@ tablegen(MGB opdef.cpp.inl ${MGE_IR_INCLUDE_DIRS} "--gen-cpp-body") tablegen(MGB opdef.py.inl ${MGE_IR_INCLUDE_DIRS} "--gen-python-binding") tablegen(MGB opdef.cpy.inl ${MGE_IR_INCLUDE_DIRS} "--gen-python-c-extension") tablegen(MGB enum_macro.h ${MGE_IR_INCLUDE_DIRS} "--gen-enum-list-macro") -add_custom_target(mgb_opdef ALL DEPENDS opdef.h.inl opdef.cpp.inl opdef.py.inl opdef.cpy.inl enum_macro.h param_defs_tblgen) -set(MGB_OPDEF_OUT_DIR ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE) +add_custom_target(mgb_opdef ALL DEPENDS opdef.h.inl opdef.cpp.inl opdef.py.inl + opdef.cpy.inl enum_macro.h param_defs_tblgen) +set(MGB_OPDEF_OUT_DIR + ${CMAKE_CURRENT_BINARY_DIR} + PARENT_SCOPE) diff --git a/imperative/test/CMakeLists.txt 
b/imperative/test/CMakeLists.txt index debaa5c9..68fc59f1 100644 --- a/imperative/test/CMakeLists.txt +++ b/imperative/test/CMakeLists.txt @@ -5,46 +5,60 @@ file(GLOB_RECURSE SOURCES ../src/test/*.cpp ../src/impl/*.cpp ${MGB_TEST_DIR}/*. # disable distributed tests if(NOT MGE_WITH_DISTRIBUTED) - list(FILTER SOURCES EXCLUDE REGEX ".*test/collective_comm.cpp") - list(FILTER SOURCES EXCLUDE REGEX ".*test/io_remote.cpp") + list(FILTER SOURCES EXCLUDE REGEX ".*test/collective_comm.cpp") + list(FILTER SOURCES EXCLUDE REGEX ".*test/io_remote.cpp") endif() # TODO: turn python binding into a static/object library add_executable(imperative_test ${SOURCES} ${SRCS}) add_dependencies(imperative_test mgb_opdef) -target_include_directories(imperative_test PRIVATE ${MGB_TEST_DIR}/include ../src/include ${MGB_OPDEF_OUT_DIR} ${CPP_REDIS_INCLUDES}) +target_include_directories( + imperative_test PRIVATE ${MGB_TEST_DIR}/include ../src/include ${MGB_OPDEF_OUT_DIR} + ${CPP_REDIS_INCLUDES}) # Python binding -target_include_directories(imperative_test PRIVATE ${MODULE_SRC_INCLUDE} ${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR}) +target_include_directories( + imperative_test PRIVATE ${MODULE_SRC_INCLUDE} ${PYTHON_INCLUDE_DIRS} + ${NUMPY_INCLUDE_DIR}) target_compile_definitions(imperative_test PRIVATE MODULE_NAME=C) target_compile_options(imperative_test PRIVATE -Wno-unused-parameter) -set(LINK_LIBS megbrain megdnn ${MGE_CUDA_LIBS} gtest gmock pybind11::embed range-v3 nlohmann_json::nlohmann_json) +set(LINK_LIBS + megbrain + megdnn + ${MGE_CUDA_LIBS} + gtest + gmock + pybind11::embed + range-v3 + nlohmann_json::nlohmann_json) if(MGE_WITH_CUDA) - list(APPEND LINK_LIBS cudart) + list(APPEND LINK_LIBS cudart) endif() if(MGE_WITH_DISTRIBUTED) - list(APPEND LINK_LIBS megray) + list(APPEND LINK_LIBS megray) endif() target_link_libraries(imperative_test ${LINK_LIBS}) if(CXX_SUPPORT_WCLASS_MEMACCESS) - if(MGE_WITH_CUDA) - target_compile_options(imperative_test PRIVATE "$<$:-Xcompiler=-Wno-class-memaccess>" - "$<$>:-Wno-class-memaccess>") - else() - target_compile_options(imperative_test PRIVATE "-Wno-class-memaccess") - endif() + if(MGE_WITH_CUDA) + target_compile_options( + imperative_test + PRIVATE "$<$:-Xcompiler=-Wno-class-memaccess>" + "$<$>:-Wno-class-memaccess>") + else() + target_compile_options(imperative_test PRIVATE "-Wno-class-memaccess") + endif() endif() if(UNIX) - if(APPLE OR ANDROID) - target_link_libraries(imperative_test dl) - else() - target_link_libraries(imperative_test dl rt) - endif() + if(APPLE OR ANDROID) + target_link_libraries(imperative_test dl) + else() + target_link_libraries(imperative_test dl rt) + endif() endif() install(TARGETS imperative_test RUNTIME DESTINATION test) diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index b0966ea8..69b54eb3 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -8,155 +8,185 @@ set(LITE_ENABLE_EXCEPTION ${MGE_ENABLE_EXCEPTIONS}) set(LITE_ASSERT_LOC ${MGB_ASSERT_LOC}) if(NOT MGB_WITH_FLATBUFFERS) - include(../cmake/flatbuffers.cmake) + include(../cmake/flatbuffers.cmake) endif() file(GLOB_RECURSE SRC_FBS src/**/*.fbs) build_flatbuffers( - "${SRC_FBS}" - "" - lite_fbs_generate - "" - "${CMAKE_CURRENT_BINARY_DIR}" - "" - "" - ) + "${SRC_FBS}" + "" + lite_fbs_generate + "" + "${CMAKE_CURRENT_BINARY_DIR}" + "" + "") file(GLOB_RECURSE SOURCES_LITE src/*.cpp src/*.cc lite-c/*.cpp) if(MGE_WITH_MINIMUM_SIZE) - set(LITE_ENABLE_LOGGING OFF) - set(LITE_ENABLE_EXCEPTION OFF) + set(LITE_ENABLE_LOGGING OFF) + set(LITE_ENABLE_EXCEPTION OFF) endif() -# Write out 
lite_build_config.h -# It defines macros needed by lite -configure_file(src/lite_build_config.h.in ${CMAKE_CURRENT_BINARY_DIR}/genfiles/lite_build_config.h) -install(FILES ${CMAKE_CURRENT_BINARY_DIR}/genfiles/lite_build_config.h DESTINATION ${CMAKE_INSTALL_PREFIX}/lite/include) +# Write out lite_build_config.h It defines macros needed by lite +configure_file(src/lite_build_config.h.in + ${CMAKE_CURRENT_BINARY_DIR}/genfiles/lite_build_config.h) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/genfiles/lite_build_config.h + DESTINATION ${CMAKE_INSTALL_PREFIX}/lite/include) # begin config lite -if(LITE_BUILD_WITH_MGE AND LITE_WITH_CUDA AND NOT WIN32) - # FXIME third_party cpp redis do not support build with clang-cl - list(APPEND SOURCES_LITE ${CPP_REDIS_SRCS}) +if(LITE_BUILD_WITH_MGE + AND LITE_WITH_CUDA + AND NOT WIN32) + # FXIME third_party cpp redis do not support build with clang-cl + list(APPEND SOURCES_LITE ${CPP_REDIS_SRCS}) endif() add_library(lite_static STATIC ${SOURCES_LITE}) add_dependencies(lite_static lite_fbs_generate) include_directories($) if(LITE_BUILD_WITH_MGE) - target_link_libraries(lite_static PRIVATE megbrain megdnn ${MGE_CUDA_LIBS}) - add_compile_definitions(LITE_BUILD_WITH_MGE=1) - message(STATUS "build lite with MegEngine.") + target_link_libraries(lite_static PRIVATE megbrain megdnn ${MGE_CUDA_LIBS}) + add_compile_definitions(LITE_BUILD_WITH_MGE=1) + message(STATUS "build lite with MegEngine.") else() - target_link_libraries(lite_static PUBLIC flatbuffers) + target_link_libraries(lite_static PUBLIC flatbuffers) endif() include_directories( - PUBLIC $ - PUBLIC $ - PUBLIC $ - PUBLIC $ - PUBLIC $ - PUBLIC $ - ) + PUBLIC + $ + PUBLIC + $ + PUBLIC + $ + PUBLIC + $ + PUBLIC + $ + PUBLIC + $) # end config lite # define a shared lib add_library(lite_shared SHARED $) if(LITE_BUILD_WITH_MGE) - target_link_libraries(lite_shared PRIVATE megbrain megdnn ${MGE_CUDA_LIBS}) + target_link_libraries(lite_shared PRIVATE megbrain megdnn ${MGE_CUDA_LIBS}) endif() if(ANDROID) - link_libraries(log) - target_link_libraries(lite_static PRIVATE log) - target_link_libraries(lite_shared PRIVATE log) + link_libraries(log) + target_link_libraries(lite_static PRIVATE log) + target_link_libraries(lite_shared PRIVATE log) endif() # define a shared lib for whl add_library(lite_shared_whl SHARED $) if(LITE_BUILD_WITH_MGE) - if (IOS) - target_link_libraries(lite_shared_whl PRIVATE megbrain megdnn ${MGE_CUDA_LIBS}) - else() - target_link_libraries(lite_shared_whl PRIVATE megengine_shared) - endif() + if(IOS) + target_link_libraries(lite_shared_whl PRIVATE megbrain megdnn ${MGE_CUDA_LIBS}) + else() + target_link_libraries(lite_shared_whl PRIVATE megengine_shared) + endif() endif() if(ANDROID) - target_link_libraries(lite_shared_whl PRIVATE log) + target_link_libraries(lite_shared_whl PRIVATE log) endif() -# add lite_static_all_in_one same name build by BUILD -# please do not change flatbuffers/cpuinfo/clog/lite_static order, if change!, cmake -# can not gen flatbuffers/cpuinfo/clog OBJs to lite_static_all_in_one, this may cmake issue -# NOTICE: this target always use to separate build with lite, if build lite via include +# add lite_static_all_in_one same name build by BUILD please do not change +# flatbuffers/cpuinfo/clog/lite_static order, if change!, cmake can not gen +# flatbuffers/cpuinfo/clog OBJs to lite_static_all_in_one, this may cmake issue NOTICE: +# this target always use to separate build with lite, if build lite via include # MegEngine/megbrain ROOT_DIR/CMakeLists.txt, just depends lite_static 
or lite_shared -#TODO: need refine lite_static_all_in_one depend objects, but now cmake do not support +# TODO: need refine lite_static_all_in_one depend objects, but now cmake do not support # define a add_library which OBJECTS args is a set or list or string -if (MGE_ENABLE_CPUINFO AND MGE_WITH_OPENCL) - add_library(lite_static_all_in_one STATIC $ $ $ $ $) -elseif (MGE_ENABLE_CPUINFO AND NOT MGE_WITH_OPENCL) - add_library(lite_static_all_in_one STATIC $ $ $ $) -elseif (NOT MGE_ENABLE_CPUINFO AND MGE_WITH_OPENCL) - add_library(lite_static_all_in_one STATIC $ $ $) +if(MGE_ENABLE_CPUINFO AND MGE_WITH_OPENCL) + add_library( + lite_static_all_in_one STATIC + $ $ $ + $ $) +elseif(MGE_ENABLE_CPUINFO AND NOT MGE_WITH_OPENCL) + add_library( + lite_static_all_in_one STATIC + $ $ $ + $) +elseif(NOT MGE_ENABLE_CPUINFO AND MGE_WITH_OPENCL) + add_library( + lite_static_all_in_one STATIC + $ $ + $) else() - add_library(lite_static_all_in_one STATIC $ $) + add_library(lite_static_all_in_one STATIC $ + $) endif() if(LITE_BUILD_WITH_MGE) - target_link_libraries(lite_static_all_in_one PRIVATE megbrain megdnn ${MGE_CUDA_LIBS}) + target_link_libraries(lite_static_all_in_one PRIVATE megbrain megdnn ${MGE_CUDA_LIBS}) endif() -if(LITE_BUILD_WITH_MGE AND LITE_WITH_CUDA AND NOT WIN32) - # FXIME third_party cpp redis do not support build with clang-cl - target_include_directories(lite_static PRIVATE ${CPP_REDIS_INCLUDES}) - target_include_directories(lite_shared PRIVATE ${CPP_REDIS_INCLUDES}) - target_include_directories(lite_shared_whl PRIVATE ${CPP_REDIS_INCLUDES}) - target_include_directories(lite_static_all_in_one PRIVATE ${CPP_REDIS_INCLUDES}) +if(LITE_BUILD_WITH_MGE + AND LITE_WITH_CUDA + AND NOT WIN32) + # FXIME third_party cpp redis do not support build with clang-cl + target_include_directories(lite_static PRIVATE ${CPP_REDIS_INCLUDES}) + target_include_directories(lite_shared PRIVATE ${CPP_REDIS_INCLUDES}) + target_include_directories(lite_shared_whl PRIVATE ${CPP_REDIS_INCLUDES}) + target_include_directories(lite_static_all_in_one PRIVATE ${CPP_REDIS_INCLUDES}) endif() -set(LITE_VERSION_SCRIPT ${PROJECT_SOURCE_DIR}/lite/src/version_lite.ld CACHE INTERNAL "Path to linker version script") +set(LITE_VERSION_SCRIPT + ${PROJECT_SOURCE_DIR}/lite/src/version_lite.ld + CACHE INTERNAL "Path to linker version script") add_custom_target(_lite_version_ld SOURCES ${LITE_VERSION_SCRIPT}) if(NOT MSVC AND NOT WIN32) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden") endif() -#TODO: implemente version script for other OS -if (UNIX AND NOT APPLE) - target_link_options(lite_shared PRIVATE -Wl,--version-script=${LITE_VERSION_SCRIPT}) - set_target_properties(lite_shared PROPERTIES LINK_DEPENDS ${LITE_VERSION_SCRIPT}) - target_link_options(lite_shared_whl PRIVATE -Wl,--version-script=${LITE_VERSION_SCRIPT}) - set_target_properties(lite_shared_whl PROPERTIES LINK_DEPENDS ${LITE_VERSION_SCRIPT}) +# TODO: implemente version script for other OS +if(UNIX AND NOT APPLE) + target_link_options(lite_shared PRIVATE -Wl,--version-script=${LITE_VERSION_SCRIPT}) + set_target_properties(lite_shared PROPERTIES LINK_DEPENDS ${LITE_VERSION_SCRIPT}) + target_link_options(lite_shared_whl PRIVATE + -Wl,--version-script=${LITE_VERSION_SCRIPT}) + set_target_properties(lite_shared_whl PROPERTIES LINK_DEPENDS ${LITE_VERSION_SCRIPT}) endif() # config 
install -install(TARGETS lite_static - LIBRARY DESTINATION lite/lib/${MGE_ARCH} - FRAMEWORK DESTINATION lite/lib/${MGE_ARCH} - ARCHIVE DESTINATION lite/lib/${MGE_ARCH}) - -install(TARGETS lite_shared - LIBRARY DESTINATION lite/lib/${MGE_ARCH} - FRAMEWORK DESTINATION lite/lib/${MGE_ARCH} - ARCHIVE DESTINATION lite/lib/${MGE_ARCH} - ) - -install(TARGETS lite_static_all_in_one - LIBRARY DESTINATION lite/lib/${MGE_ARCH} - FRAMEWORK DESTINATION lite/lib/${MGE_ARCH} - ARCHIVE DESTINATION lite/lib/${MGE_ARCH}) +install( + TARGETS lite_static + LIBRARY DESTINATION lite/lib/${MGE_ARCH} + FRAMEWORK DESTINATION lite/lib/${MGE_ARCH} + ARCHIVE DESTINATION lite/lib/${MGE_ARCH}) + +install( + TARGETS lite_shared + LIBRARY DESTINATION lite/lib/${MGE_ARCH} + FRAMEWORK DESTINATION lite/lib/${MGE_ARCH} + ARCHIVE DESTINATION lite/lib/${MGE_ARCH}) + +install( + TARGETS lite_static_all_in_one + LIBRARY DESTINATION lite/lib/${MGE_ARCH} + FRAMEWORK DESTINATION lite/lib/${MGE_ARCH} + ARCHIVE DESTINATION lite/lib/${MGE_ARCH}) install(FILES ${PROJECT_SOURCE_DIR}/lite/include/lite/common_enum_c.h - DESTINATION ${CMAKE_INSTALL_PREFIX}/lite/include/lite-c) + DESTINATION ${CMAKE_INSTALL_PREFIX}/lite/include/lite-c) -install(DIRECTORY ${PROJECT_SOURCE_DIR}/lite/include - DESTINATION ${CMAKE_INSTALL_PREFIX}/lite FILES_MATCHING PATTERN "*.h") +install( + DIRECTORY ${PROJECT_SOURCE_DIR}/lite/include + DESTINATION ${CMAKE_INSTALL_PREFIX}/lite + FILES_MATCHING + PATTERN "*.h") -install(DIRECTORY ${PROJECT_SOURCE_DIR}/lite/lite-c/include - DESTINATION ${CMAKE_INSTALL_PREFIX}/lite FILES_MATCHING PATTERN "*.h") +install( + DIRECTORY ${PROJECT_SOURCE_DIR}/lite/lite-c/include + DESTINATION ${CMAKE_INSTALL_PREFIX}/lite + FILES_MATCHING + PATTERN "*.h") add_subdirectory(example) if(MGE_WITH_TEST) - add_subdirectory(test) + add_subdirectory(test) endif() -#load_and_run +# load_and_run add_subdirectory(load_and_run) # tools and example @@ -164,11 +194,12 @@ add_executable(rc4_encryptor tools/rc4_encrypt.cpp) target_link_libraries(rc4_encryptor lite_static) if(LITE_BUILD_WITH_MGE AND MGE_WITH_ROCM) - # FIXME: hip obj can not find cpp obj only through lite_static - target_link_libraries(rc4_encryptor megdnn) + # FIXME: hip obj can not find cpp obj only through lite_static + target_link_libraries(rc4_encryptor megdnn) endif() -target_include_directories(rc4_encryptor PRIVATE - {PROJECT_SOURCE_DIR}/lite/src/decryption) -install (TARGETS rc4_encryptor - EXPORT ${LITE_EXPORT_TARGETS} - RUNTIME DESTINATION lite/tools) +target_include_directories(rc4_encryptor + PRIVATE {PROJECT_SOURCE_DIR}/lite/src/decryption) +install( + TARGETS rc4_encryptor + EXPORT ${LITE_EXPORT_TARGETS} + RUNTIME DESTINATION lite/tools) diff --git a/lite/example/c_example/CMakeLists.txt b/lite/example/c_example/CMakeLists.txt index 141725e5..b0a0d0bd 100644 --- a/lite/example/c_example/CMakeLists.txt +++ b/lite/example/c_example/CMakeLists.txt @@ -1,44 +1,46 @@ add_executable(lite_c_examples ./main.c) if(LITE_BUILD_WITH_RKNPU) - #rknn sdk1.0.0 depend on libc++_shared, use gold to remove NEEDED so symbol check - target_link_options(lite_c_examples PRIVATE "-fuse-ld=gold") + # rknn sdk1.0.0 depend on libc++_shared, use gold to remove NEEDED so symbol check + target_link_options(lite_c_examples PRIVATE "-fuse-ld=gold") endif() target_link_libraries(lite_c_examples lite_static) if(LITE_BUILD_WITH_MGE AND MGE_WITH_ROCM) - # FIXME: hip obj can not find cpp obj only through lite_static - target_link_libraries(lite_c_examples megdnn) + # FIXME: hip obj can not find cpp obj only 
through lite_static + target_link_libraries(lite_c_examples megdnn) endif() if(UNIX) - if(APPLE OR ANDROID) - target_link_libraries(lite_c_examples dl) - else() - target_link_libraries(lite_c_examples dl rt) - endif() + if(APPLE OR ANDROID) + target_link_libraries(lite_c_examples dl) + else() + target_link_libraries(lite_c_examples dl rt) + endif() endif() -install (TARGETS lite_c_examples - EXPORT ${LITE_EXPORT_TARGETS} - RUNTIME DESTINATION lite/bin) +install( + TARGETS lite_c_examples + EXPORT ${LITE_EXPORT_TARGETS} + RUNTIME DESTINATION lite/bin) # add lite_examples_depends_shared for CI check symbol export valid -add_executable(lite_c_examples_depends_shared ./main.c) +add_executable(lite_c_examples_depends_shared ./main.c) if(LITE_BUILD_WITH_RKNPU) - #rknn sdk1.0.0 depend on libc++_shared, use gold to remove NEEDED so symbol check - target_link_options(lite_c_examples_depends_shared PRIVATE "-fuse-ld=gold") + # rknn sdk1.0.0 depend on libc++_shared, use gold to remove NEEDED so symbol check + target_link_options(lite_c_examples_depends_shared PRIVATE "-fuse-ld=gold") endif() target_link_libraries(lite_c_examples_depends_shared lite_shared) if(UNIX) - if(APPLE OR ANDROID) - target_link_libraries(lite_c_examples_depends_shared dl) - else() - target_link_libraries(lite_c_examples_depends_shared dl rt) - endif() + if(APPLE OR ANDROID) + target_link_libraries(lite_c_examples_depends_shared dl) + else() + target_link_libraries(lite_c_examples_depends_shared dl rt) + endif() endif() -install (TARGETS lite_c_examples_depends_shared - EXPORT ${LITE_EXPORT_TARGETS} - RUNTIME DESTINATION lite/bin) +install( + TARGETS lite_c_examples_depends_shared + EXPORT ${LITE_EXPORT_TARGETS} + RUNTIME DESTINATION lite/bin) diff --git a/lite/example/cpp_example/CMakeLists.txt b/lite/example/cpp_example/CMakeLists.txt index f7227b62..1649c4b1 100644 --- a/lite/example/cpp_example/CMakeLists.txt +++ b/lite/example/cpp_example/CMakeLists.txt @@ -1,49 +1,51 @@ -file (GLOB_RECURSE SOURCES ./*.cpp) -add_executable(lite_examples ${SOURCES}) +file(GLOB_RECURSE SOURCES ./*.cpp) +add_executable(lite_examples ${SOURCES}) target_include_directories(lite_examples PUBLIC ./) if(LITE_BUILD_WITH_RKNPU) - #rknn sdk1.0.0 depend on libc++_shared, use gold to remove NEEDED so symbol check - target_link_options(lite_examples PRIVATE "-fuse-ld=gold") + # rknn sdk1.0.0 depend on libc++_shared, use gold to remove NEEDED so symbol check + target_link_options(lite_examples PRIVATE "-fuse-ld=gold") endif() target_link_libraries(lite_examples lite_static) if(LITE_BUILD_WITH_MGE AND MGE_WITH_ROCM) - # FIXME: hip obj can not find cpp obj only through lite_static - target_link_libraries(lite_examples megdnn) + # FIXME: hip obj can not find cpp obj only through lite_static + target_link_libraries(lite_examples megdnn) endif() if(UNIX) - if(APPLE OR ANDROID) - target_link_libraries(lite_examples dl) - else() - target_link_libraries(lite_examples dl rt) - endif() + if(APPLE OR ANDROID) + target_link_libraries(lite_examples dl) + else() + target_link_libraries(lite_examples dl rt) + endif() endif() -install (TARGETS lite_examples - EXPORT ${LITE_EXPORT_TARGETS} - RUNTIME DESTINATION lite/bin) +install( + TARGETS lite_examples + EXPORT ${LITE_EXPORT_TARGETS} + RUNTIME DESTINATION lite/bin) # add lite_examples_depends_shared for CI check symbol export valid -add_executable(lite_examples_depends_shared ${SOURCES}) +add_executable(lite_examples_depends_shared ${SOURCES}) if(LITE_BUILD_WITH_RKNPU) - #rknn sdk1.0.0 depend on libc++_shared, use gold 
to remove NEEDED so symbol check - target_link_options(lite_examples_depends_shared PRIVATE "-fuse-ld=gold") + # rknn sdk1.0.0 depend on libc++_shared, use gold to remove NEEDED so symbol check + target_link_options(lite_examples_depends_shared PRIVATE "-fuse-ld=gold") endif() target_link_libraries(lite_examples_depends_shared lite_shared) target_include_directories(lite_examples_depends_shared PUBLIC ./) if(UNIX) - if(APPLE OR ANDROID) - target_link_libraries(lite_examples_depends_shared dl) - else() - target_link_libraries(lite_examples_depends_shared dl rt) - endif() + if(APPLE OR ANDROID) + target_link_libraries(lite_examples_depends_shared dl) + else() + target_link_libraries(lite_examples_depends_shared dl rt) + endif() endif() -install (TARGETS lite_examples_depends_shared - EXPORT ${LITE_EXPORT_TARGETS} - RUNTIME DESTINATION lite/bin) +install( + TARGETS lite_examples_depends_shared + EXPORT ${LITE_EXPORT_TARGETS} + RUNTIME DESTINATION lite/bin) diff --git a/lite/load_and_run/CMakeLists.txt b/lite/load_and_run/CMakeLists.txt index d7b5b9d9..6ef5279a 100644 --- a/lite/load_and_run/CMakeLists.txt +++ b/lite/load_and_run/CMakeLists.txt @@ -1,55 +1,62 @@ # BUILD the load and run for lite -include_directories(PUBLIC $) -file (GLOB_RECURSE SOURCES ./*.cpp) +include_directories(PUBLIC + $) +file(GLOB_RECURSE SOURCES ./*.cpp) -add_executable (load_and_run ${SOURCES}) +add_executable(load_and_run ${SOURCES}) target_link_libraries(load_and_run lite_static) target_link_libraries(load_and_run megbrain) target_link_libraries(load_and_run gflags) if(LITE_BUILD_WITH_RKNPU) - #rknn sdk1.0.0 depend on libc++_shared, use gold to remove NEEDED so symbol check - target_link_options(load_and_run PRIVATE "-fuse-ld=gold") + # rknn sdk1.0.0 depend on libc++_shared, use gold to remove NEEDED so symbol check + target_link_options(load_and_run PRIVATE "-fuse-ld=gold") endif() if(MGE_WITH_ROCM) - # FIXME: hip obj can not find cpp obj only through lite_static - target_link_libraries(load_and_run megdnn) + # FIXME: hip obj can not find cpp obj only through lite_static + target_link_libraries(load_and_run megdnn) endif() if(UNIX) - if(APPLE OR ANDROID) - target_link_libraries(load_and_run dl) - else() - target_link_libraries(load_and_run dl rt) - endif() + if(APPLE OR ANDROID) + target_link_libraries(load_and_run dl) + else() + target_link_libraries(load_and_run dl rt) + endif() endif() -install (TARGETS load_and_run EXPORT ${LITE_EXPORT_TARGETS} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) +install( + TARGETS load_and_run + EXPORT ${LITE_EXPORT_TARGETS} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) if(BUILD_SHARED_LIBS) - add_executable(load_and_run_depends_shared ${SOURCES}) - target_link_libraries(load_and_run_depends_shared lite_shared) - target_link_libraries(load_and_run_depends_shared gflags) - target_link_libraries(load_and_run_depends_shared megengine) - - if(LITE_BUILD_WITH_RKNPU) - #rknn sdk1.0.0 depend on libc++_shared, use gold to remove NEEDED so symbol check - target_link_options(load_and_run_depends_shared PRIVATE "-fuse-ld=gold") - endif() + add_executable(load_and_run_depends_shared ${SOURCES}) + target_link_libraries(load_and_run_depends_shared lite_shared) + target_link_libraries(load_and_run_depends_shared gflags) + target_link_libraries(load_and_run_depends_shared megengine) - if(MGE_WITH_ROCM) - # FIXME: hip obj can not find cpp obj only through lite_static - target_link_libraries(load_and_run_depends_shared megdnn) - endif() + if(LITE_BUILD_WITH_RKNPU) + # rknn sdk1.0.0 depend on 
libc++_shared, use gold to remove NEEDED so symbol check + target_link_options(load_and_run_depends_shared PRIVATE "-fuse-ld=gold") + endif() + + if(MGE_WITH_ROCM) + # FIXME: hip obj can not find cpp obj only through lite_static + target_link_libraries(load_and_run_depends_shared megdnn) + endif() - if(UNIX) - if(APPLE OR ANDROID) - target_link_libraries(load_and_run_depends_shared dl) - else() - target_link_libraries(load_and_run_depends_shared dl rt) - endif() + if(UNIX) + if(APPLE OR ANDROID) + target_link_libraries(load_and_run_depends_shared dl) + else() + target_link_libraries(load_and_run_depends_shared dl rt) endif() + endif() - install(TARGETS load_and_run_depends_shared EXPORT ${MGE_EXPORT_TARGETS} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) + install( + TARGETS load_and_run_depends_shared + EXPORT ${MGE_EXPORT_TARGETS} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) endif() diff --git a/lite/load_and_run/src/helpers/data_parser.cpp b/lite/load_and_run/src/helpers/data_parser.cpp index 0ba71626..d5b7518e 100644 --- a/lite/load_and_run/src/helpers/data_parser.cpp +++ b/lite/load_and_run/src/helpers/data_parser.cpp @@ -30,7 +30,12 @@ void DataParser::feed(const std::string& path) { } auto endWith = [blob_string](std::string suffix) -> bool { - return blob_string.rfind(suffix) == (blob_string.length() - suffix.length()); + const auto index = blob_string.rfind(suffix); + if (index != std::string::npos and + index == blob_string.length() - suffix.length()) { + return true; + } + return false; }; if (endWith(".ppm") || endWith(".pgm")) { diff --git a/lite/pylite/megenginelite/__init__.py b/lite/pylite/megenginelite/__init__.py index 95c22633..ec14aeec 100644 --- a/lite/pylite/megenginelite/__init__.py +++ b/lite/pylite/megenginelite/__init__.py @@ -8,6 +8,7 @@ # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
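The data_parser.cpp hunk above fixes a suffix check: std::string::rfind returns npos on a miss, and for a blob exactly one character shorter than the suffix the unsigned subtraction blob_string.length() - suffix.length() also wraps to npos, so the old endWith reported a false match. A minimal Python illustration of the same pitfall (str.rfind returns -1 on a miss, which lines up with len(blob) - len(suffix) in exactly that case):

def end_with_buggy(blob, suffix):
    # old logic: compare rfind() with len(blob) - len(suffix) without
    # first checking that the suffix was found at all
    return blob.rfind(suffix) == len(blob) - len(suffix)

def end_with_fixed(blob, suffix):
    idx = blob.rfind(suffix)
    return idx != -1 and idx == len(blob) - len(suffix)

# ".pp" does not end with ".ppm", but rfind gives -1 and 3 - 4 is also -1
assert end_with_buggy(".pp", ".ppm")
assert not end_with_fixed(".pp", ".ppm")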
from .base import * +from .base import version as __version__ from .global_setting import * from .network import * from .struct import * diff --git a/lite/pylite/megenginelite/network.py b/lite/pylite/megenginelite/network.py index 1106079a..c8072791 100644 --- a/lite/pylite/megenginelite/network.py +++ b/lite/pylite/megenginelite/network.py @@ -69,7 +69,9 @@ class LiteOptions(Structure): "const_shape": bool(self.const_shape), "force_dynamic_alloc": bool(self.force_dynamic_alloc), "force_output_dynamic_alloc": bool(self.force_output_dynamic_alloc), - "force_output_nocopy": bool(self.force_output_nocopy), + "force_output_use_user_specified_memory": bool( + self.force_output_use_user_specified_memory + ), "no_profiling_on_shape_change": bool(self.no_profiling_on_shape_change), "jit_level": self.jit_level, "comp_node_seq_record_level": self.comp_node_seq_record_level, @@ -99,7 +101,7 @@ class LiteConfig(Structure): ("device_id", c_int), ("device_type", c_int), ("backend", c_int), - ("bare_model_cryption_name", c_char_p), + ("_bare_model_cryption_name", c_char_p), ("options", LiteOptions), ] @@ -110,18 +112,30 @@ class LiteConfig(Structure): else: self.options = LiteOptions() - self.bare_model_cryption_name = c_char_p(b"") + self._bare_model_cryption_name = c_char_p(b"") self.use_loader_dynamic_param = 0 self.has_compression = 0 self.backend = LiteBackend.LITE_DEFAULT + @property + def bare_model_cryption_name(self): + return self._bare_model_cryption_name.decode("utf-8") + + @bare_model_cryption_name.setter + def bare_model_cryption_name(self, name): + if isinstance(name, str): + self._bare_model_cryption_name = name.encode("utf-8") + else: + assert isinstance(name, bytes), "name should be str or bytes type." + self._bare_model_cryption_name = name + def __repr__(self): data = { "has_compression": bool(self.has_compression), "device_id": LiteDeviceType(self.device_id), "device_type": LiteDeviceType(self.device_type), "backend": LiteBackend(self.backend), - "bare_model_cryption_name": self.bare_model_cryption_name.decode("utf-8"), + "bare_model_cryption_name": self.bare_model_cryption_name, "options": self.options, } return data.__repr__() @@ -149,7 +163,7 @@ class LiteIO(Structure): """ _fields_ = [ - ("name", c_char_p), + ("_name", c_char_p), ("is_host", c_int), ("io_type", c_int), ("config_layout", LiteLayout), @@ -159,9 +173,9 @@ class LiteIO(Structure): self, name, is_host=True, io_type=LiteIOType.LITE_IO_VALUE, layout=None ): if type(name) == str: - self.name = c_char_p(name.encode("utf-8")) + self._name = c_char_p(name.encode("utf-8")) else: - self.name = c_char_p(name) + self._name = c_char_p(name) if layout: self.config_layout = layout @@ -171,6 +185,18 @@ class LiteIO(Structure): self.is_host = is_host self.io_type = io_type + @property + def name(self): + return self._name.decode("utf-8") + + @name.setter + def name(self, name): + if isinstance(name, str): + self._name = name.encode("utf-8") + else: + assert isinstance(name, bytes), "name should be str or bytes type." 
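With the properties added above, bare_model_cryption_name and LiteIO.name are plain Python str on the surface while the ctypes structures keep c_char_p fields underneath. A short usage sketch, assuming the megenginelite package built from this patch is importable:

from megenginelite import LiteConfig, LiteIO, LiteIOType

config = LiteConfig()
config.bare_model_cryption_name = "nothing"   # str is utf-8 encoded internally
assert config.bare_model_cryption_name == "nothing"

io = LiteIO("data", is_host=False, io_type=LiteIOType.LITE_IO_VALUE)
assert io.name == "data"     # decoded from the underlying c_char_p
io.name = b"data0"           # bytes are stored as-is
io.name = "data1"            # str is encoded for you
assert io.name == "data1"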
+ self._name = name + def __repr__(self): data = { "name": self.name, @@ -208,17 +234,45 @@ class LiteNetworkIO(object): the input and output information for user to construct _LiteNetWorkIO """ - def __init__(self): + def __init__(self, inputs=None, outputs=None): self.inputs = [] self.outputs = [] + if inputs: + for i in inputs: + if isinstance(i, list): + self.inputs.append(LiteIO(*i)) + else: + assert isinstance( + i, LiteIO + ), "the param to construct LiteNetworkIO must be list of the LiteIO member or the LiteIO." + self.inputs.append(i) + if outputs: + for i in outputs: + if isinstance(i, list): + self.outputs.append(LiteIO(*i)) + else: + assert isinstance( + i, LiteIO + ), "the param to construct LiteNetworkIO must be list of the LiteIO member or the LiteIO." + self.outputs.append(i) + + def add_input( + self, obj, is_host=True, io_type=LiteIOType.LITE_IO_VALUE, layout=None + ): + if isinstance(obj, LiteIO): + self.inputs.append(obj) + else: + name = obj + self.add_input(LiteIO(name, is_host, io_type, layout)) - def add_input(self, input_io): - assert isinstance(input_io, LiteIO) - self.inputs.append(input_io) - - def add_output(self, output_io): - assert isinstance(output_io, LiteIO) - self.outputs.append(output_io) + def add_output( + self, obj, is_host=True, io_type=LiteIOType.LITE_IO_VALUE, layout=None + ): + if isinstance(obj, LiteIO): + self.outputs.append(obj) + else: + name = obj + self.add_output(LiteIO(name, is_host, io_type, layout)) def _create_network_io(self): network_io = _LiteNetworkIO() diff --git a/lite/pylite/megenginelite/tensor.py b/lite/pylite/megenginelite/tensor.py index ef86154f..188a486e 100644 --- a/lite/pylite/megenginelite/tensor.py +++ b/lite/pylite/megenginelite/tensor.py @@ -48,6 +48,15 @@ ctype_to_lite_dtypes = { c_ushort: LiteDataType.LITE_UINT16, } +_lite_dtypes_to_ctype = { + LiteDataType.LITE_INT: c_int, + LiteDataType.LITE_FLOAT: c_float, + LiteDataType.LITE_UINT8: c_ubyte, + LiteDataType.LITE_INT8: c_byte, + LiteDataType.LITE_INT16: c_short, + LiteDataType.LITE_UINT16: c_ushort, +} + class LiteLayout(Structure): """ @@ -55,7 +64,7 @@ class LiteLayout(Structure): """ _fields_ = [ - ("shapes", c_size_t * MAX_DIM), + ("_shapes", c_size_t * MAX_DIM), ("ndim", c_size_t), ("data_type", c_int), ] @@ -64,10 +73,10 @@ class LiteLayout(Structure): if shape: shape = list(shape) assert len(shape) <= MAX_DIM, "Layout max dim is 7." - self.shapes = (c_size_t * MAX_DIM)(*shape) + self._shapes = (c_size_t * MAX_DIM)(*shape) self.ndim = len(shape) else: - self.shapes = (c_size_t * MAX_DIM)() + self._shapes = (c_size_t * MAX_DIM)() self.ndim = 0 if not dtype: self.data_type = LiteDataType.LITE_FLOAT @@ -83,9 +92,24 @@ class LiteLayout(Structure): else: raise RuntimeError("unkonw data type") + @property + def dtype(self): + return _lite_type_to_nptypes[LiteDataType(self.data_type)] + + @property + def shapes(self): + return list(self._shapes)[0 : self.ndim] + + @shapes.setter + def shapes(self, shape): + shape = list(shape) + assert len(shape) <= MAX_DIM, "Layout max dim is 7." 
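LiteNetworkIO above can now be fully described at construction time, and add_input/add_output accept either a ready LiteIO or just a name plus the usual options. The three styles below are equivalent, under the same assumption of a locally built megenginelite:

from megenginelite import LiteIO, LiteIOType, LiteLayout, LiteNetworkIO

# explicit LiteIO objects
data_io = LiteIO("data", is_host=False)
out_io = LiteIO("out", is_host=True, layout=LiteLayout([1, 1000]))
io1 = LiteNetworkIO([data_io], [out_io])

# plain lists, each expanded as LiteIO(*item)
io2 = LiteNetworkIO(inputs=[["data", False]],
                    outputs=[["out", True, LiteIOType.LITE_IO_VALUE]])

# incremental, by name
io3 = LiteNetworkIO()
io3.add_input("data", is_host=False)
io3.add_output("out")

assert len(io1.inputs) == len(io2.inputs) == len(io3.inputs) == 1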
+ self._shapes = (c_size_t * MAX_DIM)(*shape) + self.ndim = len(shape) + def __repr__(self): data = { - "shapes": list(self.shapes)[0 : self.ndim], + "shapes": self.shapes, "ndim": self.ndim, "data_type": _lite_type_to_nptypes[LiteDataType(self.data_type)], } @@ -177,15 +201,20 @@ class LiteTensor(object): device_type=LiteDeviceType.LITE_CPU, device_id=0, is_pinned_host=False, + shapes=None, + dtype=None, ): """ - create a Tensor with layout, device, is_pinned_host param + create a Tensor with layout, device, is_pinned_host or shapes, dtype, + device_type, device_id, is_pinned_host param """ self._tensor = _Ctensor() - if layout: + self._layout = LiteLayout() + if layout is not None: self._layout = layout - else: - self._layout = LiteLayout() + elif shapes is not None: + shapes = list(shapes) + self._layout = LiteLayout(shapes, dtype) self._device_type = device_type self._device_id = device_id self._is_pinned_host = is_pinned_host @@ -222,9 +251,12 @@ class LiteTensor(object): @layout.setter def layout(self, layout): - assert isinstance(layout, LiteLayout) - self._layout = layout - self._api.LITE_set_tensor_layout(self._tensor, layout) + if isinstance(layout, LiteLayout): + self._layout = layout + elif isinstance(layout, list): + self._layout.shapes = layout + + self._api.LITE_set_tensor_layout(self._tensor, self._layout) @property def is_pinned_host(self): @@ -270,7 +302,6 @@ class LiteTensor(object): """ get the length of the meomry in byte """ - self.update() length = c_size_t() self._api.LITE_get_tensor_total_size_in_byte(self._tensor, byref(length)) return length.value @@ -336,7 +367,6 @@ class LiteTensor(object): """ get the memory of the tensor, return c_void_p of the tensor memory """ - self.update() mem = c_void_p() self._api.LITE_get_tensor_memory(self._tensor, byref(mem)) return mem @@ -347,7 +377,6 @@ class LiteTensor(object): param data: the data will shared to the tensor, it should be a numpy.ndarray or ctypes data """ - self.update() if isinstance(data, np.ndarray): assert ( self.is_continue @@ -356,8 +385,7 @@ class LiteTensor(object): self.is_pinned_host or self.device_type == LiteDeviceType.LITE_CPU ), "set_data_by_share can only apply in cpu tensor or pinned tensor." - np_type = _lite_type_to_nptypes[LiteDataType(self._layout.data_type)] - c_type = np.ctypeslib.as_ctypes_type(np_type) + c_type = _lite_dtypes_to_ctype[LiteDataType(self._layout.data_type)] if self.nbytes != data.nbytes: self.layout = LiteLayout(data.shape, ctype_to_lite_dtypes[c_type]) @@ -377,7 +405,6 @@ class LiteTensor(object): param data: the data to copy to tensor, it should be list, numpy.ndarraya or ctypes with length """ - self.update() if layout is not None: self.layout = layout @@ -386,8 +413,7 @@ class LiteTensor(object): self.is_pinned_host or self.device_type == LiteDeviceType.LITE_CPU ), "set_data_by_copy can only apply in cpu tensor or pinned tensor." - np_type = _lite_type_to_nptypes[LiteDataType(self._layout.data_type)] - c_type = np.ctypeslib.as_ctypes_type(np_type) + c_type = _lite_dtypes_to_ctype[LiteDataType(self._layout.data_type)] tensor_memory = c_void_p() @@ -415,6 +441,22 @@ class LiteTensor(object): self._api.LITE_get_tensor_memory(self._tensor, byref(tensor_memory)) memmove(tensor_memory, data, data_length) + def get_data_by_share(self): + """ + get the data in the tensor, add share the data with a new numpy, and + return the numpy arrray, be careful, the data in numpy is valid before + the tensor memory is write again, such as LiteNetwok forward next time. 
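The shapes and dtype properties above make LiteLayout readable and resizable with plain lists, LiteTensor can be built straight from shapes plus a numpy dtype, and assigning a list to tensor.layout reshapes in place. A minimal sketch, again assuming a locally built megenginelite:

import numpy as np
from megenginelite import LiteDataType, LiteLayout, LiteTensor

layout = LiteLayout([2, 16], "int8")
assert layout.shapes == [2, 16]          # trimmed to ndim entries
assert layout.dtype == np.int8           # numpy view of data_type

layout.shapes = [4, 8, 8]                # setter also updates ndim
assert layout.ndim == 3

tensor = LiteTensor(shapes=[1, 3, 224], dtype=np.int8)   # no LiteLayout needed
assert tensor.layout.data_type == LiteDataType.LITE_INT8

tensor.layout = [8, 14]                  # a list reshapes, dtype is kept
assert tensor.layout.shapes == [8, 14]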
+ """ + assert self.is_continue, "get_data_by_share can only apply in continue tensor." + assert ( + self.is_pinned_host or self.device_type == LiteDeviceType.LITE_CPU + ), "get_data_by_share can only apply in CPU tensor or cpu pinned tensor." + + memory = self.get_ctypes_memory() + c_type = _lite_dtypes_to_ctype[LiteDataType(self._layout.data_type)] + pnt = cast(memory, POINTER(c_type)) + return np.ctypeslib.as_array(pnt, self._layout.shapes) + def to_numpy(self): """ get the buffer of the tensor @@ -475,3 +517,13 @@ def LiteTensorConcat( ) result_tensor.update() return result_tensor + + +def lite_dtype_2_numpy(dtype): + """ + convert lite dtype to corresponding numpy dtype + """ + assert isinstance( + dtype, LiteDataType + ), "input must be LiteDataType when using lite_dtype_2_numpy." + return _lite_type_to_nptypes[dtype] diff --git a/lite/pylite/test/test_network.py b/lite/pylite/test/test_network.py index 6bb8c979..70d4aecf 100644 --- a/lite/pylite/test/test_network.py +++ b/lite/pylite/test/test_network.py @@ -21,6 +21,12 @@ def test_version(): print("Lite verson: {}".format(version)) +def test_config(): + config = LiteConfig() + config.bare_model_cryption_name = "nothing" + print(config) + + def test_network_io(): input_io1 = LiteIO("data1", is_host=False, io_type=LiteIOType.LITE_IO_VALUE) input_io2 = LiteIO( @@ -32,6 +38,7 @@ def test_network_io(): io = LiteNetworkIO() io.add_input(input_io1) io.add_input(input_io2) + io.add_input("data3", False) output_io1 = LiteIO("out1", is_host=False) output_io2 = LiteIO("out2", is_host=True, layout=LiteLayout([1, 1000])) @@ -39,7 +46,7 @@ def test_network_io(): io.add_output(output_io1) io.add_output(output_io2) - assert len(io.inputs) == 2 + assert len(io.inputs) == 3 assert len(io.outputs) == 2 assert io.inputs[0] == input_io1 @@ -47,9 +54,25 @@ def test_network_io(): c_io = io._create_network_io() - assert c_io.input_size == 2 + assert c_io.input_size == 3 assert c_io.output_size == 2 + ins = [["data1", True], ["data2", False, LiteIOType.LITE_IO_SHAPE]] + outs = [["out1", True], ["out2", False, LiteIOType.LITE_IO_VALUE]] + + io2 = LiteNetworkIO(ins, outs) + assert len(io2.inputs) == 2 + assert len(io2.outputs) == 2 + + io3 = LiteNetworkIO([input_io1, input_io2], [output_io1, output_io2]) + assert len(io3.inputs) == 2 + assert len(io3.outputs) == 2 + + test_io = LiteIO("test") + assert test_io.name == "test" + test_io.name = "test2" + assert test_io.name == "test2" + class TestShuffleNet(unittest.TestCase): source_dir = os.getenv("LITE_TEST_RESOURCE") @@ -319,9 +342,9 @@ class TestNetwork(TestShuffleNet): data = ios[key].to_numpy().flatten() input_data = self.input_data.flatten() assert data.size == input_data.size - assert io.name.decode("utf-8") == "data" + assert io.name == "data" for i in range(data.size): - assert data[i] == input_data[i] + assert abs(data[i] - input_data[i]) < 1e-5 return 0 network.set_start_callback(start_callback) @@ -343,7 +366,7 @@ class TestNetwork(TestShuffleNet): output_data = self.correct_data.flatten() assert data.size == output_data.size for i in range(data.size): - assert data[i] == output_data[i] + assert abs(data[i] - output_data[i]) < 1e-5 return 0 network.set_finish_callback(finish_callback) @@ -404,3 +427,27 @@ class TestNetwork(TestShuffleNet): binary_equal_between_batch=True, ) self.do_forward(network) + + def test_device_tensor_no_copy(self): + # construct LiteOption + net_config = LiteConfig() + net_config.options.force_output_use_user_specified_memory = True + + network = 
LiteNetwork(config=net_config) + network.load(self.model_path) + + input_tensor = network.get_io_tensor("data") + # fill input_data with device data + input_tensor.set_data_by_share(self.input_data) + + output_tensor = network.get_io_tensor(network.get_output_name(0)) + out_array = np.zeros(output_tensor.layout.shapes, output_tensor.layout.dtype) + + output_tensor.set_data_by_share(out_array) + + # inference + for i in range(2): + network.forward() + network.wait() + + self.check_correct(out_array) diff --git a/lite/pylite/test/test_tensor.py b/lite/pylite/test/test_tensor.py index 6232d7f8..86af29c3 100644 --- a/lite/pylite/test/test_tensor.py +++ b/lite/pylite/test/test_tensor.py @@ -54,6 +54,16 @@ def test_tensor_make(): tensor = LiteTensor(layout, device_id=1) assert tensor.device_id == 1 + tensor.layout = [8, 14] + assert tensor.layout.shapes[0] == 8 + assert tensor.layout.shapes[1] == 14 + assert tensor.layout.data_type == LiteDataType.LITE_FLOAT + + tensor_new = LiteTensor(shapes=[1, 3, 224], dtype=np.int8) + assert tensor_new.layout.shapes[1] == 3 + assert tensor_new.layout.shapes[2] == 224 + assert tensor_new.layout.data_type == LiteDataType.LITE_INT8 + def test_tensor_set_data(): layout = LiteLayout([2, 16], "int8") @@ -292,3 +302,24 @@ def test_tensor_concat(): for i in range(128): index = j * 128 + i assert real_data[index // 32][index % 32] == j + + +def test_tensor_get_memory_by_share(): + layout = LiteLayout([4, 32], "int16") + tensor = LiteTensor(layout) + assert tensor.nbytes == 4 * 32 * 2 + + arr = np.ones([4, 32], "int16") + for i in range(128): + arr[i // 32][i % 32] = i + tensor.set_data_by_copy(arr) + test_data = tensor.get_data_by_share() + real_data = tensor.to_numpy() + for i in range(128): + assert real_data[i // 32][i % 32] == test_data[i // 32][i % 32] + + arr[1][18] = 5 + arr[3][7] = 345 + tensor.set_data_by_copy(arr) + assert test_data[1][18] == 5 + assert test_data[3][7] == 345 diff --git a/lite/test/CMakeLists.txt b/lite/test/CMakeLists.txt index 1bf6c836..dce10aaf 100644 --- a/lite/test/CMakeLists.txt +++ b/lite/test/CMakeLists.txt @@ -1,27 +1,28 @@ -if (MGE_WITH_TEST) - file (GLOB_RECURSE SOURCES ./*.cpp main.cpp) - add_executable (lite_test ${SOURCES}) +if(MGE_WITH_TEST) + file(GLOB_RECURSE SOURCES ./*.cpp main.cpp) + add_executable(lite_test ${SOURCES}) - target_link_libraries(lite_test gtest) - target_link_libraries(lite_test lite_static) - if(LITE_BUILD_WITH_MGE) - # lite_test will depends megbrain interface - target_link_libraries(lite_test megbrain) - if (MGE_WITH_ROCM) - # FIXME: hip obj can not find cpp obj only through lite_static - target_link_libraries(lite_test megdnn) - endif () + target_link_libraries(lite_test gtest) + target_link_libraries(lite_test lite_static) + if(LITE_BUILD_WITH_MGE) + # lite_test will depends megbrain interface + target_link_libraries(lite_test megbrain) + if(MGE_WITH_ROCM) + # FIXME: hip obj can not find cpp obj only through lite_static + target_link_libraries(lite_test megdnn) endif() + endif() - if(UNIX) - if(APPLE OR ANDROID) - target_link_libraries(lite_test dl) - else() - target_link_libraries(lite_test dl rt) - endif() + if(UNIX) + if(APPLE OR ANDROID) + target_link_libraries(lite_test dl) + else() + target_link_libraries(lite_test dl rt) endif() + endif() - install (TARGETS lite_test - EXPORT ${LITE_EXPORT_TARGETS} - RUNTIME DESTINATION lite/bin) + install( + TARGETS lite_test + EXPORT ${LITE_EXPORT_TARGETS} + RUNTIME DESTINATION lite/bin) endif() diff --git a/scripts/whl/macos/macos_build_whl.sh 
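Two related additions above: get_data_by_share returns a numpy array that aliases the tensor's CPU memory (valid only until the network or a later copy writes that memory again), and force_output_use_user_specified_memory together with set_data_by_share lets inference write its output directly into a caller-provided array, as test_device_tensor_no_copy does. A condensed sketch; the model path and the "data" input name are placeholders borrowed from the shufflenet test, and lite_dtype_2_numpy is the new helper from tensor.py:

import numpy as np
from megenginelite import (LiteConfig, LiteDataType, LiteLayout, LiteNetwork,
                           LiteTensor, lite_dtype_2_numpy)

assert lite_dtype_2_numpy(LiteDataType.LITE_FLOAT) == np.float32

# zero-copy read: the returned array shares the tensor buffer
t = LiteTensor(LiteLayout([4, 32], "int16"))
t.set_data_by_copy(np.arange(128, dtype="int16").reshape(4, 32))
view = t.get_data_by_share()
assert view[3][31] == 127
t.set_data_by_copy(np.zeros((4, 32), dtype="int16"))
assert view[3][31] == 0                  # the view observes the later write

# zero-copy output inference (mirrors test_device_tensor_no_copy)
model_path = "path/to/model.mge"         # placeholder, supply your own model
config = LiteConfig()
config.options.force_output_use_user_specified_memory = True
net = LiteNetwork(config=config)
net.load(model_path)

inp = net.get_io_tensor("data")          # input name of the test model
inp.set_data_by_share(np.random.rand(*inp.layout.shapes).astype(inp.layout.dtype))

out = net.get_io_tensor(net.get_output_name(0))
out_array = np.zeros(out.layout.shapes, out.layout.dtype)
out.set_data_by_share(out_array)         # the network writes into out_array

net.forward()
net.wait()
# out_array now holds the result without an extra device-to-host copy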
b/scripts/whl/macos/macos_build_whl.sh index c411c67d..c20559a7 100755 --- a/scripts/whl/macos/macos_build_whl.sh +++ b/scripts/whl/macos/macos_build_whl.sh @@ -171,6 +171,7 @@ function do_build() { mkdir -p staging cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/ + cp -a ${SRC_DIR}/src/custom/include staging/megengine/core/include/ cd ${BUILD_DIR}/staging/megengine/core rt_file=`ls _imperative_rt.*.so` echo "rt file is: ${rt_file}" diff --git a/scripts/whl/manylinux2014/do_build_common.sh b/scripts/whl/manylinux2014/do_build_common.sh index 2df0dbff..0f149724 100755 --- a/scripts/whl/manylinux2014/do_build_common.sh +++ b/scripts/whl/manylinux2014/do_build_common.sh @@ -151,6 +151,7 @@ do rm -rf staging mkdir -p staging cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/ + cp -a ${SRC_DIR}/src/custom/include/megbrain staging/megengine/core/include cd ${BUILD_DIR}/staging/megengine/core mkdir -p lib/ucx diff --git a/scripts/whl/windows/windows_build_whl.sh b/scripts/whl/windows/windows_build_whl.sh index b3824fbc..d33cb5c5 100755 --- a/scripts/whl/windows/windows_build_whl.sh +++ b/scripts/whl/windows/windows_build_whl.sh @@ -77,11 +77,13 @@ CUBLAS_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/cublas6 CURAND_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/curand64_10.dll" CUBLASLT_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/cublasLt64_10.dll" CUDART_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/cudart64_101.dll" -MGE_EXPORT_LIB="${SRC_DIR}/build_dir/host/build/src/megengine_shared.dll" +MGE_EXPORT_DLL="${SRC_DIR}/build_dir/host/build/src/megengine_shared.dll" +MGE_EXPORT_LIB="${SRC_DIR}/build_dir/host/build/src/megengine_shared.lib" function depend_real_copy() { REAL_DST=$1 echo "real copy lib to $1" + cp "${MGE_EXPORT_DLL}" ${REAL_DST} cp "${MGE_EXPORT_LIB}" ${REAL_DST} if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then @@ -190,6 +192,7 @@ function do_build() { rm -rf staging mkdir -p staging cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/ + cp -a ${SRC_DIR}/src/custom/include/megbrain staging/megengine/core/include/ cd ${BUILD_DIR}/staging/megengine/core rt_file=`ls _imperative_rt.*.pyd` echo "rt file is: ${rt_file}" diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 57664c74..807f44f1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,251 +1,288 @@ +# force define a SHARED target for whl, caused by when build for APPLE we will force set +# BUILD_SHARED_LIBS=OFF for xcode needed +set(MGE_SHARED_LIB megengine_shared) +set(MGE_SHARED_LIB + ${MGE_SHARED_LIB} + PARENT_SCOPE) + if(MGE_WITH_JIT_MLIR) - add_subdirectory(jit/include/megbrain/jit/mlir/ir) + add_subdirectory(jit/include/megbrain/jit/mlir/ir) endif() -file(GLOB_RECURSE SOURCES core/impl/*.cpp gopt/impl/*.cpp opr/impl/*.cpp opr/impl/nvof/*.cpp plugin/impl/*.cpp serialization/impl/*.cpp core/impl/*.inl gopt/impl/*.inl opr/impl/*.inl plugin/impl/*.inl serialization/impl/*.inl) - +file( + GLOB_RECURSE + SOURCES + core/impl/*.cpp + gopt/impl/*.cpp + opr/impl/*.cpp + opr/impl/nvof/*.cpp + plugin/impl/*.cpp + serialization/impl/*.cpp + core/impl/*.inl + gopt/impl/*.inl + opr/impl/*.inl + plugin/impl/*.inl + serialization/impl/*.inl) if(MGE_WITH_JIT) - file(GLOB_RECURSE SOURCES_ jit/impl/*.cpp jit/impl/*.inl) - if(MGE_WITH_JIT_MLIR) - file(GLOB_RECURSE MLIR_SOURCES_ jit/impl/mlir/ir/*.cpp 
jit/impl/mlir/*.cpp) - list(APPEND SOURCES_ ${MLIR_SOURCES_}) - endif() - list(APPEND SOURCES ${SOURCES_}) + file(GLOB_RECURSE SOURCES_ jit/impl/*.cpp jit/impl/*.inl) + if(MGE_WITH_JIT_MLIR) + file(GLOB_RECURSE MLIR_SOURCES_ jit/impl/mlir/ir/*.cpp jit/impl/mlir/*.cpp) + list(APPEND SOURCES_ ${MLIR_SOURCES_}) + endif() + list(APPEND SOURCES ${SOURCES_}) endif() if(MGE_WITH_DISTRIBUTED) - file(GLOB_RECURSE SOURCES_ opr-mm/impl/*.cpp opr-mm/impl/*.inl) - list(APPEND SOURCES ${SOURCES_}) - file(GLOB_RECURSE PROTO_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "../src/opr-mm/proto/*.proto") - PROTOBUF_GENERATE_CPP_WITH_ROOT(GRPC_SRCS GRPC_HDRS ${CMAKE_CURRENT_SOURCE_DIR} ${PROTO_FILES}) - add_custom_target(mgb_proto_target DEPENDS ${GRPC_SRCS} ${GRPC_HDRS} ${PROTOBUF_PROTOC_EXECUTABLE}) - list(APPEND SOURCES ${GRPC_SRCS}) + file(GLOB_RECURSE SOURCES_ opr-mm/impl/*.cpp opr-mm/impl/*.inl) + list(APPEND SOURCES ${SOURCES_}) + file( + GLOB_RECURSE PROTO_FILES + RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} + "../src/opr-mm/proto/*.proto") + protobuf_generate_cpp_with_root(GRPC_SRCS GRPC_HDRS ${CMAKE_CURRENT_SOURCE_DIR} + ${PROTO_FILES}) + add_custom_target(mgb_proto_target DEPENDS ${GRPC_SRCS} ${GRPC_HDRS} + ${PROTOBUF_PROTOC_EXECUTABLE}) + list(APPEND SOURCES ${GRPC_SRCS}) endif() -set(MGB_INC ${PROJECT_BINARY_DIR}/genfiles ${CMAKE_CURRENT_LIST_DIR}/core/include ${CMAKE_CURRENT_LIST_DIR}/gopt/include ${CMAKE_CURRENT_LIST_DIR}/opr/include ${CMAKE_CURRENT_LIST_DIR}/plugin/include ${CMAKE_CURRENT_LIST_DIR}/serialization/include) +set(MGB_INC + ${PROJECT_BINARY_DIR}/genfiles + ${CMAKE_CURRENT_LIST_DIR}/core/include + ${CMAKE_CURRENT_LIST_DIR}/gopt/include + ${CMAKE_CURRENT_LIST_DIR}/opr/include + ${CMAKE_CURRENT_LIST_DIR}/plugin/include + ${CMAKE_CURRENT_LIST_DIR}/serialization/include) if(MGE_WITH_JIT) - list(APPEND MGB_INC ${CMAKE_CURRENT_LIST_DIR}/jit/include) - if(MGE_WITH_CUDA) - list(APPEND MGB_INC ${CMAKE_CURRENT_LIST_DIR}/jit/impl/cuda) - endif() + list(APPEND MGB_INC ${CMAKE_CURRENT_LIST_DIR}/jit/include) + if(MGE_WITH_CUDA) + list(APPEND MGB_INC ${CMAKE_CURRENT_LIST_DIR}/jit/impl/cuda) + endif() endif() if(MGE_WITH_DISTRIBUTED) - list(APPEND MGB_INC ${CMAKE_CURRENT_LIST_DIR}/opr-mm/include) + list(APPEND MGB_INC ${CMAKE_CURRENT_LIST_DIR}/opr-mm/include) endif() if(MGE_WITH_CUDA AND MGE_WITH_TRT) - list(APPEND MGB_INC ${CMAKE_CURRENT_LIST_DIR}/tensorrt/include) - file(GLOB_RECURSE SOURCES_ tensorrt/impl/*.cpp tensorrt/impl/*.inl) - list(APPEND SOURCES ${SOURCES_}) + list(APPEND MGB_INC ${CMAKE_CURRENT_LIST_DIR}/tensorrt/include) + file(GLOB_RECURSE SOURCES_ tensorrt/impl/*.cpp tensorrt/impl/*.inl) + list(APPEND SOURCES ${SOURCES_}) endif() if(MGE_WITH_CAMBRICON) - list(APPEND MGB_INC ${CMAKE_CURRENT_LIST_DIR}/cambricon/include) - file(GLOB_RECURSE SOURCES_ cambricon/impl/*.cpp cambricon/impl/*.inl) - list(APPEND SOURCES ${SOURCES_}) + list(APPEND MGB_INC ${CMAKE_CURRENT_LIST_DIR}/cambricon/include) + file(GLOB_RECURSE SOURCES_ cambricon/impl/*.cpp cambricon/impl/*.inl) + list(APPEND SOURCES ${SOURCES_}) endif() set(MGB_CAMBRICON ${MGE_WITH_CAMBRICON}) set(MGB_ATLAS ${MGE_WITH_ATLAS}) if(MGE_WITH_CUDA) - file(GLOB_RECURSE SOURCES_ opr/impl/standalone/*.cu) - list(APPEND SOURCES ${SOURCES_}) + file(GLOB_RECURSE SOURCES_ opr/impl/standalone/*.cu) + list(APPEND SOURCES ${SOURCES_}) endif() if(MGE_WITH_CUSTOM_OP) - list(APPEND MGB_INC ${CMAKE_CURRENT_LIST_DIR}/custom/include) - file(GLOB_RECURSE SOURCES_ custom/impl/*.cpp) - list(APPEND SOURCES ${SOURCES_}) + list(APPEND MGB_INC 
${CMAKE_CURRENT_LIST_DIR}/custom/include) + file(GLOB_RECURSE SOURCES_ custom/impl/*.cpp) + list(APPEND SOURCES ${SOURCES_}) endif() add_library(megbrain OBJECT ${SOURCES}) target_link_libraries(megbrain PUBLIC mgb_opr_param_defs) if(MGE_WITH_CUDA) - target_include_directories(megbrain PUBLIC ${TRT_INCLUDE_DIR}) - target_include_directories(megbrain PRIVATE ${CUDNN_INCLUDE_DIR}) - find_path(NVTX3_INCLUDE - NAMES nvToolsExtCudaRt.h - HINTS $ENV{CUDA_ROOT_DIR} $ENV{CUDA_PATH} $ENV{CUDA_BIN_PATH} - PATH_SUFFIXES include/nvtx3 - DOC "NVTX3_INCLUDE" ) - if(NVTX3_INCLUDE STREQUAL "NVTX3_INCLUDE-NOTFOUND") - message(FATAL_ERROR "Can not find NVTX3 INCLUDE, please export cuda sdk path to CUDA_ROOT_DIR or CUDA_PATH or CUDA_BIN_PATH") - endif() - target_include_directories(megbrain PRIVATE ${NVTX3_INCLUDE}) -endif() -target_include_directories(megbrain - PUBLIC $ - PRIVATE ${PROJECT_SOURCE_DIR}/third_party/midout/src -) - -foreach (INCPATH IN LISTS MGB_INC) - target_include_directories(megbrain - PUBLIC $ + target_include_directories(megbrain PUBLIC ${TRT_INCLUDE_DIR}) + target_include_directories(megbrain PRIVATE ${CUDNN_INCLUDE_DIR}) + find_path( + NVTX3_INCLUDE + NAMES nvToolsExtCudaRt.h + HINTS $ENV{CUDA_ROOT_DIR} $ENV{CUDA_PATH} $ENV{CUDA_BIN_PATH} + PATH_SUFFIXES include/nvtx3 + DOC "NVTX3_INCLUDE") + if(NVTX3_INCLUDE STREQUAL "NVTX3_INCLUDE-NOTFOUND") + message( + FATAL_ERROR + "Can not find NVTX3 INCLUDE, please export cuda sdk path to CUDA_ROOT_DIR or CUDA_PATH or CUDA_BIN_PATH" ) + endif() + target_include_directories(megbrain PRIVATE ${NVTX3_INCLUDE}) +endif() +target_include_directories( + megbrain + PUBLIC $ + PRIVATE ${PROJECT_SOURCE_DIR}/third_party/midout/src) + +foreach(INCPATH IN LISTS MGB_INC) + target_include_directories(megbrain PUBLIC $) endforeach() if(MGE_WITH_CUDA) - if(NOT WIN32 AND NOT MSVC) - target_compile_options(megbrain PRIVATE "$<$:-Xcompiler=-Wno-unused-parameter>" - "$<$>:-Wno-unused-parameter>") - endif() + if(NOT WIN32 AND NOT MSVC) + target_compile_options( + megbrain PRIVATE "$<$:-Xcompiler=-Wno-unused-parameter>" + "$<$>:-Wno-unused-parameter>") + endif() else() - target_compile_options(megbrain PRIVATE "-Wno-unused-parameter") + target_compile_options(megbrain PRIVATE "-Wno-unused-parameter") endif() if(CXX_SUPPORT_WCLASS_MEMACCESS) - if(MGE_WITH_CUDA) - target_compile_options(megbrain PRIVATE "$<$:-Xcompiler=-Wno-class-memaccess>" - "$<$>:-Wno-class-memaccess>") - else() - target_compile_options(megbrain PRIVATE "-Wno-class-memaccess") - endif() + if(MGE_WITH_CUDA) + target_compile_options( + megbrain PRIVATE "$<$:-Xcompiler=-Wno-class-memaccess>" + "$<$>:-Wno-class-memaccess>") + else() + target_compile_options(megbrain PRIVATE "-Wno-class-memaccess") + endif() endif() target_link_libraries(megbrain PUBLIC megdnn) if(MGE_WITH_DISTRIBUTED) - add_dependencies(megbrain mgb_proto_target) - target_link_libraries (megbrain PRIVATE libprotobuf libzmq) - set(CPPZMQ_INC ${PROJECT_SOURCE_DIR}/third_party/cppzmq) - # FIXME: add CMAKE_CURRENT_BINARY_DIR for including mm_handler.pb.h - target_include_directories(megbrain PRIVATE ${CPPZMQ_INC} ${CMAKE_CURRENT_BINARY_DIR}) - target_link_libraries (megbrain PRIVATE megray) + add_dependencies(megbrain mgb_proto_target) + target_link_libraries(megbrain PRIVATE libprotobuf libzmq) + set(CPPZMQ_INC ${PROJECT_SOURCE_DIR}/third_party/cppzmq) + # FIXME: add CMAKE_CURRENT_BINARY_DIR for including mm_handler.pb.h + target_include_directories(megbrain PRIVATE ${CPPZMQ_INC} ${CMAKE_CURRENT_BINARY_DIR}) + 
target_link_libraries(megbrain PRIVATE megray) endif() target_link_libraries(megbrain PUBLIC ${MGE_CAMBRICON_LIBS}) target_link_libraries(megbrain PUBLIC ${MGE_ATLAS_LIBS}) if(MGE_WITH_JIT AND MGE_WITH_HALIDE) - target_link_libraries(megbrain PRIVATE libhalide) - target_link_libraries(megbrain PRIVATE ${HALIDE_LLVM_LIBS}) + target_link_libraries(megbrain PRIVATE libhalide) + target_link_libraries(megbrain PRIVATE ${HALIDE_LLVM_LIBS}) endif() if(MGE_WITH_JIT_MLIR) - target_include_directories(megbrain PRIVATE ${MLIR_LLVM_INCLUDE_DIR}) - target_link_libraries(megbrain PRIVATE ${MLIR_LLVM_LIBS}) - add_dependencies(megbrain mgb_dialect) - target_include_directories(megbrain PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/jit/include) -endif() -if (MGB_WITH_FLATBUFFERS) - set (GEN_FLATBUFFERS_SCHEMA_PY ${PROJECT_SOURCE_DIR}/dnn/scripts/gen_flatbuffers_schema.py) - set (OPR_PARAM_DEFS_PY ${PROJECT_SOURCE_DIR}/dnn/scripts/opr_param_defs.py) - set (MGB_PARAM_DEFS_PY ${PROJECT_SOURCE_DIR}/tools/param_defs/mgb_opr_param_defs.py) - file (MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl) - add_custom_command( - OUTPUT + target_include_directories(megbrain PRIVATE ${MLIR_LLVM_INCLUDE_DIR}) + target_link_libraries(megbrain PRIVATE ${MLIR_LLVM_LIBS}) + add_dependencies(megbrain mgb_dialect) + target_include_directories(megbrain PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/jit/include) +endif() +if(MGB_WITH_FLATBUFFERS) + set(GEN_FLATBUFFERS_SCHEMA_PY + ${PROJECT_SOURCE_DIR}/dnn/scripts/gen_flatbuffers_schema.py) + set(OPR_PARAM_DEFS_PY ${PROJECT_SOURCE_DIR}/dnn/scripts/opr_param_defs.py) + set(MGB_PARAM_DEFS_PY ${PROJECT_SOURCE_DIR}/tools/param_defs/mgb_opr_param_defs.py) + file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl) + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl/opr_param_defs.fbs + COMMAND ${PYTHON_EXECUTABLE} ${GEN_FLATBUFFERS_SCHEMA_PY} ${OPR_PARAM_DEFS_PY} ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl/opr_param_defs.fbs - COMMAND - ${PYTHON_EXECUTABLE} ${GEN_FLATBUFFERS_SCHEMA_PY} ${OPR_PARAM_DEFS_PY} ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl/opr_param_defs.fbs - DEPENDS ${GEN_FLATBUFFERS_SCHEMA_PY} ${OPR_PARAM_DEFS_PY} - VERBATIM - ) - add_custom_command( - OUTPUT + DEPENDS ${GEN_FLATBUFFERS_SCHEMA_PY} ${OPR_PARAM_DEFS_PY} + VERBATIM) + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl/mgb_opr_param_defs.fbs + COMMAND ${PYTHON_EXECUTABLE} ${GEN_FLATBUFFERS_SCHEMA_PY} ${MGB_PARAM_DEFS_PY} ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl/mgb_opr_param_defs.fbs - COMMAND - ${PYTHON_EXECUTABLE} ${GEN_FLATBUFFERS_SCHEMA_PY} ${MGB_PARAM_DEFS_PY} ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl/mgb_opr_param_defs.fbs - DEPENDS ${GEN_FLATBUFFERS_SCHEMA_PY} ${MGB_PARAM_DEFS_PY} - VERBATIM - ) - list(APPEND FLATBUFFERS_SCHEMA_FILES - ${CMAKE_CURRENT_SOURCE_DIR}/serialization/impl/dtype.fbs - ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl/opr_param_defs.fbs - ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl/mgb_opr_param_defs.fbs - ${CMAKE_CURRENT_SOURCE_DIR}/opr/impl/mgb_cpp_opr.fbs - ${CMAKE_CURRENT_SOURCE_DIR}/serialization/impl/schema.fbs - ) - list(APPEND FLATBUFFERS_SCHEMA_INCLUDE_DIR - ${CMAKE_CURRENT_SOURCE_DIR}/serialization/impl - ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl - ${CMAKE_CURRENT_SOURCE_DIR}/opr/impl - ) - build_flatbuffers( - "${FLATBUFFERS_SCHEMA_FILES}" - "${FLATBUFFERS_SCHEMA_INCLUDE_DIR}" - mgb_serialization_schema_fbs - "${FLATBUFFERS_SCHEMA_FILES}" - 
"${CMAKE_CURRENT_BINARY_DIR}/serialization/include/megbrain/serialization/internal" - "" - "" - ) - add_dependencies(megbrain mgb_serialization_schema_fbs) - target_include_directories(megbrain PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/serialization/include) - target_compile_definitions(megbrain PUBLIC MGB_ENABLE_FBS_SERIALIZATION=1) - target_link_libraries(megbrain PUBLIC flatbuffers) - set (GENERATED_FLATBUFFERS_CONVERTER_PATH ${CMAKE_CURRENT_BINARY_DIR}/genfiles) - set (GEN_FLATBUFFERS_CONVERTER_PY ${PROJECT_SOURCE_DIR}/dnn/scripts/gen_flatbuffers_converter.py) - file (MAKE_DIRECTORY ${GENERATED_FLATBUFFERS_CONVERTER_PATH}) - add_custom_command( - OUTPUT + DEPENDS ${GEN_FLATBUFFERS_SCHEMA_PY} ${MGB_PARAM_DEFS_PY} + VERBATIM) + list( + APPEND + FLATBUFFERS_SCHEMA_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/serialization/impl/dtype.fbs + ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl/opr_param_defs.fbs + ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl/mgb_opr_param_defs.fbs + ${CMAKE_CURRENT_SOURCE_DIR}/opr/impl/mgb_cpp_opr.fbs + ${CMAKE_CURRENT_SOURCE_DIR}/serialization/impl/schema.fbs) + list( + APPEND FLATBUFFERS_SCHEMA_INCLUDE_DIR + ${CMAKE_CURRENT_SOURCE_DIR}/serialization/impl + ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl ${CMAKE_CURRENT_SOURCE_DIR}/opr/impl) + build_flatbuffers( + "${FLATBUFFERS_SCHEMA_FILES}" + "${FLATBUFFERS_SCHEMA_INCLUDE_DIR}" + mgb_serialization_schema_fbs + "${FLATBUFFERS_SCHEMA_FILES}" + "${CMAKE_CURRENT_BINARY_DIR}/serialization/include/megbrain/serialization/internal" + "" + "") + add_dependencies(megbrain mgb_serialization_schema_fbs) + target_include_directories(megbrain + PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/serialization/include) + target_compile_definitions(megbrain PUBLIC MGB_ENABLE_FBS_SERIALIZATION=1) + target_link_libraries(megbrain PUBLIC flatbuffers) + set(GENERATED_FLATBUFFERS_CONVERTER_PATH ${CMAKE_CURRENT_BINARY_DIR}/genfiles) + set(GEN_FLATBUFFERS_CONVERTER_PY + ${PROJECT_SOURCE_DIR}/dnn/scripts/gen_flatbuffers_converter.py) + file(MAKE_DIRECTORY ${GENERATED_FLATBUFFERS_CONVERTER_PATH}) + add_custom_command( + OUTPUT ${GENERATED_FLATBUFFERS_CONVERTER_PATH}/opr_param_defs_converter.inl + COMMAND ${PYTHON_EXECUTABLE} ${GEN_FLATBUFFERS_CONVERTER_PY} ${OPR_PARAM_DEFS_PY} ${GENERATED_FLATBUFFERS_CONVERTER_PATH}/opr_param_defs_converter.inl - COMMAND - ${PYTHON_EXECUTABLE} ${GEN_FLATBUFFERS_CONVERTER_PY} ${OPR_PARAM_DEFS_PY} ${GENERATED_FLATBUFFERS_CONVERTER_PATH}/opr_param_defs_converter.inl - DEPENDS ${GEN_FLATBUFFERS_CONVERTER_PY} ${OPR_PARAM_DEFS_PY} - VERBATIM - ) - add_custom_command( - OUTPUT + DEPENDS ${GEN_FLATBUFFERS_CONVERTER_PY} ${OPR_PARAM_DEFS_PY} + VERBATIM) + add_custom_command( + OUTPUT ${GENERATED_FLATBUFFERS_CONVERTER_PATH}/mgb_opr_param_defs_converter.inl + COMMAND ${PYTHON_EXECUTABLE} ${GEN_FLATBUFFERS_CONVERTER_PY} ${MGB_PARAM_DEFS_PY} ${GENERATED_FLATBUFFERS_CONVERTER_PATH}/mgb_opr_param_defs_converter.inl - COMMAND - ${PYTHON_EXECUTABLE} ${GEN_FLATBUFFERS_CONVERTER_PY} ${MGB_PARAM_DEFS_PY} ${GENERATED_FLATBUFFERS_CONVERTER_PATH}/mgb_opr_param_defs_converter.inl - DEPENDS ${GEN_FLATBUFFERS_CONVERTER_PY} ${MGB_PARAM_DEFS_PY} - VERBATIM - ) - target_sources(megbrain PRIVATE ${GENERATED_FLATBUFFERS_CONVERTER_PATH}/opr_param_defs_converter.inl) - target_sources(megbrain PRIVATE ${GENERATED_FLATBUFFERS_CONVERTER_PATH}/mgb_opr_param_defs_converter.inl) - target_include_directories(megbrain PRIVATE ${GENERATED_FLATBUFFERS_CONVERTER_PATH}) + DEPENDS ${GEN_FLATBUFFERS_CONVERTER_PY} ${MGB_PARAM_DEFS_PY} + VERBATIM) + target_sources( + megbrain + 
PRIVATE ${GENERATED_FLATBUFFERS_CONVERTER_PATH}/opr_param_defs_converter.inl) + target_sources( + megbrain + PRIVATE ${GENERATED_FLATBUFFERS_CONVERTER_PATH}/mgb_opr_param_defs_converter.inl) + target_include_directories(megbrain PRIVATE ${GENERATED_FLATBUFFERS_CONVERTER_PATH}) endif() -if(UNIX AND NOT ANDROID AND NOT APPLE) - target_link_libraries(megbrain PUBLIC dl rt atomic) +if(UNIX + AND NOT ANDROID + AND NOT APPLE) + target_link_libraries(megbrain PUBLIC dl rt atomic) endif() if(ANDROID) - target_link_libraries(megbrain PUBLIC log) + target_link_libraries(megbrain PUBLIC log) endif() -set (_VER_FILE ${PROJECT_SOURCE_DIR}/src/version.ld) +set(_VER_FILE ${PROJECT_SOURCE_DIR}/src/version.ld) # Build as SHARED or STATIC depending on BUILD_SHARED_LIBS=ON/OFF add_library(megengine) -# force define a SHARED target for whl, caused by when build for APPLE -# we will force set BUILD_SHARED_LIBS=OFF for xcode needed -add_library(megengine_shared SHARED) +add_library(${MGE_SHARED_LIB} SHARED) target_link_libraries(megengine PRIVATE ${MGE_CUDA_LIBS}) target_link_libraries(megengine PUBLIC megbrain megdnn) -target_link_libraries(megengine_shared PUBLIC megbrain megdnn) -target_link_libraries(megengine_shared PRIVATE ${MGE_CUDA_LIBS}) -if (UNIX AND NOT APPLE) - target_link_options(megengine PRIVATE -Wl,--no-undefined -Wl,--version-script=${_VER_FILE}) - set_target_properties(megengine PROPERTIES LINK_DEPENDS ${_VER_FILE}) - target_link_options(megengine_shared PRIVATE -Wl,--no-undefined -Wl,--version-script=${_VER_FILE}) - set_target_properties(megengine_shared PROPERTIES LINK_DEPENDS ${_VER_FILE}) +target_link_libraries(${MGE_SHARED_LIB} PUBLIC megbrain megdnn) +target_link_libraries(${MGE_SHARED_LIB} PRIVATE ${MGE_CUDA_LIBS}) +if(UNIX AND NOT APPLE) + target_link_options(megengine PRIVATE -Wl,--no-undefined + -Wl,--version-script=${_VER_FILE}) + set_target_properties(megengine PROPERTIES LINK_DEPENDS ${_VER_FILE}) + target_link_options(${MGE_SHARED_LIB} PRIVATE -Wl,--no-undefined + -Wl,--version-script=${_VER_FILE}) + set_target_properties(${MGE_SHARED_LIB} PROPERTIES LINK_DEPENDS ${_VER_FILE}) endif() if(WIN32 OR MSVC) - target_compile_definitions(megbrain PRIVATE MGE_DLL_EXPORT) - target_compile_definitions(megdnn PRIVATE MGE_DLL_EXPORT) - target_compile_definitions(megengine PRIVATE MGE_DLL_EXPORT) - target_compile_definitions(megengine_shared PRIVATE MGE_DLL_EXPORT) - # please do not use WINDOWS_EXPORT_ALL_SYMBOLS, as symbols max than 65535 when build with CUDA - #set_target_properties(megengine PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS TRUE) - #set_target_properties(megengine_shared PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS TRUE) -endif() -if (MGE_WITH_DISTRIBUTED) - message(VERBOSE "megengine configured to link megray") - target_link_libraries(megengine PUBLIC megray) - target_link_libraries(megengine_shared PUBLIC megray) -endif() -# Do not export targets if MGE_WITH_DISTRIBUTED is on. MegRay is not ready -# for this. 
-install(TARGETS megengine - EXPORT ${MGE_EXPORT_TARGETS} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) - -if (NOT MGE_WITH_DISTRIBUTED) - install(TARGETS megbrain - EXPORT ${MGE_EXPORT_TARGETS} - ) + target_compile_definitions(megbrain PRIVATE MGE_DLL_EXPORT) + target_compile_definitions(megdnn PRIVATE MGE_DLL_EXPORT) + target_compile_definitions(megengine PRIVATE MGE_DLL_EXPORT) + target_compile_definitions(${MGE_SHARED_LIB} PRIVATE MGE_DLL_EXPORT) + # please do not use WINDOWS_EXPORT_ALL_SYMBOLS, as symbols max than 65535 when build + # with CUDA set_target_properties(megengine PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS + # TRUE) set_target_properties(${MGE_SHARED_LIB} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS + # TRUE) +endif() +if(MGE_WITH_DISTRIBUTED) + message(VERBOSE "megengine configured to link megray") + target_link_libraries(megengine PUBLIC megray) + target_link_libraries(${MGE_SHARED_LIB} PUBLIC megray) +endif() +# Do not export targets if MGE_WITH_DISTRIBUTED is on. MegRay is not ready for this. +install( + TARGETS megengine + EXPORT ${MGE_EXPORT_TARGETS} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) + +if(NOT MGE_WITH_DISTRIBUTED) + install(TARGETS megbrain EXPORT ${MGE_EXPORT_TARGETS}) endif() foreach(_PATH ${MGB_INC}) - install(DIRECTORY ${_PATH}/megbrain DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} FILES_MATCHING PATTERN "*.h") + install( + DIRECTORY ${_PATH}/megbrain + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + FILES_MATCHING + PATTERN "*.h") endforeach() diff --git a/src/core/impl/graph/cg_impl_seq.cpp b/src/core/impl/graph/cg_impl_seq.cpp index 40a6ef01..4a1c731d 100644 --- a/src/core/impl/graph/cg_impl_seq.cpp +++ b/src/core/impl/graph/cg_impl_seq.cpp @@ -539,7 +539,8 @@ void ComputingGraphImpl::ComputingSequence::do_regist() const { auto& mc = mp.chunk(); if (mp.valid() && mc.mem_alloc_status.is_from_owner_var()) { auto size = mgb::get_aligned_power2( - mc.size(), j->comp_node().get_mem_addr_alignment()); + mp.layout().span().dist_byte(), + j->comp_node().get_mem_addr_alignment()); recorder.regist_memory_chunk( {chunk_id++, size, 0, this->m_opr_seq->size(), diff --git a/src/core/include/megbrain/comp_node.h b/src/core/include/megbrain/comp_node.h index 56a281f7..787054a9 100644 --- a/src/core/include/megbrain/comp_node.h +++ b/src/core/include/megbrain/comp_node.h @@ -577,7 +577,7 @@ protected: virtual size_t get_max_reserved_memory() { return 0; } virtual size_t get_max_used_memory() { return 0; } virtual size_t get_max_block_size_available() { return 0; } - virtual size_t get_free_mem() { return 0; } + virtual size_t get_free_mem() { return get_mem_status_bytes().second; } virtual void reset_max_reserved_memory() {} virtual void reset_max_used_memory() {} #endif diff --git a/src/core/include/megbrain/graph/operator_node.h b/src/core/include/megbrain/graph/operator_node.h index 67b9c81b..df410da8 100644 --- a/src/core/include/megbrain/graph/operator_node.h +++ b/src/core/include/megbrain/graph/operator_node.h @@ -1013,13 +1013,13 @@ using OprNodeArray = SmallVector; * * Note that opening brace is included */ -#define MGB_DEFINE_OPR_CLASS(_name, _base, ...) \ - MGB_DEFINE_CLS_WITH_SUPER(_name final, _base, ##__VA_ARGS__) \ - MGB_DYN_TYPE_OBJ_FINAL_DECL; +#define MGB_DEFINE_OPR_CLASS(_name, _base, ...) \ + MGB_DEFINE_CLS_WITH_SUPER(_name final, _base, ##__VA_ARGS__) \ + MGB_DYN_TYPE_OBJ_FINAL_DECL; -#define MGB_DEFINE_OPR_CLASS_WITH_EXPORT(_name, _base, ...) 
\ - MGB_DEFINE_CLS_WITH_SUPER(_name final, _base, ##__VA_ARGS__) \ - MGB_DYN_TYPE_OBJ_FINAL_DECL_WITH_EXPORT; +#define MGB_DEFINE_OPR_CLASS_WITH_EXPORT(_name, _base, ...) \ + MGB_DEFINE_CLS_WITH_SUPER(_name final, _base, ##__VA_ARGS__) \ + MGB_DYN_TYPE_OBJ_FINAL_DECL_WITH_EXPORT; } // namespace cg } // namespace mgb diff --git a/src/core/include/megbrain/ir/ops.td b/src/core/include/megbrain/ir/ops.td index 233c99f3..30c795ad 100644 --- a/src/core/include/megbrain/ir/ops.td +++ b/src/core/include/megbrain/ir/ops.td @@ -431,4 +431,21 @@ def Padding: MgbHashableOp<"Padding", [PaddingParam]>; def LRN: MgbHashableOp<"LRN", [LRNParam]>; +def LayerNorm: MgbHashableOp<"LayerNorm", [LayerNormParam]>; + +def Dropout: MgbHashableOp<"Dropout", [DropoutParam]> { + let extraArguments = (ins + MgbSizeTAddr:$handle + ); + let hashFunction = [{ + return mgb::hash_pair_combine( + mgb::hash($_self.dyn_typeinfo()), + mgb::hash_pair_combine( + mgb::hash($_self.drop_prob), + mgb::hash($_self.handle)) + ); + }]; + let cmpFunction = [{return $0.handle == $1.handle && $0.drop_prob == $1.drop_prob;}]; + +} #endif // MGB_OPS diff --git a/src/core/include/megbrain/utils/metahelper.h b/src/core/include/megbrain/utils/metahelper.h index ae566f02..9f886f1a 100644 --- a/src/core/include/megbrain/utils/metahelper.h +++ b/src/core/include/megbrain/utils/metahelper.h @@ -495,18 +495,18 @@ private: } // namespace mgb -#define _MGB_DEFINE_CLS_WITH_SUPER_IMPL(_tpl, _name, _base, ...) \ - class _name : public _base, ##__VA_ARGS__ { \ - public: \ - using Super = _tpl _base; \ - \ +#define MGB_DEFINE_CLS_WITH_SUPER_IMPL(_tpl, _name, _base, ...) \ + class _name : public _base, ##__VA_ARGS__ { \ + public: \ + using Super = _tpl _base; \ + \ private: /*! * \brief define a class which has Super defined to base */ #define MGB_DEFINE_CLS_WITH_SUPER(_name, _base, ...) \ - _MGB_DEFINE_CLS_WITH_SUPER_IMPL(, _name, _base, ##__VA_ARGS__) + MGB_DEFINE_CLS_WITH_SUPER_IMPL(, _name, _base, ##__VA_ARGS__) /*! * \brief define a class which has Super defined to base @@ -514,5 +514,5 @@ private: * Used when this class is a template and base class has template */ #define MGB_DEFINE_CLS_WITH_SUPER_TPL(_name, _base, ...) 
\ - _MGB_DEFINE_CLS_WITH_SUPER_IMPL(typename, _name, _base, ##__VA_ARGS__) + MGB_DEFINE_CLS_WITH_SUPER_IMPL(typename, _name, _base, ##__VA_ARGS__) // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/custom/impl/manager.cpp b/src/custom/impl/manager.cpp index 3de0986f..39419d7e 100644 --- a/src/custom/impl/manager.cpp +++ b/src/custom/impl/manager.cpp @@ -18,12 +18,31 @@ #ifndef _WIN32 #include +#else +#include #endif using namespace mgb; namespace custom { +#ifdef _WIN32 +#define RTLD_LAZY 0 + +void* dlopen(const char* file, int) { + return static_cast(LoadLibrary(file)); +} + +int dlclose(void* handle) { + return static_cast(FreeLibrary(static_cast(handle))); +} + +const char* dlerror(void) { + static char win_err_info[] = "no dlerror info in windows"; + return win_err_info; +} +#endif + CustomOpManager* CustomOpManager::inst(void) { static CustomOpManager op_manager; return &op_manager; @@ -127,7 +146,6 @@ std::vector CustomOpManager::op_id_list(void) { return ret; } -#ifndef _WIN32 CustomLib::CustomLib(const std::string& path, int mode = RTLD_LAZY) : m_handle(nullptr, [](void* handle) { dlclose(handle); }) { auto op_list_before_load = CustomOpManager::inst()->op_name_list(); @@ -146,12 +164,6 @@ CustomLib::CustomLib(const std::string& path, int mode = RTLD_LAZY) } } } -#else -CustomLib::CustomLib(const std::string& path, int mode = 0) - : m_handle(nullptr, [](void* handle) {}) { - mgb_assert(false, "custom op is only supported on Linux now"); -} -#endif const std::vector& CustomLib::ops_in_lib(void) const { return m_ops; diff --git a/src/custom/include/megbrain/custom/custom.h b/src/custom/include/megbrain/custom/custom.h index e6751f25..726076a7 100644 --- a/src/custom/include/megbrain/custom/custom.h +++ b/src/custom/include/megbrain/custom/custom.h @@ -16,7 +16,8 @@ #include "tensor.h" namespace custom { -std::shared_ptr op_insert(std::string opname, uint32_t version); +MGE_WIN_DECLSPEC_FUC std::shared_ptr op_insert( + std::string opname, uint32_t version); } #define CUSTOM_OP_REG(OpName) \ diff --git a/src/custom/include/megbrain/custom/op.h b/src/custom/include/megbrain/custom/op.h index 2646ce56..b1afc801 100644 --- a/src/custom/include/megbrain/custom/op.h +++ b/src/custom/include/megbrain/custom/op.h @@ -32,27 +32,26 @@ namespace custom { using RunTimeId = uint64_t; -class ArgInfo { +class MGE_WIN_DECLSPEC_FUC ArgInfo { CUSTOM_PIMPL_CLS_DECL(ArgInfo); - MGE_WIN_DECLSPEC_FUC ArgInfo( - const std::string& name, const std::string& desc, + ArgInfo(const std::string& name, const std::string& desc, const std::unordered_set& dtypes, const int& ndim, const std::string& mem_stgy); - MGE_WIN_DECLSPEC_FUC const std::string& name(void) const; - MGE_WIN_DECLSPEC_FUC const std::string& desc(void) const; - MGE_WIN_DECLSPEC_FUC const std::unordered_set& dtypes(void) const; - MGE_WIN_DECLSPEC_FUC int ndim(void) const; - MGE_WIN_DECLSPEC_FUC const std::string& mem_strategy(void) const; + const std::string& name(void) const; + const std::string& desc(void) const; + const std::unordered_set& dtypes(void) const; + int ndim(void) const; + const std::string& mem_strategy(void) const; - MGE_WIN_DECLSPEC_FUC std::string str() const; + std::string str() const; }; -class CustomOp { +class MGE_WIN_DECLSPEC_FUC CustomOp { std::unique_ptr m_impl; public: - MGE_WIN_DECLSPEC_FUC CustomOp(const std::string& op_type, uint32_t version); + CustomOp(const std::string& op_type, uint32_t version); PREVENT_COPY_AND_ASSIGN(CustomOp); using DeviceInferFuncPtr = @@ -71,70 +70,65 @@ public: 
void (*)(const std::vector&, const Param&, std::vector&); // write for forward - MGE_WIN_DECLSPEC_FUC CustomOp& set_device_infer(DeviceInferFuncPtr func); - MGE_WIN_DECLSPEC_FUC CustomOp& set_shape_infer(ShapeInferFuncPtr func); - MGE_WIN_DECLSPEC_FUC CustomOp& set_dtype_infer(DTypeInferFuncPtr func); - MGE_WIN_DECLSPEC_FUC CustomOp& set_format_infer(FormatInferFuncPtr func); - MGE_WIN_DECLSPEC_FUC CustomOp& set_preprocess(PreprocessFuncPtr func); - MGE_WIN_DECLSPEC_FUC CustomOp& set_preprocess( - const std::string& device, PreprocessFuncPtr func); - MGE_WIN_DECLSPEC_FUC CustomOp& set_postprocess(PostprocessFuncPtr func); - MGE_WIN_DECLSPEC_FUC CustomOp& set_postprocess( - const std::string& device, PostprocessFuncPtr func); - MGE_WIN_DECLSPEC_FUC CustomOp& set_compute(ComputeFuncPtr func); - MGE_WIN_DECLSPEC_FUC CustomOp& set_compute( - const std::string& device, ComputeFuncPtr func); - - MGE_WIN_DECLSPEC_FUC CustomOp& set_description(const std::string& op_desc); - MGE_WIN_DECLSPEC_FUC CustomOp& add_input( + CustomOp& set_device_infer(DeviceInferFuncPtr func); + CustomOp& set_shape_infer(ShapeInferFuncPtr func); + CustomOp& set_dtype_infer(DTypeInferFuncPtr func); + CustomOp& set_format_infer(FormatInferFuncPtr func); + CustomOp& set_preprocess(PreprocessFuncPtr func); + CustomOp& set_preprocess(const std::string& device, PreprocessFuncPtr func); + CustomOp& set_postprocess(PostprocessFuncPtr func); + CustomOp& set_postprocess(const std::string& device, PostprocessFuncPtr func); + CustomOp& set_compute(ComputeFuncPtr func); + CustomOp& set_compute(const std::string& device, ComputeFuncPtr func); + + CustomOp& set_description(const std::string& op_desc); + CustomOp& add_input( const std::string& name, const std::string& desc, const std::initializer_list& legal_dtypes = {"float32"}, int dims = -1, const std::string& mem_stgy = "default"); - MGE_WIN_DECLSPEC_FUC CustomOp& add_output( + CustomOp& add_output( const std::string& name, const std::string& desc, const std::initializer_list& legal_dtypes = {"float32"}, int dims = -1, const std::string& mem_stgy = "default"); - MGE_WIN_DECLSPEC_FUC CustomOp& add_input( + CustomOp& add_input( const std::string& name, const std::initializer_list& legal_dtypes = {"float32"}, int dims = -1, const std::string& mem_stgy = "default"); - MGE_WIN_DECLSPEC_FUC CustomOp& add_output( + CustomOp& add_output( const std::string& name, const std::initializer_list& legal_dtypes = {"float32"}, int dims = -1, const std::string& mem_stgy = "default"); - MGE_WIN_DECLSPEC_FUC CustomOp& add_inputs(const size_t& input_num); - MGE_WIN_DECLSPEC_FUC CustomOp& add_outputs(const size_t& output_num); - MGE_WIN_DECLSPEC_FUC CustomOp& add_param( - const std::string& name, const ParamVal& default_val); - MGE_WIN_DECLSPEC_FUC CustomOp& add_param( + CustomOp& add_inputs(const size_t& input_num); + CustomOp& add_outputs(const size_t& output_num); + CustomOp& add_param(const std::string& name, const ParamVal& default_val); + CustomOp& add_param( const std::string& name, const std::string& desc, const ParamVal& default_val); // read - MGE_WIN_DECLSPEC_FUC std::string op_type(void) const; - MGE_WIN_DECLSPEC_FUC std::string op_desc(void) const; - MGE_WIN_DECLSPEC_FUC RunTimeId runtime_id(void) const; - MGE_WIN_DECLSPEC_FUC size_t input_num(void) const; - MGE_WIN_DECLSPEC_FUC size_t output_num(void) const; - MGE_WIN_DECLSPEC_FUC std::string str(void) const; - - MGE_WIN_DECLSPEC_FUC const ParamInfo& param_info(void) const; - MGE_WIN_DECLSPEC_FUC ArgInfo input_info(size_t idx) const; - 
MGE_WIN_DECLSPEC_FUC ArgInfo output_info(size_t idx) const; - MGE_WIN_DECLSPEC_FUC const std::vector& inputs_info(void) const; - MGE_WIN_DECLSPEC_FUC const std::vector& outputs_info(void) const; + std::string op_type(void) const; + std::string op_desc(void) const; + RunTimeId runtime_id(void) const; + size_t input_num(void) const; + size_t output_num(void) const; + std::string str(void) const; + + const ParamInfo& param_info(void) const; + ArgInfo input_info(size_t idx) const; + ArgInfo output_info(size_t idx) const; + const std::vector& inputs_info(void) const; + const std::vector& outputs_info(void) const; // use - MGE_WIN_DECLSPEC_FUC std::vector infer_output_device( + std::vector infer_output_device( const std::vector&, const Param&) const; - MGE_WIN_DECLSPEC_FUC std::vector infer_output_shape( + std::vector infer_output_shape( const std::vector&, const Param&) const; - MGE_WIN_DECLSPEC_FUC std::vector infer_output_dtype( + std::vector infer_output_dtype( const std::vector&, const Param&) const; - MGE_WIN_DECLSPEC_FUC std::vector infer_output_format( + std::vector infer_output_format( const std::vector&, const Param&) const; - MGE_WIN_DECLSPEC_FUC void compute( - const std::vector&, const Param&, std::vector&) const; + void compute(const std::vector&, const Param&, std::vector&) const; }; } // namespace custom diff --git a/src/custom/include/megbrain/custom/param.h b/src/custom/include/megbrain/custom/param.h index d895d913..f90a2674 100644 --- a/src/custom/include/megbrain/custom/param.h +++ b/src/custom/include/megbrain/custom/param.h @@ -23,7 +23,7 @@ class ParamInfoImpl; class ParamImpl; // Schema of a param element -class ParamSchema { +class MGE_WIN_DECLSPEC_FUC ParamSchema { CUSTOM_PIMPL_CLS_DECL(ParamSchema); ParamSchema( const std::string& name, const ParamVal& value, @@ -36,7 +36,7 @@ class ParamSchema { std::string str(void) const; }; -class ParamInfo { +class MGE_WIN_DECLSPEC_FUC ParamInfo { CUSTOM_PIMPL_CLS_DECL(ParamInfo); void set_tag(const std::string&); @@ -46,16 +46,16 @@ class ParamInfo { const std::vector& meta(void) const; }; -class Param { +class MGE_WIN_DECLSPEC_FUC Param { CUSTOM_PIMPL_CLS_DECL(Param); - MGE_WIN_DECLSPEC_FUC Param(const ParamInfo&); - MGE_WIN_DECLSPEC_FUC ParamVal& operator[](const std::string&); - MGE_WIN_DECLSPEC_FUC const ParamVal& operator[](const std::string&) const; - MGE_WIN_DECLSPEC_FUC const std::unordered_map& raw() const; - MGE_WIN_DECLSPEC_FUC bool exist(const std::string& name) const; - MGE_WIN_DECLSPEC_FUC std::string to_bytes(void) const; - MGE_WIN_DECLSPEC_FUC void from_bytes(const std::string&); + Param(const ParamInfo&); + ParamVal& operator[](const std::string&); + const ParamVal& operator[](const std::string&) const; + const std::unordered_map& raw() const; + bool exist(const std::string& name) const; + std::string to_bytes(void) const; + void from_bytes(const std::string&); }; MGE_WIN_DECLSPEC_FUC bool operator==(const Param&, const Param&); diff --git a/src/custom/include/megbrain/custom/param_val.h b/src/custom/include/megbrain/custom/param_val.h index 31b2a4b6..d7f3b521 100644 --- a/src/custom/include/megbrain/custom/param_val.h +++ b/src/custom/include/megbrain/custom/param_val.h @@ -169,21 +169,21 @@ std::string vec2str(const std::vector& vec) { * Con1: user need to set the type explicitly when class template instantiation * Con2: ParamVal can not be assigned to ParamVal */ -class ParamVal { +class MGE_WIN_DECLSPEC_FUC ParamVal { std::unique_ptr m_ptr; ParamDynType m_type; public: template - MGE_WIN_DECLSPEC_FUC 
ParamVal(const T& val); + ParamVal(const T& val); template - MGE_WIN_DECLSPEC_FUC ParamVal(const std::initializer_list& val); + ParamVal(const std::initializer_list& val); - MGE_WIN_DECLSPEC_FUC ParamVal(); - MGE_WIN_DECLSPEC_FUC ParamVal(const char* str); - MGE_WIN_DECLSPEC_FUC ParamVal(const std::initializer_list& strs); - MGE_WIN_DECLSPEC_FUC ParamVal(const std::vector& strs); - MGE_WIN_DECLSPEC_FUC ParamVal(const ParamVal& rhs); + ParamVal(); + ParamVal(const char* str); + ParamVal(const std::initializer_list& strs); + ParamVal(const std::vector& strs); + ParamVal(const ParamVal& rhs); template ParamVal& operator=(const T& rhs); @@ -196,30 +196,39 @@ public: ParamVal& operator=(const ParamVal& rhs); template - MGE_WIN_DECLSPEC_FUC const T& as(void) const; + const T& as(void) const; template - MGE_WIN_DECLSPEC_FUC T& as(void); - - MGE_WIN_DECLSPEC_FUC const void* raw_ptr(void) const; - MGE_WIN_DECLSPEC_FUC void* raw_ptr(void); - MGE_WIN_DECLSPEC_FUC ParamDynType type(void) const; - MGE_WIN_DECLSPEC_FUC std::string str(void) const; - MGE_WIN_DECLSPEC_FUC size_t size(void) const; - - MGE_WIN_DECLSPEC_FUC static std::string to_bytes(const ParamVal& value); - MGE_WIN_DECLSPEC_FUC static ParamVal from_bytes( - const std::string& bytes, size_t& offset); - - friend ParamVal operator+(const ParamVal& lhs, const ParamVal& rhs); - friend ParamVal operator-(const ParamVal& lhs, const ParamVal& rhs); - friend ParamVal operator*(const ParamVal& lhs, const ParamVal& rhs); - friend ParamVal operator/(const ParamVal& lhs, const ParamVal& rhs); - friend bool operator==(const ParamVal& lhs, const ParamVal& rhs); - friend bool operator!=(const ParamVal& lhs, const ParamVal& rhs); - friend bool operator>(const ParamVal& lhs, const ParamVal& rhs); - friend bool operator<(const ParamVal& lhs, const ParamVal& rhs); - friend bool operator>=(const ParamVal& lhs, const ParamVal& rhs); - friend bool operator<=(const ParamVal& lhs, const ParamVal& rhs); + T& as(void); + + const void* raw_ptr(void) const; + void* raw_ptr(void); + ParamDynType type(void) const; + std::string str(void) const; + size_t size(void) const; + + static std::string to_bytes(const ParamVal& value); + static ParamVal from_bytes(const std::string& bytes, size_t& offset); + + MGE_WIN_DECLSPEC_FUC friend ParamVal operator+( + const ParamVal& lhs, const ParamVal& rhs); + MGE_WIN_DECLSPEC_FUC friend ParamVal operator-( + const ParamVal& lhs, const ParamVal& rhs); + MGE_WIN_DECLSPEC_FUC friend ParamVal operator*( + const ParamVal& lhs, const ParamVal& rhs); + MGE_WIN_DECLSPEC_FUC friend ParamVal operator/( + const ParamVal& lhs, const ParamVal& rhs); + MGE_WIN_DECLSPEC_FUC friend bool operator==( + const ParamVal& lhs, const ParamVal& rhs); + MGE_WIN_DECLSPEC_FUC friend bool operator!=( + const ParamVal& lhs, const ParamVal& rhs); + MGE_WIN_DECLSPEC_FUC friend bool operator>( + const ParamVal& lhs, const ParamVal& rhs); + MGE_WIN_DECLSPEC_FUC friend bool operator<( + const ParamVal& lhs, const ParamVal& rhs); + MGE_WIN_DECLSPEC_FUC friend bool operator>=( + const ParamVal& lhs, const ParamVal& rhs); + MGE_WIN_DECLSPEC_FUC friend bool operator<=( + const ParamVal& lhs, const ParamVal& rhs); }; ParamVal operator+(const ParamVal& lhs, const ParamVal& rhs); diff --git a/src/custom/include/megbrain/custom/tensor.h b/src/custom/include/megbrain/custom/tensor.h index a1dd9ba5..53c54dc8 100644 --- a/src/custom/include/megbrain/custom/tensor.h +++ b/src/custom/include/megbrain/custom/tensor.h @@ -30,9 +30,9 @@ namespace custom { #define 
CUSTOM_DEVICE_TYPE_ENUM_DECL(custom_type, builtin_type, builtin_str) \ custom_type, -class Device { - MGE_WIN_DECLSPEC_FUC const void* impl() const; - MGE_WIN_DECLSPEC_FUC Device(const void* impl); +class MGE_WIN_DECLSPEC_FUC Device { + const void* impl() const; + Device(const void* impl); CUSTOM_PIMPL_CLS_DECL(Device); public: @@ -40,19 +40,19 @@ public: CUSTOM_FOR_EACH_DEVICE_TYPE(CUSTOM_DEVICE_TYPE_ENUM_DECL) }; - MGE_WIN_DECLSPEC_FUC Device(const std::string& device); - MGE_WIN_DECLSPEC_FUC Device(const char* device); - MGE_WIN_DECLSPEC_FUC Device(DeviceEnum device); + Device(const std::string& device); + Device(const char* device); + Device(DeviceEnum device); - MGE_WIN_DECLSPEC_FUC std::string str(void) const; - MGE_WIN_DECLSPEC_FUC DeviceEnum enumv(void) const; + std::string str(void) const; + DeviceEnum enumv(void) const; - MGE_WIN_DECLSPEC_FUC static bool is_legal(const std::string& device); - MGE_WIN_DECLSPEC_FUC static bool is_legal(DeviceEnum device); - MGE_WIN_DECLSPEC_FUC static std::vector legal_devices(void); + static bool is_legal(const std::string& device); + static bool is_legal(DeviceEnum device); + static std::vector legal_devices(void); friend class Tensor; - friend bool operator==(const Device& lhs, const Device& rhs); + MGE_WIN_DECLSPEC_FUC friend bool operator==(const Device& lhs, const Device& rhs); CUSTOM_DATA_ADAPTOR_FRIEND_DECL; }; @@ -60,23 +60,23 @@ using DeviceEnum = Device::DeviceEnum; bool operator==(const Device& lhs, const Device& rhs); -class Shape { - MGE_WIN_DECLSPEC_FUC const void* impl() const; - MGE_WIN_DECLSPEC_FUC Shape(const void* impl); +class MGE_WIN_DECLSPEC_FUC Shape { + const void* impl() const; + Shape(const void* impl); CUSTOM_PIMPL_CLS_DECL(Shape); public: - MGE_WIN_DECLSPEC_FUC Shape(const std::vector& rhs); - MGE_WIN_DECLSPEC_FUC Shape(const std::initializer_list& rhs); + Shape(const std::vector& rhs); + Shape(const std::initializer_list& rhs); size_t& operator[](size_t idx); size_t operator[](size_t idx) const; - MGE_WIN_DECLSPEC_FUC void ndim(size_t dim); - MGE_WIN_DECLSPEC_FUC size_t ndim(void) const; + void ndim(size_t dim); + size_t ndim(void) const; friend class Tensor; - friend bool operator==(const Shape& lhs, const Shape& rhs); + MGE_WIN_DECLSPEC_FUC friend bool operator==(const Shape& lhs, const Shape& rhs); CUSTOM_DATA_ADAPTOR_FRIEND_DECL; }; @@ -104,9 +104,9 @@ using bfloat16_t = uint16_t; #define CUSTOM_DTYPE_ENUM_DECL(custom_type, builtin_type, ctype) custom_type, -class DType { - MGE_WIN_DECLSPEC_FUC const void* impl() const; - MGE_WIN_DECLSPEC_FUC DType(const void* impl); +class MGE_WIN_DECLSPEC_FUC DType { + const void* impl() const; + DType(const void* impl); CUSTOM_PIMPL_CLS_DECL(DType); public: @@ -114,27 +114,33 @@ public: CUSTOM_FOR_EACH_TENSOR_DATA_TYPE(CUSTOM_DTYPE_ENUM_DECL) }; - MGE_WIN_DECLSPEC_FUC DType(const std::string& dtype); - MGE_WIN_DECLSPEC_FUC DType(const char* dtype); - MGE_WIN_DECLSPEC_FUC DType( - const std::string& dtype, float scale, uint8_t zero_point = 0); - MGE_WIN_DECLSPEC_FUC DType(const char* dtype, float scale, uint8_t zero_point = 0); - MGE_WIN_DECLSPEC_FUC DType(DTypeEnum dtype); - MGE_WIN_DECLSPEC_FUC DType(DTypeEnum dtype, float scale, uint8_t zero_point = 0); - - MGE_WIN_DECLSPEC_FUC std::string str(void) const; - MGE_WIN_DECLSPEC_FUC DTypeEnum enumv() const; - MGE_WIN_DECLSPEC_FUC float scale(void) const; - MGE_WIN_DECLSPEC_FUC uint8_t zero_point(void) const; + DType(const std::string& dtype); + DType(const char* dtype); + DType(const std::string& dtype, float scale, uint8_t 
zero_point = 0); + DType(const char* dtype, float scale, uint8_t zero_point = 0); + DType(DTypeEnum dtype); + DType(DTypeEnum dtype, float scale, uint8_t zero_point = 0); + + std::string str(void) const; + DTypeEnum enumv() const; + float scale(void) const; + uint8_t zero_point(void) const; template - MGE_WIN_DECLSPEC_FUC bool is_compatible(void) const; + bool is_compatible(void) const; - MGE_WIN_DECLSPEC_FUC static bool is_legal(const std::string& dtype); - MGE_WIN_DECLSPEC_FUC static bool is_legal(const DTypeEnum& dtype); - MGE_WIN_DECLSPEC_FUC static std::vector legal_dtypes(void); + static bool is_legal(const std::string& dtype); + static bool is_legal(const DTypeEnum& dtype); + static std::vector legal_dtypes(void); friend class Tensor; - friend bool operator==(const DType& lhs, const DType& rhs); + MGE_WIN_DECLSPEC_FUC friend bool operator==(const DType& lhs, const DType& rhs); + MGE_WIN_DECLSPEC_FUC friend bool operator==( + const DType& lhs, const std::string& rhs); + MGE_WIN_DECLSPEC_FUC friend bool operator==(const DType& lhs, const char* rhs); + MGE_WIN_DECLSPEC_FUC friend bool operator==( + const std::string& lhs, const DType& rhs); + MGE_WIN_DECLSPEC_FUC friend bool operator==(const char* lhs, const DType& rhs); + CUSTOM_DATA_ADAPTOR_FRIEND_DECL; }; @@ -180,45 +186,45 @@ bool operator==(const DType& lhs, const char* rhs); bool operator==(const std::string& lhs, const DType& rhs); bool operator==(const char* lhs, const DType& rhs); -class Format { - MGE_WIN_DECLSPEC_FUC const void* impl() const; - MGE_WIN_DECLSPEC_FUC Format(const void* impl); +class MGE_WIN_DECLSPEC_FUC Format { + const void* impl() const; + Format(const void* impl); CUSTOM_PIMPL_CLS_DECL(Format); public: - MGE_WIN_DECLSPEC_FUC Format(const std::string& format); - MGE_WIN_DECLSPEC_FUC Format(const char* format); + Format(const std::string& format); + Format(const char* format); - MGE_WIN_DECLSPEC_FUC std::string str(void) const; - MGE_WIN_DECLSPEC_FUC bool is_default(void) const; + std::string str(void) const; + bool is_default(void) const; friend class Tensor; CUSTOM_DATA_ADAPTOR_FRIEND_DECL; }; -class Tensor { +class MGE_WIN_DECLSPEC_FUC Tensor { void* m_tensor; - MGE_WIN_DECLSPEC_FUC const void* impl(void) const; - MGE_WIN_DECLSPEC_FUC Tensor(const void* impl); + const void* impl(void) const; + Tensor(const void* impl); - MGE_WIN_DECLSPEC_FUC const size_t* shapes_raw(void) const; - MGE_WIN_DECLSPEC_FUC const ptrdiff_t* strides_raw(void) const; + const size_t* shapes_raw(void) const; + const ptrdiff_t* strides_raw(void) const; public: Tensor() = delete; - MGE_WIN_DECLSPEC_FUC Tensor(const Tensor& rhs); - MGE_WIN_DECLSPEC_FUC Tensor& operator=(const Tensor& rhs); - - MGE_WIN_DECLSPEC_FUC Shape shape(void) const; - MGE_WIN_DECLSPEC_FUC DType dtype(void) const; - MGE_WIN_DECLSPEC_FUC Format format(void) const; - MGE_WIN_DECLSPEC_FUC Device device(void) const; - - MGE_WIN_DECLSPEC_FUC size_t size(void) const; - MGE_WIN_DECLSPEC_FUC std::vector stride(void) const; - MGE_WIN_DECLSPEC_FUC float scale(void) const; - MGE_WIN_DECLSPEC_FUC uint8_t zero_point(void) const; + Tensor(const Tensor& rhs); + Tensor& operator=(const Tensor& rhs); + + Shape shape(void) const; + DType dtype(void) const; + Format format(void) const; + Device device(void) const; + + size_t size(void) const; + std::vector stride(void) const; + float scale(void) const; + uint8_t zero_point(void) const; void* data(void); const void* data(void) const; diff --git a/src/custom/include/megbrain/custom/utils.h b/src/custom/include/megbrain/custom/utils.h 
index 318bc62d..1bc64c6a 100644 --- a/src/custom/include/megbrain/custom/utils.h +++ b/src/custom/include/megbrain/custom/utils.h @@ -19,10 +19,19 @@ namespace custom { -void assert_failed_log( +#ifndef MGE_WIN_DECLSPEC_FUC +#ifdef _WIN32 +#define MGE_WIN_DECLSPEC_FUC __declspec(dllexport) +#else +#define MGE_WIN_DECLSPEC_FUC +#endif +#endif + +MGE_WIN_DECLSPEC_FUC void assert_failed_log( const char* file, int line, const char* func, const char* expr, const char* msg_fmt, ...); +#ifndef _WIN32 #define custom_expect(expr, msg...) \ if (!(expr)) { \ assert_failed_log(__FILE__, __LINE__, __PRETTY_FUNCTION__, #expr, ##msg); \ @@ -33,8 +42,22 @@ void assert_failed_log( assert_failed_log(__FILE__, __LINE__, __PRETTY_FUNCTION__, #expr, ##msg); \ } \ assert((expr)) +#else +#define custom_expect(expr, ...) \ + if (!(expr)) { \ + assert_failed_log( \ + __FILE__, __LINE__, __PRETTY_FUNCTION__, #expr, __VA_ARGS__); \ + } + +#define custom_assert(expr, ...) \ + if (!(expr)) { \ + assert_failed_log( \ + __FILE__, __LINE__, __PRETTY_FUNCTION__, #expr, __VA_ARGS__); \ + } \ + assert((expr)) +#endif -class UnImpleWarnLog { +class MGE_WIN_DECLSPEC_FUC UnImpleWarnLog { public: UnImpleWarnLog( const std::string& func, const std::string& attr, const std::string& val); @@ -54,9 +77,9 @@ void impl_deleter(void* ptr) { std::unique_ptr m_impl; \ \ public: \ - MGE_WIN_DECLSPEC_FUC Cls(); \ - MGE_WIN_DECLSPEC_FUC Cls(const Cls& rhs); \ - MGE_WIN_DECLSPEC_FUC Cls& operator=(const Cls& rhs) + Cls(); \ + Cls(const Cls& rhs); \ + Cls& operator=(const Cls& rhs) #define CUSTOM_PIMPL_CLS_DEFINE(Cls) \ Cls::Cls() : m_impl(new Cls##Impl(), impl_deleter) {} \ diff --git a/src/gopt/impl/global_layout_transform/opr_format_modifier.cpp b/src/gopt/impl/global_layout_transform/opr_format_modifier.cpp index d783a1f5..84d82e45 100644 --- a/src/gopt/impl/global_layout_transform/opr_format_modifier.cpp +++ b/src/gopt/impl/global_layout_transform/opr_format_modifier.cpp @@ -220,6 +220,28 @@ struct MultiAlgoOprTrait; ::megdnn::has_available_algo(megdnn_opr, args...), array_layouts); \ MIDOUT_E \ } \ + static bool has_no_naive_heuristic_algo( \ + const VarNodeArray& i, const cg::OperatorNodeBase* opr_) { \ + MIDOUT_B( \ + midout_iv(MGB_HASH_STR(#_Opr)), \ + midout_iv(MGB_HASH_STR("has_no_naive_heuristic_algo"))) \ + auto&& opr = opr_->cast_final_safe<_Opr>(); \ + auto&& megdnn_opr = reinterpret_cast(opr.megdnn_opr()); \ + FixedTensorLayouts array_layouts; \ + size_t in = i.size() - 1; \ + for (size_t idx = 0; idx < in; idx++) { \ + const auto& v = i[idx]; \ + array_layouts[idx] = \ + TensorLayout{v->shape(), v->dtype(), v->format()}; \ + } \ + const auto& v = i[in]; \ + array_layouts[arity - 1] = \ + TensorLayout{v->shape(), v->dtype(), v->format()}; \ + return APPLY( \ + ::megdnn::has_no_naive_heuristic_algo(megdnn_opr, args...), \ + array_layouts); \ + MIDOUT_E \ + } \ }; INST(Convolution) INST(ConvBiasForward) @@ -365,6 +387,23 @@ bool has_available_algo(const VarNodeArray& i, const cg::OperatorNodeBase* opr) #undef cb } +bool has_no_naive_heuristic_algo( + const VarNodeArray& i, const cg::OperatorNodeBase* opr) { +#define cb(_Opr) \ + if (opr->dyn_typeinfo() == _Opr::typeinfo()) { \ + MGB_MARK_USED_VAR(MultiAlgoOprTrait<_Opr>::has_algo); \ + VarNodeArray _ = i; \ + _.emplace_back(opr->output(0)); \ + return MultiAlgoOprTrait<_Opr>::has_no_naive_heuristic_algo(_, opr); \ + } else + cb(Convolution) cb(ConvBiasForward) cb(ConvolutionBackwardData) cb(PoolingForward) { + mgb_throw( + InternalError, "invalid multi-algo operator(got:%s)", + 
opr->dyn_typeinfo()->name); + } +#undef cb +} + bool has_opr_format(const cg::OperatorNodeBase* opr) { bool ret = false; #define cb(_Opr) ret |= opr->dyn_typeinfo() == _Opr::typeinfo(); diff --git a/src/gopt/impl/global_layout_transform/opr_format_modifier.h b/src/gopt/impl/global_layout_transform/opr_format_modifier.h index 2ab6697a..77b1d292 100644 --- a/src/gopt/impl/global_layout_transform/opr_format_modifier.h +++ b/src/gopt/impl/global_layout_transform/opr_format_modifier.h @@ -27,6 +27,9 @@ namespace intl { bool has_available_algo(const VarNodeArray& i, const cg::OperatorNodeBase* opr); +bool has_no_naive_heuristic_algo( + const VarNodeArray& i, const cg::OperatorNodeBase* opr); + struct OprFormatInfo { opr::Convolution::Param::Format opr_format; struct TensorFormatsInfo { diff --git a/src/gopt/impl/global_layout_transform/profiler_impl.cpp b/src/gopt/impl/global_layout_transform/profiler_impl.cpp index 48d7188a..6257a24a 100644 --- a/src/gopt/impl/global_layout_transform/profiler_impl.cpp +++ b/src/gopt/impl/global_layout_transform/profiler_impl.cpp @@ -99,7 +99,7 @@ float GraphPartitionProfiler::duration_in_usec() const { * \brief An operator that indicates its input var node is contiguous */ // clang-format off -MGB_DEFINE_OPR_CLASS(MarkInputContiguous, SingleCNOperatorNodeBase) //{ +MGB_DEFINE_OPR_CLASS(MarkInputContiguous, SingleCNOperatorNodeBase) // { void scn_do_execute() override {}; void init_output_static_infer_desc() override; void add_input_layout_constraint() override { @@ -331,7 +331,8 @@ float ProfilerImpl::profile_operator( opr::PoolingForward::typeinfo(), }; if (multi_algo_oprs.count(opr->dyn_typeinfo()) && - !mgb::gopt::intl::has_available_algo(new_inps, y->owner_opr())) + (!mgb::gopt::intl::has_available_algo(new_inps, y->owner_opr()) || + !mgb::gopt::intl::has_no_naive_heuristic_algo(new_inps, y->owner_opr()))) return PROFILE_TIME_OUT; if (!m_opr_filter(opr, y->owner_opr())) return PROFILE_TIME_OUT; diff --git a/src/jit/test/mlir/CMakeLists.txt b/src/jit/test/mlir/CMakeLists.txt index aad1717d..8d1eff47 100644 --- a/src/jit/test/mlir/CMakeLists.txt +++ b/src/jit/test/mlir/CMakeLists.txt @@ -1,27 +1,20 @@ configure_lit_site_cfg( - ${CMAKE_CURRENT_SOURCE_DIR}/utils/lit.site.cfg.py.in - ${CMAKE_CURRENT_BINARY_DIR}/utils/lit.site.cfg.py - MAIN_CONFIG - ${CMAKE_CURRENT_SOURCE_DIR}/utils/lit.cfg.py -) + ${CMAKE_CURRENT_SOURCE_DIR}/utils/lit.site.cfg.py.in + ${CMAKE_CURRENT_BINARY_DIR}/utils/lit.site.cfg.py MAIN_CONFIG + ${CMAKE_CURRENT_SOURCE_DIR}/utils/lit.cfg.py) -set(LLVM_EXTERNAL_LIT "${PROJECT_SOURCE_DIR}/third_party/llvm-project/llvm/utils/lit/lit.py" CACHE STRING "External lit") +set(LLVM_EXTERNAL_LIT + "${PROJECT_SOURCE_DIR}/third_party/llvm-project/llvm/utils/lit/lit.py" + CACHE STRING "External lit") -set(MLIR_MGB_TEST_DEPENDS - mgb-file-check - count not - mgb-opt -) +set(MLIR_MGB_TEST_DEPENDS mgb-file-check count not mgb-opt) add_lit_testsuite(mgb-mlir-test-lit "Running the mgb regression tests" - ${CMAKE_CURRENT_BINARY_DIR}/utils - DEPENDS ${MLIR_MGB_TEST_DEPENDS} - ) + ${CMAKE_CURRENT_BINARY_DIR}/utils DEPENDS ${MLIR_MGB_TEST_DEPENDS}) set_target_properties(mgb-mlir-test-lit PROPERTIES FOLDER "Tests") -add_lit_testsuites(MLIR_TEST ${CMAKE_CURRENT_SOURCE_DIR} - DEPENDS ${MLIR_MGB_TEST_DEPENDS} -) +add_lit_testsuites(MLIR_TEST ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS + ${MLIR_MGB_TEST_DEPENDS}) add_custom_target(mlir_pass_check) add_dependencies(mlir_pass_check mgb-mlir-test-lit) diff --git a/src/opr-mm/impl/mm_handler.cpp b/src/opr-mm/impl/mm_handler.cpp index 
1f13b697..7cbb8a3e 100644 --- a/src/opr-mm/impl/mm_handler.cpp +++ b/src/opr-mm/impl/mm_handler.cpp @@ -17,6 +17,9 @@ #include "megbrain/opr/zmq_rpc.h" #include "mm_handler.pb.h" +using namespace mgb; +using namespace opr; + /* ======================== GroupServerProxy ========================== */ /*! * A proxy that receives zmqrpc call, direct call to NCCL Manager @@ -213,7 +216,7 @@ struct ServerInfo { std::unique_ptr server; }; -int create_zmqrpc_server(const std::string& server_addr, int port) { +int mgb::opr::create_zmqrpc_server(const std::string& server_addr, int port) { static std::unordered_map addr2server; static std::mutex mtx; MGB_LOCK_GUARD(mtx); diff --git a/src/opr-mm/include/megbrain/opr/mm_handler.h b/src/opr-mm/include/megbrain/opr/mm_handler.h index 7c03bf96..97b829d4 100644 --- a/src/opr-mm/include/megbrain/opr/mm_handler.h +++ b/src/opr-mm/include/megbrain/opr/mm_handler.h @@ -16,8 +16,8 @@ #include "megbrain/opr/collective_comm.h" #include "megbrain/opr/group_manager.h" -using namespace mgb; -using namespace opr; +namespace mgb { +namespace opr { /*! * Comm MM Client Proxy. @@ -56,6 +56,9 @@ private: int create_zmqrpc_server(const std::string& server_addr, int port); +} // namespace opr +} // namespace mgb + #endif // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/opr/impl/dnn/dnn.sereg.h b/src/opr/impl/dnn/dnn.sereg.h index 4455bceb..ceb0c972 100644 --- a/src/opr/impl/dnn/dnn.sereg.h +++ b/src/opr/impl/dnn/dnn.sereg.h @@ -16,6 +16,7 @@ #include "megbrain/opr/dnn/correlation.h" #include "megbrain/opr/dnn/fake_quant.h" #include "megbrain/opr/dnn/images2neibs.h" +#include "megbrain/opr/dnn/layer_norm.h" #include "megbrain/opr/dnn/local.h" #include "megbrain/opr/dnn/lrn.h" #include "megbrain/opr/dnn/lsq.h" @@ -420,6 +421,47 @@ struct OprMaker { } }; +template <> +struct OprMaker { + using Param = opr::LayerNorm::Param; + static cg::OperatorNodeBase* make( + const Param& param, const cg::VarNodeArray& i, ComputingGraph& graph, + const OperatorNodeConfig& config) { + MGB_MARK_USED_VAR(graph); + if (i.size() == 3) { + return opr::LayerNorm::make(i[0], i[1], i[2], param, config)[0] + .node() + ->owner_opr(); + } else { + mgb_assert(i.size() == 1); + return opr::LayerNorm::make(i[0], param, config)[0].node()->owner_opr(); + } + } +}; + +// OprMaker in MGB_SEREG_OPR only support unique output opr +template <> +struct OprMaker { + using Param = opr::LayerNormBackward::Param; + static cg::OperatorNodeBase* make( + const Param& param, const cg::VarNodeArray& i, ComputingGraph& graph, + const OperatorNodeConfig& config) { + MGB_MARK_USED_VAR(graph); + if (i.size() == 5) { + return opr::LayerNormBackward::make( + i[0], i[1], i[2], i[3], i[4], param, config)[0] + .node() + ->owner_opr(); + } else { + mgb_assert(i.size() == 4); + return opr::LayerNormBackward::make( + i[0], i[1], i[2], i[3], param, config)[0] + .node() + ->owner_opr(); + } + } +}; + template struct MakeLocalShareCaller2 { template @@ -641,6 +683,8 @@ MGB_SEREG_OPR(TQT, 2); MGB_SEREG_OPR(TQTBackward, 3); MGB_SEREG_OPR(LSQ, 4); MGB_SEREG_OPR(LSQBackward, 5); +MGB_SEREG_OPR(LayerNorm, 0); +MGB_SEREG_OPR(LayerNormBackward, 0); } // namespace opr } // namespace mgb diff --git a/src/opr/impl/dnn/layer_norm.cpp b/src/opr/impl/dnn/layer_norm.cpp new file mode 100644 index 00000000..3506111a --- /dev/null +++ b/src/opr/impl/dnn/layer_norm.cpp @@ -0,0 +1,248 @@ +/** + * \file src/opr/impl/dnn/layer_norm.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * 
Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#include "megbrain/opr/dnn/layer_norm.h" + +#include "megbrain/graph/grad_impl.h" +#include "megbrain/opr/internal/out_shape_by_sym_var.h" +#include "megbrain/opr/utility.h" + +#include "../internal/megdnn_opr_wrapper.inl" + +using namespace mgb; +using namespace opr; + +/* ==================== LayerNormForward ==================== */ +MGB_DYN_TYPE_OBJ_FINAL_IMPL(LayerNormForward); + +LayerNormForward::LayerNormForward( + VarNode* data, VarNode* weight, VarNode* bias, const Param& param, + const OperatorNodeConfig& config) + : Super{data->owner_graph(), config, "layer_norm", {data, weight, bias}} { + init_megdnn_opr(*this, param); + + add_input({data, weight, bias}); + output(0)->dtype(data->dtype()); + output(1)->dtype(dtype::Float32()); + output(2)->dtype(dtype::Float32()); +} + +LayerNormForward::LayerNormForward( + VarNode* data, const Param& param, const OperatorNodeConfig& config) + : Super{data->owner_graph(), config, "layer_norm", {data}} { + init_megdnn_opr(*this, param); + + add_input({data}); + output(0)->dtype(data->dtype()); + output(1)->dtype(dtype::Float32()); + output(2)->dtype(dtype::Float32()); +} + +SymbolVarArray LayerNormForward::make( + SymbolVar data, SymbolVar weight, SymbolVar bias, const Param& param, + const OperatorNodeConfig& config) { + auto outs = data.node() + ->owner_graph() + ->insert_opr(std::make_unique( + data.node(), weight.node(), bias.node(), param, config)) + ->output(); + SymbolVarArray ret; + for (auto&& out : outs) { + ret.emplace_back(out); + } + return ret; +} + +SymbolVarArray LayerNormForward::make( + SymbolVar data, const Param& param, const OperatorNodeConfig& config) { + auto outs = data.node() + ->owner_graph() + ->insert_opr(std::make_unique( + data.node(), param, config)) + ->output(); + SymbolVarArray ret; + for (auto&& out : outs) { + ret.emplace_back(out); + } + return ret; +} + +void LayerNormForward::get_output_var_shape( + const TensorShapeArray& inp_shape, TensorShapeArray& out_shape) const { + uint64_t normalized_dim = param().normalized_dim; + out_shape[0] = inp_shape[0]; + TensorShape unnormalized_shape; + unnormalized_shape.ndim = inp_shape[0].ndim - normalized_dim; + for (size_t i = 0; i < unnormalized_shape.ndim; ++i) { + unnormalized_shape.shape[i] = inp_shape[0].shape[i]; + } + out_shape[1] = unnormalized_shape; + out_shape[2] = unnormalized_shape; +} + +size_t LayerNormForward::get_workspace_size_bytes( + const TensorShapeArray& input_shapes, + const TensorShapeArray& output_shapes) const { + return 0; +} + +void LayerNormForward::scn_do_execute() { + if (param().affine) { + megdnn_opr()->exec( + input(0)->dev_tensor().as_megdnn(), input(1)->dev_tensor().as_megdnn(), + input(2)->dev_tensor().as_megdnn(), output(0)->dev_tensor().as_megdnn(), + output(1)->dev_tensor().as_megdnn(), + output(2)->dev_tensor().as_megdnn(), {}); + } else { + megdnn_opr()->exec( + input(0)->dev_tensor().as_megdnn(), {}, {}, + output(0)->dev_tensor().as_megdnn(), + output(1)->dev_tensor().as_megdnn(), + output(2)->dev_tensor().as_megdnn(), {}); + } +} + +#if MGB_ENABLE_GRAD +MGB_IMPL_OPR_GRAD(LayerNormForward) { + auto p = opr.param(); + SymbolVarArray grad; + VarNodeArray ret; + if (p.affine) { + mgb_assert(wrt_idx < 3, "wrt_idx %zu is out of range", wrt_idx); 
+ grad = LayerNormBackward::make( + out_grad[0], opr.input(0), opr.input(1), opr.output(1), opr.output(2), + opr.param()); + } else { + mgb_assert(wrt_idx < 1, "wrt_idx %zu is out of range", wrt_idx); + grad = LayerNormBackward::make( + out_grad[0], opr.input(0), opr.output(1), opr.output(2), opr.param()); + } + + uint32_t nr_ret = p.affine ? 3 : 1; + for (uint32_t i = 0; i < nr_ret; ++i) { + ret.push_back(grad[i].node()); + } + return ret; +} +#endif + +/* ==================== LayerNormBackward ==================== */ +MGB_DYN_TYPE_OBJ_FINAL_IMPL(LayerNormBackward); + +LayerNormBackward::LayerNormBackward( + VarNode* diff, VarNode* data, VarNode* weight, VarNode* mean, VarNode* rstd, + const Param& param, const OperatorNodeConfig& config) + : Super({diff->owner_graph(), + config, + "layer_norm_backward", + {diff, data, weight, mean, rstd}}, + 0, true) { + init_megdnn_opr(*this, param); + add_input({diff, data, weight, mean, rstd}); +} + +LayerNormBackward::LayerNormBackward( + VarNode* diff, VarNode* data, VarNode* mean, VarNode* rstd, const Param& param, + const OperatorNodeConfig& config) + : Super({diff->owner_graph(), + config, + "layer_norm_backward", + {diff, data, mean, rstd}}, + 0, true) { + init_megdnn_opr(*this, param); + add_input({diff, data, mean, rstd}); + auto mark_empty_var = [&](VarNode* var) { + var->add_flag(VarNode::Flag::ALLOW_EMPTY_SHAPE) + .add_flag(VarNode::Flag::VOLATILE_CONTENT); + }; + mark_empty_var(output(1)); + mark_empty_var(output(2)); +} + +SymbolVarArray LayerNormBackward::make( + SymbolVar diff, SymbolVar data, SymbolVar weight, SymbolVar mean, + SymbolVar rstd, const Param& param, const OperatorNodeConfig& config) { + auto outs = diff.node() + ->owner_graph() + ->insert_opr(std::make_unique( + diff.node(), data.node(), weight.node(), mean.node(), + rstd.node(), param, config)) + ->output(); + SymbolVarArray ret; + for (auto&& out : outs) { + ret.emplace_back(out); + } + return ret; +} + +SymbolVarArray LayerNormBackward::make( + SymbolVar diff, SymbolVar data, SymbolVar mean, SymbolVar rstd, + const Param& param, const OperatorNodeConfig& config) { + auto outs = diff.node() + ->owner_graph() + ->insert_opr(std::make_unique( + diff.node(), data.node(), mean.node(), rstd.node(), + param, config)) + ->output(); + SymbolVarArray ret; + for (auto&& out : outs) { + ret.emplace_back(out); + } + return ret; +} + +void LayerNormBackward::init_output_static_infer_desc() { + using namespace cg::static_infer; + auto&& mgr = owner_graph()->static_infer_manager(); + mgr.register_shape_infer(output(0), ShapeInferDesc::make_identity(input(1))); + if (param().affine) { + mgr.register_shape_infer(output(1), ShapeInferDesc::make_identity(input(2))); + mgr.register_shape_infer(output(2), ShapeInferDesc::make_identity(input(2))); + } else { + TensorShape empty; + empty.ndim = 0; + mgr.register_shape_infer(output(1), ShapeInferDesc::make_const(empty)); + mgr.register_shape_infer(output(2), ShapeInferDesc::make_const(empty)); + } + this->init_output_static_infer_desc_workspace(false); +} + +void LayerNormBackward::init_output_dtype() { + output(0)->dtype(input(1)->dtype()); + output(1)->dtype(input(2)->dtype()); + output(2)->dtype(input(2)->dtype()); +} + +size_t LayerNormBackward::get_workspace_size_bytes( + const TensorShapeArray& input_shapes, + const TensorShapeArray& output_shapes) const { + return 0; +} + +void LayerNormBackward::scn_do_execute() { + if (param().affine) { + megdnn_opr()->exec( + input(0)->dev_tensor().as_megdnn(), input(1)->dev_tensor().as_megdnn(), + 
input(2)->dev_tensor().as_megdnn(), input(3)->dev_tensor().as_megdnn(), + input(4)->dev_tensor().as_megdnn(), output(0)->dev_tensor().as_megdnn(), + output(1)->dev_tensor().as_megdnn(), + output(2)->dev_tensor().as_megdnn(), {}); + } else { + megdnn_opr()->exec( + input(0)->dev_tensor().as_megdnn(), input(1)->dev_tensor().as_megdnn(), + {}, input(2)->dev_tensor().as_megdnn(), + input(3)->dev_tensor().as_megdnn(), output(0)->dev_tensor().as_megdnn(), + {}, {}, {}); + } +} + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/opr/impl/rand.cpp b/src/opr/impl/rand.cpp index 6be879f1..043a15e5 100644 --- a/src/opr/impl/rand.cpp +++ b/src/opr/impl/rand.cpp @@ -201,6 +201,8 @@ template class RNGOprBase<::megdnn::BetaRNG>; template class RNGOprBase<::megdnn::PoissonRNG>; template class RNGOprBase<::megdnn::ShuffleRNGForward>; template class RNGOprBase<::megdnn::ShuffleRNGBackward>; +template class RNGOprBase<::megdnn::DropoutForward>; +template class RNGOprBase<::megdnn::DropoutBackward>; #if MGB_ENABLE_GRAD IMPL(GaussianRNG); IMPL(UniformRNG); @@ -300,4 +302,134 @@ MGB_IMPL_OPR_GRAD(ShuffleRNGForward) { MGB_DYN_TYPE_OBJ_FINAL_IMPL(ShuffleRNGBackward); MEGDNN_OPR_INIT3(ShuffleRNGBackward, "shuffle_rng_bwd", 2, true) +/* ================= DropoutForward ================= */ + +MGB_DYN_TYPE_OBJ_FINAL_IMPL(DropoutForward); + +DropoutForward::DropoutForward( + VarNode* inp, const Param& param, const OperatorNodeConfig& config) + : Super({inp->owner_graph(), config, "dropout", {inp}}, param) { + add_input({inp}); + add_output(None)->dtype(inp->dtype()).add_flag(VarNode::Flag::ALLOW_EMPTY_SHAPE); + add_output(None)->dtype(dtype::Byte()).add_flag(VarNode::Flag::ALLOW_EMPTY_SHAPE); + cg::add_workspace_output(this); + add_equivalence_component>(this); +} + +SymbolVarArray DropoutForward::make( + SymbolVar inp, const Param& param, const OperatorNodeConfig& config) { + auto node = inp.node()->owner_graph()->insert_opr( + std::make_unique(inp.node(), param, config)); + mgb_assert(node->output().size() == 3); + return {node->output(0), node->output(1)}; +} + +void DropoutForward::init_output_static_infer_desc() { + using namespace cg::static_infer; + auto&& mgr = owner_graph()->static_infer_manager(); + mgr.register_shape_infer(output(0), ShapeInferDesc::make_identity(input(0))); + + auto infer_mask = [this](TensorShape& dest, const InpVal& iv) { + ensure_megdnn_opr(); + dest.ndim = 1; + dest.shape[0] = m_dnn_opr->get_mask_size_in_bytes( + {iv.val[0].shape(), input(0)->dtype()}); + return true; + }; + mgr.register_shape_infer( + output(1), {SourceType::DEP, {{input(0), DepType::SHAPE}}, infer_mask}); + + auto infer_wk = [this](TensorShape& dest, const InpVal& inp) { + ensure_megdnn_opr(); + dest.ndim = 1; + dest.shape[0] = m_dnn_opr->get_workspace_in_bytes( + {inp.val[0].shape(), input(0)->dtype()}, + {output(0)->shape(), output(0)->dtype()}, + {output(1)->shape(), output(1)->dtype()}); + return true; + }; + mgr.register_shape_infer( + output(2), {SourceType::DEP, {{input(0), DepType::SHAPE}}, infer_wk}); +} + +void DropoutForward::add_input_layout_constraint() { + input(0)->add_layout_constraint_contiguous(); +}; + +void DropoutForward::scn_do_execute() { + auto&& ret = output(0); + if (ret->layout().is_empty()) { + mgb_assert(ret->dev_tensor().empty()); + return; + } + m_dnn_opr->exec( + input(0)->dev_tensor().as_megdnn(), output(0)->dev_tensor().as_megdnn(), + output(1)->dev_tensor().as_megdnn(), + get_megdnn_workspace_from_var(output(2))); +} + 
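For context, the DropoutForward opr added above exposes dropout at graph level: make() hides the workspace output and returns only the dropped-out tensor plus the byte mask that DropoutBackward later consumes. A minimal usage sketch follows (illustrative only, not part of the patch; it assumes an existing ComputingGraph `graph` and host tensor `host_x`, and that make() provides a default OperatorNodeConfig — the variable names are hypothetical):
// Sketch: building a graph that uses the new dropout opr.
// Param fields follow megdnn::param::Dropout (drop_prob, seed).
auto x = mgb::opr::Host2DeviceCopy::make(*graph, host_x);
mgb::opr::DropoutForward::Param param;
param.drop_prob = 0.5f;  // probability of zeroing each element
auto outs = mgb::opr::DropoutForward::make(x, param);
mgb::SymbolVar y = outs[0];     // dropped-out result, same dtype as x
mgb::SymbolVar mask = outs[1];  // Byte mask, reused by DropoutBackward in the grad rule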
+cg::OperatorNodeBase::NodeProp* DropoutForward::do_make_node_prop() const { + auto prop = Super::do_make_node_prop(); + prop->add_flag(NodeProp::Flag::IMPURE_FUNC); + for (auto i : input()) { + prop->add_dep_type_existing_var(i, NodeProp::DepType::VALUE_ALLOW_EMPTY); + } + return prop; +} + +#if MGB_ENABLE_GRAD +MGB_IMPL_OPR_GRAD(DropoutForward) { + SymbolVar grad = DropoutBackward::make(out_grad[0], opr.output(1), opr.param()); + VarNodeArray ret; + ret.push_back(grad.node()); + return ret; +} +#endif + +/* ==================== DropoutBackward ==================== */ + +MGB_DYN_TYPE_OBJ_FINAL_IMPL(DropoutBackward); + +DropoutBackward::DropoutBackward( + VarNode* doup, VarNode* mask, const Param& param, + const OperatorNodeConfig& config) + : Super({doup->owner_graph(), config, "dropout_backward", {doup, mask}}, 0, + true) { + init_megdnn_opr(*this, param); + add_input({doup, mask}); +} + +SymbolVar DropoutBackward::make( + SymbolVar doup, SymbolVar mask, const Param& param, + const OperatorNodeConfig& config) { + return doup.insert_single_output_opr( + doup.node(), mask.node(), param, config); +} + +void DropoutBackward::init_output_static_infer_desc() { + using namespace cg::static_infer; + auto&& mgr = owner_graph()->static_infer_manager(); + mgr.register_shape_infer(output(0), ShapeInferDesc::make_identity(input(0))); + this->init_output_static_infer_desc_workspace(false); +} + +void DropoutBackward::init_output_dtype() { + output(0)->dtype(input(0)->dtype()); +} + +size_t DropoutBackward::get_workspace_size_bytes( + const TensorShapeArray& input_shapes, + const TensorShapeArray& output_shapes) const { + return megdnn_opr()->get_workspace_in_bytes( + {input_shapes[0], input(0)->dtype(), input(0)->format()}, + {input_shapes[1], input(1)->dtype(), input(1)->format()}, + {output_shapes[0], output(0)->dtype(), output(0)->format()}); +} + +void DropoutBackward::scn_do_execute() { + megdnn_opr()->exec( + input(0)->dev_tensor().as_megdnn(), input(1)->dev_tensor().as_megdnn(), + output(0)->dev_tensor().as_megdnn(), {}); +} + // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/opr/impl/rand.sereg.h b/src/opr/impl/rand.sereg.h index fe3bd8b1..869fb72c 100644 --- a/src/opr/impl/rand.sereg.h +++ b/src/opr/impl/rand.sereg.h @@ -29,6 +29,19 @@ struct OprMaker { return out[0].node()->owner_opr(); } }; + +// OprMaker in MGB_SEREG_OPR only support unique output opr +template <> +struct OprMaker { + using Param = opr::DropoutForward::Param; + static cg::OperatorNodeBase* make( + const Param& param, const cg::VarNodeArray& i, ComputingGraph& graph, + const OperatorNodeConfig& config) { + MGB_MARK_USED_VAR(graph); + return opr::DropoutForward::make(i[0], param, config)[0].node()->owner_opr(); + } +}; + } // namespace serialization namespace opr { @@ -43,6 +56,8 @@ MGB_SEREG_OPR(PermutationRNG, 1); MGB_SEREG_OPR(BetaRNG, 2); MGB_SEREG_OPR(ShuffleRNG, 1); MGB_SEREG_OPR(ShuffleRNGBackward, 3); +MGB_SEREG_OPR(Dropout, 1); +MGB_SEREG_OPR(DropoutBackward, 2); } // namespace opr } // namespace mgb diff --git a/src/opr/include/megbrain/opr/dnn/layer_norm.h b/src/opr/include/megbrain/opr/dnn/layer_norm.h new file mode 100644 index 00000000..29712de0 --- /dev/null +++ b/src/opr/include/megbrain/opr/dnn/layer_norm.h @@ -0,0 +1,78 @@ +/** + * \file src/opr/include/megbrain/opr/dnn/layer_norm.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "megbrain/opr/internal/megdnn_opr_wrapper.h" +#include "megdnn/oprs.h" + +namespace mgb { +namespace opr { + +MGB_DEFINE_OPR_CLASS_WITH_EXPORT( + LayerNormForward, intl::MegDNNOprWrapperFwd) // { +public: + MGE_WIN_DECLSPEC_FUC LayerNormForward( + VarNode* data, VarNode* weight, VarNode* bias, const Param& param, + const OperatorNodeConfig& config); + MGE_WIN_DECLSPEC_FUC LayerNormForward( + VarNode* data, const Param& param, const OperatorNodeConfig& config); + + MGE_WIN_DECLSPEC_FUC static SymbolVarArray make( + SymbolVar data, SymbolVar weight, SymbolVar bias, const Param& param = {}, + const OperatorNodeConfig& config = {}); + MGE_WIN_DECLSPEC_FUC static SymbolVarArray make( + SymbolVar data, const Param& param = {}, + const OperatorNodeConfig& config = {}); + +private: + void get_output_var_shape( + const TensorShapeArray& inp_shape, + TensorShapeArray& out_shape) const override; + size_t get_workspace_size_bytes( + const TensorShapeArray& input_shapes, + const TensorShapeArray& output_shapes) const override; + void scn_do_execute() override; +}; +using LayerNorm = LayerNormForward; + +MGB_DEFINE_OPR_CLASS_WITH_EXPORT( + LayerNormBackward, intl::MegDNNOprWrapperBwd) // { +public: + MGE_WIN_DECLSPEC_FUC LayerNormBackward( + VarNode* diff, VarNode* data, VarNode* weight, VarNode* mean, VarNode* rstd, + const Param& param, const OperatorNodeConfig& config); + + MGE_WIN_DECLSPEC_FUC LayerNormBackward( + VarNode* diff, VarNode* data, VarNode* mean, VarNode* rstd, + const Param& param, const OperatorNodeConfig& config); + + MGE_WIN_DECLSPEC_FUC static SymbolVarArray make( + SymbolVar diff, SymbolVar data, SymbolVar weight, SymbolVar mean, + SymbolVar rstd, const Param& param = {}, + const OperatorNodeConfig& config = {}); + MGE_WIN_DECLSPEC_FUC static SymbolVarArray make( + SymbolVar diff, SymbolVar data, SymbolVar mean, SymbolVar rstd, + const Param& param = {}, const OperatorNodeConfig& config = {}); + +private: + void init_output_static_infer_desc() override; + void init_output_dtype() override; + size_t get_workspace_size_bytes( + const TensorShapeArray& input_shapes, + const TensorShapeArray& output_shapes) const override; + void scn_do_execute() override; +}; + +} // namespace opr +} // namespace mgb + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/opr/include/megbrain/opr/dnn/pooling.h b/src/opr/include/megbrain/opr/dnn/pooling.h index 658f13f6..3b4efdde 100644 --- a/src/opr/include/megbrain/opr/dnn/pooling.h +++ b/src/opr/include/megbrain/opr/dnn/pooling.h @@ -20,38 +20,38 @@ namespace opr { MGB_DEFINE_OPR_CLASS( PoolingForward, intl::MegDNNOprWrapperFwd, - public mixin::AlgoChooserHelper) //{ + public mixin::AlgoChooserHelper) // { public: -MGE_WIN_DECLSPEC_FUC PoolingForward( - VarNode* src, const Param& param, const ExecutionPolicy& policy, - const OperatorNodeConfig& config); -MGE_WIN_DECLSPEC_FUC static SymbolVar make( - SymbolVar src, const Param& param, const ExecutionPolicy& policy = {}, - const OperatorNodeConfig& config = {}); - -void init_output_static_infer_desc() override; - -size_t get_workspace_size_bytes( - const TensorShapeArray& input_shapes, - const TensorShapeArray& output_shapes) const override; + MGE_WIN_DECLSPEC_FUC PoolingForward( + VarNode* src, const Param& param, const 
ExecutionPolicy& policy, + const OperatorNodeConfig& config); + MGE_WIN_DECLSPEC_FUC static SymbolVar make( + SymbolVar src, const Param& param, const ExecutionPolicy& policy = {}, + const OperatorNodeConfig& config = {}); + + void init_output_static_infer_desc() override; + + size_t get_workspace_size_bytes( + const TensorShapeArray& input_shapes, + const TensorShapeArray& output_shapes) const override; }; using Pooling = PoolingForward; MGB_DEFINE_OPR_CLASS( PoolingBackward, intl::MegDNNOprWrapperBwd, - public mixin::AlgoChooserHelper) //{ + public mixin::AlgoChooserHelper) // { public: -MGE_WIN_DECLSPEC_FUC PoolingBackward( - VarNode* src, VarNode* dst, VarNode* diff, const Param& param, - const ExecutionPolicy& policy, const OperatorNodeConfig& config); + MGE_WIN_DECLSPEC_FUC PoolingBackward( + VarNode* src, VarNode* dst, VarNode* diff, const Param& param, + const ExecutionPolicy& policy, const OperatorNodeConfig& config); -MGE_WIN_DECLSPEC_FUC static SymbolVar make( - SymbolVar src, SymbolVar dst, SymbolVar diff, const Param& param, - const ExecutionPolicy& policy = {}, const OperatorNodeConfig& config = {}); + MGE_WIN_DECLSPEC_FUC static SymbolVar make( + SymbolVar src, SymbolVar dst, SymbolVar diff, const Param& param, + const ExecutionPolicy& policy = {}, const OperatorNodeConfig& config = {}); -MGE_WIN_DECLSPEC_FUC size_t get_workspace_size_bytes( - const TensorShapeArray& input_shapes, - const TensorShapeArray& output_shapes) const override final; + MGE_WIN_DECLSPEC_FUC size_t get_workspace_size_bytes( + const TensorShapeArray& input_shapes, + const TensorShapeArray& output_shapes) const override final; }; } // namespace opr diff --git a/src/opr/include/megbrain/opr/internal/megdnn_opr_wrapper.h b/src/opr/include/megbrain/opr/internal/megdnn_opr_wrapper.h index c4fa8725..7491054c 100644 --- a/src/opr/include/megbrain/opr/internal/megdnn_opr_wrapper.h +++ b/src/opr/include/megbrain/opr/internal/megdnn_opr_wrapper.h @@ -86,7 +86,7 @@ MGE_WIN_DECLSPEC_FUC void add_input_layout_constraint_contig(OperatorNodeBase& o //! called in constructor to add output vars MGE_WIN_DECLSPEC_FUC void add_output_vars( OperatorNodeBase& opr, size_t nr_output, bool add_workspace); -} +} // namespace megdnn_utils /*! * \brief mixin for infer workspace size based on input and output shapes @@ -344,34 +344,34 @@ private: } // namespace mgb //! define a megdnn opr wrapper class with 1 input for forward -#define MGB_DEFINE_MEGDNN_OPR_WRAPPER_FWD1(_name) \ - MGB_DEFINE_OPR_CLASS(_name, intl::MegDNNOprWrapperFwd) \ -public: \ - _name(VarNode* p0, const Param& param, const OperatorNodeConfig& config); \ - MGE_WIN_DECLSPEC_FUC static SymbolVar make( \ - SymbolVar p0, const Param& param = {}, \ - const OperatorNodeConfig& config = {}); \ +#define MGB_DEFINE_MEGDNN_OPR_WRAPPER_FWD1(_name) \ + MGB_DEFINE_OPR_CLASS(_name, intl::MegDNNOprWrapperFwd) \ + public: \ + _name(VarNode* p0, const Param& param, const OperatorNodeConfig& config); \ + MGE_WIN_DECLSPEC_FUC static SymbolVar make( \ + SymbolVar p0, const Param& param = {}, \ + const OperatorNodeConfig& config = {}); \ } //! 
define a megdnn opr wrapper class with 2 inputs for forward -#define MGB_DEFINE_MEGDNN_OPR_WRAPPER_FWD2(_name) \ - MGB_DEFINE_OPR_CLASS(_name, intl::MegDNNOprWrapperFwd) \ -public: \ - _name(VarNode* p0, VarNode* p1, const Param& param, \ - const OperatorNodeConfig& config); \ - MGE_WIN_DECLSPEC_FUC static SymbolVar make( \ - SymbolVar p0, SymbolVar p1, const Param& param = {}, \ - const OperatorNodeConfig& config = {}); \ +#define MGB_DEFINE_MEGDNN_OPR_WRAPPER_FWD2(_name) \ + MGB_DEFINE_OPR_CLASS(_name, intl::MegDNNOprWrapperFwd) \ + public: \ + _name(VarNode* p0, VarNode* p1, const Param& param, \ + const OperatorNodeConfig& config); \ + MGE_WIN_DECLSPEC_FUC static SymbolVar make( \ + SymbolVar p0, SymbolVar p1, const Param& param = {}, \ + const OperatorNodeConfig& config = {}); \ } //! define a megdnn opr wrapper class with 3 inputs for grad #define MGB_DEFINE_MEGDNN_OPR_WRAPPER_BWD3(_name, _extra...) \ MGB_DEFINE_OPR_CLASS(_name, intl::MegDNNOprWrapperBwd) \ - _extra public : _name(VarNode* p0, VarNode* p1, VarNode* p2, const Param& param, \ - const OperatorNodeConfig& config); \ - MGE_WIN_DECLSPEC_FUC static SymbolVar make( \ - SymbolVar p0, SymbolVar p1, SymbolVar p2, const Param& param = {}, \ - const OperatorNodeConfig& config = {}); \ + _extra public : _name(VarNode* p0, VarNode* p1, VarNode* p2, \ + const Param& param, const OperatorNodeConfig& config); \ + MGE_WIN_DECLSPEC_FUC static SymbolVar make( \ + SymbolVar p0, SymbolVar p1, SymbolVar p2, const Param& param = {}, \ + const OperatorNodeConfig& config = {}); \ } // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/opr/include/megbrain/opr/rand.h b/src/opr/include/megbrain/opr/rand.h index 1424dca7..e7199ccf 100644 --- a/src/opr/include/megbrain/opr/rand.h +++ b/src/opr/include/megbrain/opr/rand.h @@ -40,25 +40,25 @@ protected: }; /* ================= RNG with shape ================= */ -#define _DEFINE_RNG_OPR_WITH_SHAPE_CLASS(RNG) \ - MGB_DEFINE_OPR_CLASS_WITH_EXPORT(RNG, RNGOprBase) \ - cg::OperatorNodeBase::NodeProp* do_make_node_prop() const override; \ - \ -public: \ - RNG(VarNode* shape, const Param& param, const OperatorNodeConfig& config); \ - MGE_WIN_DECLSPEC_FUC static SymbolVar make( \ - SymbolVar shape, const Param& param = {}, \ - const OperatorNodeConfig& config = {}); \ - static SymbolVar make( \ - ComputingGraph& graph, const TensorShape& shape, \ - const OperatorNodeConfig& config, const Param& param = {}) { \ - return make( \ - var_from_tensor_shape(graph, config, "rng", shape), param, config); \ - } \ - void init_output_static_infer_desc() override; \ - void scn_do_execute() override; \ - } \ - ; +#define _DEFINE_RNG_OPR_WITH_SHAPE_CLASS(RNG) \ + MGB_DEFINE_OPR_CLASS_WITH_EXPORT(RNG, RNGOprBase) \ + cg::OperatorNodeBase::NodeProp* do_make_node_prop() const override; \ + \ + public: \ + RNG(VarNode* shape, const Param& param, const OperatorNodeConfig& config); \ + MGE_WIN_DECLSPEC_FUC static SymbolVar make( \ + SymbolVar shape, const Param& param = {}, \ + const OperatorNodeConfig& config = {}); \ + static SymbolVar make( \ + ComputingGraph& graph, const TensorShape& shape, \ + const OperatorNodeConfig& config, const Param& param = {}) { \ + return make( \ + var_from_tensor_shape(graph, config, "rng", shape), param, \ + config); \ + } \ + void init_output_static_infer_desc() override; \ + void scn_do_execute() override; \ + }; _DEFINE_RNG_OPR_WITH_SHAPE_CLASS(UniformRNG) _DEFINE_RNG_OPR_WITH_SHAPE_CLASS(GaussianRNG) @@ -66,20 +66,19 @@ 
_DEFINE_RNG_OPR_WITH_SHAPE_CLASS(PermutationRNG) #undef _DEFINE_RNG_OPR_WITH_SHAPE_CLASS /* ================= RNG with input ================= */ -#define _DEFINE_RNG_OPR_WITH_INPUT_CLASS(RNG) \ - MGB_DEFINE_OPR_CLASS_WITH_EXPORT(RNG, RNGOprBase) \ - void add_input_layout_constraint() override; \ - cg::OperatorNodeBase::NodeProp* do_make_node_prop() const override; \ - \ -public: \ - RNG(_INPUTS(VarNode*), const Param& param, const OperatorNodeConfig& config); \ - MGE_WIN_DECLSPEC_FUC static _OUTPUTS make( \ - _INPUTS(SymbolVar), const Param& param = {}, \ - const OperatorNodeConfig& config = {}); \ - void init_output_static_infer_desc() override; \ - void scn_do_execute() override; \ - } \ - ; +#define _DEFINE_RNG_OPR_WITH_INPUT_CLASS(RNG) \ + MGB_DEFINE_OPR_CLASS_WITH_EXPORT(RNG, RNGOprBase) \ + void add_input_layout_constraint() override; \ + cg::OperatorNodeBase::NodeProp* do_make_node_prop() const override; \ + \ + public: \ + RNG(_INPUTS(VarNode*), const Param& param, const OperatorNodeConfig& config); \ + MGE_WIN_DECLSPEC_FUC static _OUTPUTS make( \ + _INPUTS(SymbolVar), const Param& param = {}, \ + const OperatorNodeConfig& config = {}); \ + void init_output_static_infer_desc() override; \ + void scn_do_execute() override; \ + }; /* ================= 1 input ================= */ #define _INPUTS(preifx) preifx i0 @@ -88,6 +87,7 @@ _DEFINE_RNG_OPR_WITH_INPUT_CLASS(PoissonRNG) #undef _OUTPUTS #define _OUTPUTS SymbolVarArray _DEFINE_RNG_OPR_WITH_INPUT_CLASS(ShuffleRNGForward) +_DEFINE_RNG_OPR_WITH_INPUT_CLASS(DropoutForward) #undef _OUTPUTS #undef _INPUTS @@ -100,7 +100,7 @@ _DEFINE_RNG_OPR_WITH_INPUT_CLASS(GammaRNG) #undef _INPUTS #undef _DEFINE_RNG_OPR_WITH_INPUT_CLASS -} // intl +} // namespace intl using UniformRNG = intl::UniformRNG; using GaussianRNG = intl::GaussianRNG; @@ -109,18 +109,39 @@ using PermutationRNG = intl::PermutationRNG; using PoissonRNG = intl::PoissonRNG; using BetaRNG = intl::BetaRNG; using ShuffleRNG = intl::ShuffleRNGForward; +using Dropout = intl::DropoutForward; +using DropoutForward = intl::DropoutForward; MGB_DEFINE_OPR_CLASS_WITH_EXPORT( - ShuffleRNGBackward, - intl::MegDNNOprWrapperBwd) //{ + ShuffleRNGBackward, intl::MegDNNOprWrapperBwd) // { public: -ShuffleRNGBackward( - VarNode* out_diff, VarNode* indices, VarNode* result_shape, const Param& param, - const OperatorNodeConfig& config); + ShuffleRNGBackward( + VarNode* out_diff, VarNode* indices, VarNode* result_shape, + const Param& param, const OperatorNodeConfig& config); -MGE_WIN_DECLSPEC_FUC static SymbolVar make( - SymbolVar out_diff, SymbolVar indices, SymbolVar result_shape, - const Param& param = {}, const OperatorNodeConfig& config = {}); + MGE_WIN_DECLSPEC_FUC static SymbolVar make( + SymbolVar out_diff, SymbolVar indices, SymbolVar result_shape, + const Param& param = {}, const OperatorNodeConfig& config = {}); +}; + +MGB_DEFINE_OPR_CLASS_WITH_EXPORT( + DropoutBackward, intl::MegDNNOprWrapperBwd) // { +public: + MGE_WIN_DECLSPEC_FUC DropoutBackward( + VarNode* doup, VarNode* mask, const Param& param, + const OperatorNodeConfig& config); + + MGE_WIN_DECLSPEC_FUC static SymbolVar make( + SymbolVar doup, SymbolVar mask, const Param& param = {}, + const OperatorNodeConfig& config = {}); + +private: + void init_output_static_infer_desc() override; + void init_output_dtype() override; + size_t get_workspace_size_bytes( + const TensorShapeArray& input_shapes, + const TensorShapeArray& output_shapes) const override; + void scn_do_execute() override; }; } // namespace opr diff --git 
a/src/opr/test/dnn/layer_norm.cpp b/src/opr/test/dnn/layer_norm.cpp new file mode 100644 index 00000000..15db672c --- /dev/null +++ b/src/opr/test/dnn/layer_norm.cpp @@ -0,0 +1,108 @@ +/** + * \file src/opr/test/dnn/layer_norm.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#include "megbrain/opr/dnn/layer_norm.h" +#include "megbrain/comp_node_env.h" +#include "megbrain/test/autocheck.h" +#include "megbrain/test/helper.h" +#include "megbrain/test/megdnn_helper.h" + +#include "megdnn/oprs.h" + +#include +#include +#include +#include + +using namespace mgb; + +namespace { +using Param = opr::LayerNormForward::Param; + +void run_forward(bool is_affine, size_t normalized_size) { + using Checker = AutoOprChecker<3, 3>; + + Param param; + param.eps = 1e-5; + param.affine = is_affine; + param.normalized_dim = 1; + param.normalized_size = normalized_size; + + auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray { + auto out = opr::LayerNormForward::make(inputs[0], inputs[1], inputs[2], param); + return {out[0], out[1], out[2]}; + }; + + auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) { + auto opr = + MegDNNHandle::get(CompNodeEnv::from_comp_node(CompNode::default_cpu())) + ->create_operator(); + auto inp_shape = inp[0]->shape(); + auto n_slices = inp_shape[0]; + auto slice_len = inp_shape[1]; + + opr->param() = param; + + dest[0].dtype(dtype::Float32()) + .comp_node(inp[0]->comp_node()) + .resize({n_slices, slice_len}); + dest[1].dtype(dtype::Float32()) + .comp_node(inp[0]->comp_node()) + .resize({n_slices}); + dest[2].dtype(dtype::Float32()) + .comp_node(inp[0]->comp_node()) + .resize({n_slices}); + opr->exec( + inp[0]->as_megdnn(), inp[1]->as_megdnn(), inp[2]->as_megdnn(), + dest[0].as_megdnn(), dest[1].as_megdnn(), dest[2].as_megdnn(), {}); + }; + + auto gen = [&](HostTensorND& src) { + HostTensorGenerator src_gen(0.f); + src = *src_gen(src.shape(), src.comp_node()); + }; + + Checker::RunOptions option; + option.numdiff_max_err = 1e-4; + Checker checker{make_graph, fwd}; + + checker.set_input_generator(0, gen); + checker.set_input_generator(1, gen); + checker.set_input_generator(2, gen); + checker.set_input_allow_grad(0, false); + checker.set_input_allow_grad(1, false); + checker.set_input_allow_grad(2, false); + checker.set_output_allow_grad(0, false); + checker.set_output_allow_grad(1, false); + checker.set_output_allow_grad(2, false); + + checker.run({TensorShape{normalized_size, normalized_size}, + TensorShape{normalized_size}, TensorShape{normalized_size}}, + option) + .run({TensorShape{normalized_size, normalized_size}, + TensorShape{normalized_size}, TensorShape{normalized_size}}, + option) + .run({TensorShape{normalized_size, normalized_size}, + TensorShape{normalized_size}, TensorShape{normalized_size}}, + option); +} + +TEST(TestOprDNN, LayerNormForwardAffine) { + REQUIRE_GPU(1); + run_forward(true, 1); + run_forward(true, 16); + run_forward(true, 17); +} + +} // anonymous namespace + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/opr/test/rand.cpp b/src/opr/test/rand.cpp index b584629a..a7171a19 100644 --- a/src/opr/test/rand.cpp +++ b/src/opr/test/rand.cpp @@ -446,4 
+446,42 @@ TEST(TestOprRand, PermutationReprod) {
     });
 }
 
+TEST(TestOprRand, Dropout) {
+    auto run = [&](TensorShape shape, uint64_t seed, float drop_prob) {
+        using Param = megdnn::DropoutBase::Param;
+        Param param(drop_prob, seed);
+        float scale = 1.0 / (1.0 - drop_prob);
+
+        std::shared_ptr<HostTensorND> inp_host(
+                new HostTensorND{CompNode::load("xpux"), shape, dtype::Float32()});
+        for (size_t i = 0; i < shape.total_nr_elems(); ++i) {
+            inp_host->ptr<float>()[i] = 1.0f;
+        }
+
+        auto graph = ComputingGraph::make();
+        auto inp_sym = opr::Host2DeviceCopy::make(*graph, inp_host);
+        auto outs = opr::DropoutForward::make(inp_sym, param);
+
+        HostTensorND oup_host, mask_host, ws_host;
+        auto func = graph->compile(
+                {make_callback_copy(outs[0], oup_host),
+                 make_callback_copy(outs[1], mask_host)});
+        func->execute();
+
+        size_t dropped_cnt = 0;
+        for (size_t i = 0; i < shape.total_nr_elems(); ++i) {
+            ASSERT_TRUE(
+                    oup_host.ptr<float>()[i] == 0 ||
+                    oup_host.ptr<float>()[i] == scale);
+            if (oup_host.ptr<float>()[i] == 0) {
+                dropped_cnt++;
+            }
+        }
+        float real_drop = dropped_cnt * 1.0 / shape.total_nr_elems();
+        ASSERT_LT(std::abs(drop_prob - real_drop), 1e-2);
+    };
+    run({100000}, 0, 0.2);
+    run({64, 32, 16, 16}, 1, 0.4);
+}
+
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/src/serialization/impl/schema.fbs b/src/serialization/impl/schema.fbs
index f91477e6..7b3aa4ed 100644
--- a/src/serialization/impl/schema.fbs
+++ b/src/serialization/impl/schema.fbs
@@ -116,6 +116,8 @@ union OperatorParam {
     param.Padding = 82,
     param.ShuffleRNG = 83,
     param.CheckNonFinite = 84,
+    param.LayerNorm = 85,
+    param.Dropout = 86,
 }
 
 table Operator {
diff --git a/src/tensorrt/impl/tensorrt_runtime_opr.cpp b/src/tensorrt/impl/tensorrt_runtime_opr.cpp
index ef232d8c..822116bb 100644
--- a/src/tensorrt/impl/tensorrt_runtime_opr.cpp
+++ b/src/tensorrt/impl/tensorrt_runtime_opr.cpp
@@ -107,6 +107,7 @@ TensorRTRuntimeOpr::TensorRTRuntimeOpr(
 void TensorRTRuntimeOpr::get_output_var_shape(
         const TensorShapeArray& inp_shape, TensorShapeArray& out_shape) const {
     auto batch = inp_shape.at(0)[0];
+    m_manager.clear_trt_context();
     m_manager.create_trt_context(this->comp_node(), inp_shape, m_engine.get());
     auto get_mgb_shape = [&](int binding_idx) -> TensorShape {
         auto dims = m_engine->getBindingDimensions(binding_idx);
@@ -217,6 +218,12 @@ SymbolVarArray TensorRTRuntimeOpr::make(
         std::shared_ptr engine, std::shared_ptr gpu_allocator,
         const SymbolVarArray& src, const OperatorNodeConfig& config) {
+    mgb_assert(
+            NV_TENSORRT_VERSION == getInferLibVersion(),
+            "TensorRT version mismatch: compiled with %d but detected %d at runtime; "
+            "this may be caused by a customized environment, for example "
+            "LD_LIBRARY_PATH on Linux or PATH on Windows!",
+            NV_TENSORRT_VERSION, getInferLibVersion());
     VarNodeArray var_node_array = cg::to_var_node_array(src);
     auto tensor_rt_opr = std::make_unique<TensorRTRuntimeOpr>(
             std::move(engine), std::move(gpu_allocator), var_node_array, config);
diff --git a/src/version.ld b/src/version.ld
index db71a72b..f70a5677 100644
--- a/src/version.ld
+++ b/src/version.ld
@@ -13,8 +13,6 @@ global:
     base_exceptions*;
   };
   megcore*;
-  *GroupClientProxy*;
-  *create_zmqrpc_server*;
   *custom*;
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index dd4b2c7e..030e5e6e 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,52 +1,63 @@
 include_directories("./src/include")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter")
-file(GLOB_RECURSE SOURCES ./*.cpp ../src/core/test/*.cpp ../src/gopt/test/*.cpp ../src/opr/test/*.cpp 
../src/plugin/test/*.cpp ../src/serialization/test/*.cpp) +file( + GLOB_RECURSE + SOURCES + ./*.cpp + ../src/core/test/*.cpp + ../src/gopt/test/*.cpp + ../src/opr/test/*.cpp + ../src/plugin/test/*.cpp + ../src/serialization/test/*.cpp) if(MGE_WITH_JIT) - file(GLOB_RECURSE SOURCES_ ../src/jit/test/*.cpp) - list(APPEND SOURCES ${SOURCES_}) + file(GLOB_RECURSE SOURCES_ ../src/jit/test/*.cpp) + list(APPEND SOURCES ${SOURCES_}) endif() if(MGE_WITH_DISTRIBUTED) - file(GLOB_RECURSE SOURCES_ ../src/opr-mm/test/*.cpp) - list(APPEND SOURCES ${SOURCES_}) + file(GLOB_RECURSE SOURCES_ ../src/opr-mm/test/*.cpp) + list(APPEND SOURCES ${SOURCES_}) endif() -if (MGE_WITH_CUDA AND MGE_WITH_TRT) - file(GLOB_RECURSE SOURCES_ ../src/tensorrt/test/*.cpp) - list(APPEND SOURCES ${SOURCES_}) +if(MGE_WITH_CUDA AND MGE_WITH_TRT) + file(GLOB_RECURSE SOURCES_ ../src/tensorrt/test/*.cpp) + list(APPEND SOURCES ${SOURCES_}) endif() add_executable(megbrain_test ${SOURCES}) if(WIN32 OR MSVC) - target_compile_definitions(megbrain_test PRIVATE MGE_WINDOWS_STATIC_LINK) + target_compile_definitions(megbrain_test PRIVATE MGE_WINDOWS_STATIC_LINK) endif() target_link_libraries(megbrain_test gtest gmock) target_link_libraries(megbrain_test megbrain megdnn ${MGE_CUDA_LIBS}) -if (MGE_WITH_CUDA) - target_include_directories(megbrain_test PRIVATE ${CUDNN_INCLUDE_DIR}) +if(MGE_WITH_CUDA) + target_include_directories(megbrain_test PRIVATE ${CUDNN_INCLUDE_DIR}) endif() if(CXX_SUPPORT_WCLASS_MEMACCESS) - if(MGE_WITH_CUDA) - target_compile_options(megbrain_test PRIVATE "$<$:-Xcompiler=-Wno-class-memaccess>" - "$<$>:-Wno-class-memaccess>") - else() - target_compile_options(megbrain_test PRIVATE "-Wno-class-memaccess") - endif() + if(MGE_WITH_CUDA) + target_compile_options( + megbrain_test + PRIVATE "$<$:-Xcompiler=-Wno-class-memaccess>" + "$<$>:-Wno-class-memaccess>") + else() + target_compile_options(megbrain_test PRIVATE "-Wno-class-memaccess") + endif() endif() if(UNIX) - if(APPLE OR ANDROID) - target_link_libraries(megbrain_test dl) - else() - target_link_libraries(megbrain_test dl rt) - endif() + if(APPLE OR ANDROID) + target_link_libraries(megbrain_test dl) + else() + target_link_libraries(megbrain_test dl rt) + endif() endif() -if (MGE_WITH_DISTRIBUTED) - target_link_libraries(megbrain_test megray) +if(MGE_WITH_DISTRIBUTED) + target_link_libraries(megbrain_test megray) endif() if(MGE_WITH_JIT) - if(MGE_WITH_JIT_MLIR) - add_subdirectory(${PROJECT_SOURCE_DIR}/src/jit/test/mlir ${CMAKE_CURRENT_BINARY_DIR}/../src/jit/test/mlir) - endif() + if(MGE_WITH_JIT_MLIR) + add_subdirectory(${PROJECT_SOURCE_DIR}/src/jit/test/mlir + ${CMAKE_CURRENT_BINARY_DIR}/../src/jit/test/mlir) + endif() endif() diff --git a/toolchains/aarch64-linux-gnu.toolchain.cmake b/toolchains/aarch64-linux-gnu.toolchain.cmake index 525817b3..cb09256f 100644 --- a/toolchains/aarch64-linux-gnu.toolchain.cmake +++ b/toolchains/aarch64-linux-gnu.toolchain.cmake @@ -2,8 +2,8 @@ set(ARM_CROSS_BUILD_ARCH aarch64) set(CMAKE_C_COMPILER "aarch64-linux-gnu-gcc") set(CMAKE_CXX_COMPILER "aarch64-linux-gnu-g++") if("$ENV{FORCE_CHECK_UNUSED_PARAMETER}" STREQUAL "true") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=unused-parameter") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=unused-parameter") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=unused-parameter") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=unused-parameter") endif() set(CMAKE_STRIP "aarch64-linux-gnu-strip") set(CMAKE_SYSTEM_PROCESSOR aarch64) diff --git a/toolchains/aarch64-none-linux-gnu.toolchain.cmake 
b/toolchains/aarch64-none-linux-gnu.toolchain.cmake index e16d3766..637c0c36 100644 --- a/toolchains/aarch64-none-linux-gnu.toolchain.cmake +++ b/toolchains/aarch64-none-linux-gnu.toolchain.cmake @@ -4,8 +4,8 @@ set(CMAKE_CXX_COMPILER "aarch64-none-linux-gnu-g++") set(CMAKE_C_FLAGS "-Wno-psabi") set(CMAKE_CXX_FLAGS "-Wno-psabi") if("$ENV{FORCE_CHECK_UNUSED_PARAMETER}" STREQUAL "true") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=unused-parameter") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=unused-parameter") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=unused-parameter") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=unused-parameter") endif() set(CMAKE_STRIP "aarch64-none-linux-gnu-strip") set(CMAKE_SYSTEM_PROCESSOR aarch64) diff --git a/toolchains/arm-linux-gnueabi.toolchain.cmake b/toolchains/arm-linux-gnueabi.toolchain.cmake index 471b7806..bc4bc229 100644 --- a/toolchains/arm-linux-gnueabi.toolchain.cmake +++ b/toolchains/arm-linux-gnueabi.toolchain.cmake @@ -4,8 +4,8 @@ set(CMAKE_CXX_COMPILER "arm-linux-gnueabi-g++") set(CMAKE_C_FLAGS "-mfloat-abi=softfp -mfpu=neon-vfpv4 -Wno-psabi") set(CMAKE_CXX_FLAGS "-mfloat-abi=softfp -mfpu=neon-vfpv4 -Wno-psabi") if("$ENV{FORCE_CHECK_UNUSED_PARAMETER}" STREQUAL "true") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=unused-parameter") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=unused-parameter") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=unused-parameter") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=unused-parameter") endif() set(CMAKE_STRIP "arm-linux-gnueabi-strip") set(CMAKE_SYSTEM_PROCESSOR armv7) diff --git a/toolchains/arm-linux-gnueabihf.toolchain.cmake b/toolchains/arm-linux-gnueabihf.toolchain.cmake index b9e36412..4e29ae6d 100644 --- a/toolchains/arm-linux-gnueabihf.toolchain.cmake +++ b/toolchains/arm-linux-gnueabihf.toolchain.cmake @@ -4,8 +4,8 @@ set(CMAKE_CXX_COMPILER "arm-linux-gnueabihf-g++") set(CMAKE_C_FLAGS "-mfloat-abi=hard -mfpu=neon-vfpv4 -Wno-psabi") set(CMAKE_CXX_FLAGS "-mfloat-abi=hard -mfpu=neon-vfpv4 -Wno-psabi") if("$ENV{FORCE_CHECK_UNUSED_PARAMETER}" STREQUAL "true") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=unused-parameter") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=unused-parameter") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=unused-parameter") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=unused-parameter") endif() set(CMAKE_STRIP "arm-linux-gnueabihf-strip") set(CMAKE_SYSTEM_PROCESSOR armv7) diff --git a/toolchains/ios.toolchain.cmake b/toolchains/ios.toolchain.cmake index 26eabf51..c57174ff 100644 --- a/toolchains/ios.toolchain.cmake +++ b/toolchains/ios.toolchain.cmake @@ -1,103 +1,87 @@ # This file is part of the ios-cmake project. It was retrieved from # https://github.com/cristeab/ios-cmake.git, which is a fork of -# https://code.google.com/p/ios-cmake/. Which in turn is based off of -# the Platform/Darwin.cmake and Platform/UnixPaths.cmake files which -# are included with CMake 2.8.4 +# https://code.google.com/p/ios-cmake/. Which in turn is based off of the +# Platform/Darwin.cmake and Platform/UnixPaths.cmake files which are included with CMake +# 2.8.4 # # The ios-cmake project is licensed under the new BSD license. # -# Copyright (c) 2014, Bogdan Cristea and LTE Engineering Software, -# Kitware, Inc., Insight Software Consortium. All rights reserved. -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# 1. 
Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. +# Copyright (c) 2014, Bogdan Cristea and LTE Engineering Software, Kitware, Inc., +# Insight Software Consortium. All rights reserved. Redistribution and use in source +# and binary forms, with or without modification, are permitted provided that the +# following conditions are met: 1. Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following disclaimer. # -# 2. Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. +# 1. Redistributions in binary form must reproduce the above copyright notice, this list +# of conditions and the following disclaimer in the documentation and/or other +# materials provided with the distribution. # -# 3. Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. +# 1. Neither the name of the copyright holder nor the names of its contributors may be +# used to endorse or promote products derived from this software without specific +# prior written permission. # -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +# SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH +# DAMAGE. # -# This file is based off of the Platform/Darwin.cmake and -# Platform/UnixPaths.cmake files which are included with CMake 2.8.4 -# It has been altered for iOS development. +# This file is based off of the Platform/Darwin.cmake and Platform/UnixPaths.cmake files +# which are included with CMake 2.8.4 It has been altered for iOS development. 
# # Updated by Alex Stewart (alexs.mac@gmail.com) # # ***************************************************************************** -# Now maintained by Alexander Widerberg (widerbergaren [at] gmail.com) -# under the BSD-3-Clause license -# https://github.com/leetal/ios-cmake +# Now maintained by Alexander Widerberg (widerbergaren [at] gmail.com) under the +# BSD-3-Clause license https://github.com/leetal/ios-cmake # ***************************************************************************** # -# INFORMATION / HELP +# INFORMATION / HELP # # The following variables control the behaviour of this toolchain: # -# IOS_PLATFORM: OS (default) or SIMULATOR or SIMULATOR64 or TVOS or SIMULATOR_TVOS or WATCHOS or SIMULATOR_WATCHOS -# OS = Build for iPhoneOS. -# OS64 = Build for arm64 arm64e iPhoneOS. -# SIMULATOR = Build for x86 i386 iPhone Simulator. -# SIMULATOR64 = Build for x86_64 iPhone Simulator. -# TVOS = Build for AppleTVOS. -# SIMULATOR_TVOS = Build for x86_64 AppleTV Simulator. -# WATCHOS = Build for armv7k arm64_32 for WatchOS. -# SIMULATOR_WATCHOS = Build for x86_64 for Watch Simulator. -# CMAKE_OSX_SYSROOT: Path to the iOS SDK to use. By default this is -# automatically determined from IOS_PLATFORM and xcodebuild, but -# can also be manually specified (although this should not be required). -# CMAKE_IOS_DEVELOPER_ROOT: Path to the Developer directory for the iOS platform -# being compiled for. By default this is automatically determined from -# CMAKE_OSX_SYSROOT, but can also be manually specified (although this should -# not be required). -# ENABLE_BITCODE: (1|0) Enables or disables bitcode support. Default 1 (true) -# ENABLE_ARC: (1|0) Enables or disables ARC support. Default 1 (true, ARC enabled by default) -# ENABLE_VISIBILITY: (1|0) Enables or disables symbol visibility support. Default 0 (false, visibility hidden by default) -# IOS_ARCH: (armv7 armv7s armv7k arm64 arm64e arm64_32 i386 x86_64) If specified, will override the default architectures for the given IOS_PLATFORM -# OS = armv7 armv7s arm64 arm64e (if applicable) -# OS64 = arm64 arm64e (if applicable) -# SIMULATOR = i386 x86_64 -# SIMULATOR64 = x86_64 -# TVOS = arm64 -# SIMULATOR_TVOS = x86_64 (i386 has since long been deprecated) -# WATCHOS = armv7k arm64_32 (if applicable) -# SIMULATOR_WATCHOS = x86_64 (i386 has since long been deprecated) +# IOS_PLATFORM: OS (default) or SIMULATOR or SIMULATOR64 or TVOS or SIMULATOR_TVOS or +# WATCHOS or SIMULATOR_WATCHOS OS = Build for iPhoneOS. OS64 = Build for arm64 arm64e +# iPhoneOS. SIMULATOR = Build for x86 i386 iPhone Simulator. SIMULATOR64 = Build for +# x86_64 iPhone Simulator. TVOS = Build for AppleTVOS. SIMULATOR_TVOS = Build for x86_64 +# AppleTV Simulator. WATCHOS = Build for armv7k arm64_32 for WatchOS. SIMULATOR_WATCHOS +# = Build for x86_64 for Watch Simulator. CMAKE_OSX_SYSROOT: Path to the iOS SDK to use. +# By default this is automatically determined from IOS_PLATFORM and xcodebuild, but can +# also be manually specified (although this should not be required). +# CMAKE_IOS_DEVELOPER_ROOT: Path to the Developer directory for the iOS platform being +# compiled for. By default this is automatically determined from CMAKE_OSX_SYSROOT, but +# can also be manually specified (although this should not be required). ENABLE_BITCODE: +# (1|0) Enables or disables bitcode support. Default 1 (true) ENABLE_ARC: (1|0) Enables +# or disables ARC support. Default 1 (true, ARC enabled by default) ENABLE_VISIBILITY: +# (1|0) Enables or disables symbol visibility support. 
Default 0 (false, visibility +# hidden by default) IOS_ARCH: (armv7 armv7s armv7k arm64 arm64e arm64_32 i386 x86_64) +# If specified, will override the default architectures for the given IOS_PLATFORM OS = +# armv7 armv7s arm64 arm64e (if applicable) OS64 = arm64 arm64e (if applicable) +# SIMULATOR = i386 x86_64 SIMULATOR64 = x86_64 TVOS = arm64 SIMULATOR_TVOS = x86_64 +# (i386 has since long been deprecated) WATCHOS = armv7k arm64_32 (if applicable) +# SIMULATOR_WATCHOS = x86_64 (i386 has since long been deprecated) # # This toolchain defines the following variables for use externally: # # XCODE_VERSION: Version number (not including Build version) of Xcode detected. -# IOS_SDK_VERSION: Version of iOS SDK being used. -# CMAKE_OSX_ARCHITECTURES: Architectures being compiled for (generated from -# IOS_PLATFORM). +# IOS_SDK_VERSION: Version of iOS SDK being used. CMAKE_OSX_ARCHITECTURES: Architectures +# being compiled for (generated from IOS_PLATFORM). # # This toolchain defines the following macros for use externally: # -# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE XCODE_VARIANT) -# A convenience macro for setting xcode specific properties on targets. -# Available variants are: All, Release, RelWithDebInfo, Debug, MinSizeRel -# example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1" "all"). +# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE XCODE_VARIANT) A convenience +# macro for setting xcode specific properties on targets. Available variants are: All, +# Release, RelWithDebInfo, Debug, MinSizeRel example: set_xcode_property (myioslib +# IPHONEOS_DEPLOYMENT_TARGET "3.1" "all"). # -# find_host_package (PROGRAM ARGS) -# A macro used to find executable programs on the host system, not within the -# iOS environment. Thanks to the android-cmake project for providing the -# command. +# find_host_package (PROGRAM ARGS) A macro used to find executable programs on the host +# system, not within the iOS environment. Thanks to the android-cmake project for +# providing the command. # Fix for PThread library not in path set(CMAKE_THREAD_LIBS_INIT "-lpthread") @@ -106,57 +90,58 @@ set(CMAKE_USE_WIN32_THREADS_INIT 0) set(CMAKE_USE_PTHREADS_INIT 1) # Get the Xcode version being used. -execute_process(COMMAND xcodebuild -version +execute_process( + COMMAND xcodebuild -version OUTPUT_VARIABLE XCODE_VERSION - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) string(REGEX MATCH "Xcode [0-9\\.]+" XCODE_VERSION "${XCODE_VERSION}") string(REGEX REPLACE "Xcode ([0-9\\.]+)" "\\1" XCODE_VERSION "${XCODE_VERSION}") message(STATUS "Building with Xcode version: ${XCODE_VERSION}") -# Default to building for iPhoneOS if not specified otherwise, and we cannot -# determine the platform from the CMAKE_OSX_ARCHITECTURES variable. The use -# of CMAKE_OSX_ARCHITECTURES is such that try_compile() projects can correctly -# determine the value of IOS_PLATFORM from the root project, as -# CMAKE_OSX_ARCHITECTURES is propagated to them by CMake. -if (NOT DEFINED IOS_PLATFORM) - if (CMAKE_OSX_ARCHITECTURES) - if (CMAKE_OSX_ARCHITECTURES MATCHES ".*arm.*") +# Default to building for iPhoneOS if not specified otherwise, and we cannot determine +# the platform from the CMAKE_OSX_ARCHITECTURES variable. The use of +# CMAKE_OSX_ARCHITECTURES is such that try_compile() projects can correctly determine +# the value of IOS_PLATFORM from the root project, as CMAKE_OSX_ARCHITECTURES is +# propagated to them by CMake. 
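For reference, a minimal sketch of how the cache variables documented in the INFORMATION / HELP block above might be pre-seeded for an arm64 device build. The values are illustrative assumptions only, not recommendations; the variable names and doc strings are the ones this toolchain file itself defines:

# Illustrative cache pre-seed for an arm64 iPhoneOS build (values are examples).
set(IOS_PLATFORM "OS64" CACHE STRING "Type of iOS platform for which to build.")
set(IOS_ARCH arm64 CACHE STRING "")
set(IOS_DEPLOYMENT_TARGET "10.0" CACHE STRING "Minimum iOS version to build for.")
set(ENABLE_BITCODE FALSE CACHE BOOL "Whether or not to enable bitcode")

In practice the same settings are usually passed as -D options on the cmake command line together with -DCMAKE_TOOLCHAIN_FILE=toolchains/ios.toolchain.cmake, which has the same effect as the cache entries shown here.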
+if(NOT DEFINED IOS_PLATFORM) + if(CMAKE_OSX_ARCHITECTURES) + if(CMAKE_OSX_ARCHITECTURES MATCHES ".*arm.*") set(IOS_PLATFORM "OS") - elseif (CMAKE_OSX_ARCHITECTURES MATCHES "i386") + elseif(CMAKE_OSX_ARCHITECTURES MATCHES "i386") set(IOS_PLATFORM "SIMULATOR") - elseif (CMAKE_OSX_ARCHITECTURES MATCHES "x86_64") + elseif(CMAKE_OSX_ARCHITECTURES MATCHES "x86_64") set(IOS_PLATFORM "SIMULATOR64") - elseif (CMAKE_OSX_ARCHITECTURES MATCHES "armv7k") + elseif(CMAKE_OSX_ARCHITECTURES MATCHES "armv7k") set(IOS_PLATFORM "WATCHOS") endif() endif() - if (NOT IOS_PLATFORM) + if(NOT IOS_PLATFORM) set(IOS_PLATFORM "OS") endif() endif() -set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING - "Type of iOS platform for which to build.") -# Determine the platform name and architectures for use in xcodebuild commands -# from the specified IOS_PLATFORM name. -if (IOS_PLATFORM STREQUAL "OS") +set(IOS_PLATFORM + ${IOS_PLATFORM} + CACHE STRING "Type of iOS platform for which to build.") +# Determine the platform name and architectures for use in xcodebuild commands from the +# specified IOS_PLATFORM name. +if(IOS_PLATFORM STREQUAL "OS") set(XCODE_IOS_PLATFORM iphoneos) if(NOT IOS_ARCH) - if (XCODE_VERSION VERSION_GREATER 10.0) + if(XCODE_VERSION VERSION_GREATER 10.0) set(IOS_ARCH armv7 armv7s arm64 arm64e) else() set(IOS_ARCH armv7 armv7s arm64) endif() endif() - elseif (IOS_PLATFORM STREQUAL "OS64") +elseif(IOS_PLATFORM STREQUAL "OS64") set(XCODE_IOS_PLATFORM iphoneos) if(NOT IOS_ARCH) - if (XCODE_VERSION VERSION_GREATER 10.0) + if(XCODE_VERSION VERSION_GREATER 10.0) set(IOS_ARCH arm64 arm64e) else() set(IOS_ARCH arm64) endif() endif() -elseif (IOS_PLATFORM STREQUAL "SIMULATOR") +elseif(IOS_PLATFORM STREQUAL "SIMULATOR") set(XCODE_IOS_PLATFORM iphonesimulator) if(NOT IOS_ARCH) set(IOS_ARCH i386 x86_64) @@ -166,26 +151,26 @@ elseif(IOS_PLATFORM STREQUAL "SIMULATOR64") if(NOT IOS_ARCH) set(IOS_ARCH x86_64) endif() -elseif (IOS_PLATFORM STREQUAL "TVOS") +elseif(IOS_PLATFORM STREQUAL "TVOS") set(XCODE_IOS_PLATFORM appletvos) if(NOT IOS_ARCH) set(IOS_ARCH arm64) endif() -elseif (IOS_PLATFORM STREQUAL "SIMULATOR_TVOS") +elseif(IOS_PLATFORM STREQUAL "SIMULATOR_TVOS") set(XCODE_IOS_PLATFORM appletvsimulator) if(NOT IOS_ARCH) set(IOS_ARCH x86_64) endif() -elseif (IOS_PLATFORM STREQUAL "WATCHOS") +elseif(IOS_PLATFORM STREQUAL "WATCHOS") set(XCODE_IOS_PLATFORM watchos) if(NOT IOS_ARCH) - if (XCODE_VERSION VERSION_GREATER 10.0) + if(XCODE_VERSION VERSION_GREATER 10.0) set(IOS_ARCH armv7k arm64_32) else() set(IOS_ARCH armv7k) endif() endif() -elseif (IOS_PLATFORM STREQUAL "SIMULATOR_WATCHOS") +elseif(IOS_PLATFORM STREQUAL "SIMULATOR_WATCHOS") set(XCODE_IOS_PLATFORM watchsimulator) if(NOT IOS_ARCH) set(IOS_ARCH x86_64) @@ -194,130 +179,166 @@ else() message(FATAL_ERROR "Invalid IOS_PLATFORM: ${IOS_PLATFORM}") endif() message(STATUS "Configuring iOS build for platform: ${IOS_PLATFORM}, " - "architecture(s): ${IOS_ARCH}") + "architecture(s): ${IOS_ARCH}") # If user did not specify the SDK root to use, then query xcodebuild for it. -execute_process(COMMAND xcodebuild -version -sdk ${XCODE_IOS_PLATFORM} Path - OUTPUT_VARIABLE CMAKE_OSX_SYSROOT_INT - OUTPUT_QUIET ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) +execute_process( + COMMAND xcodebuild -version -sdk ${XCODE_IOS_PLATFORM} Path + OUTPUT_VARIABLE CMAKE_OSX_SYSROOT_INT + OUTPUT_QUIET ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) # If user did not specify the SDK root to use, then query xcodebuild for it. 
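The block below falls back to querying xcodebuild whenever CMAKE_OSX_SYSROOT was not supplied. As a hedged illustration only (the SDK path shown is a typical default install location, not something this file guarantees), a caller that wants to pin the SDK explicitly could set it up front and skip the query:

# Hypothetical explicit SDK selection; adjust the path to the locally installed SDK.
set(CMAKE_OSX_SYSROOT
    "/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk"
    CACHE PATH "Path to the iOS SDK to use.")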
-if (NOT DEFINED CMAKE_OSX_SYSROOT OR (NOT CMAKE_OSX_SYSROOT STREQUAL CMAKE_OSX_SYSROOT_INT)) - execute_process(COMMAND xcodebuild -version -sdk ${XCODE_IOS_PLATFORM} Path +if(NOT DEFINED CMAKE_OSX_SYSROOT OR (NOT CMAKE_OSX_SYSROOT STREQUAL + CMAKE_OSX_SYSROOT_INT)) + execute_process( + COMMAND xcodebuild -version -sdk ${XCODE_IOS_PLATFORM} Path OUTPUT_VARIABLE CMAKE_OSX_SYSROOT - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) endif() -if (NOT EXISTS ${CMAKE_OSX_SYSROOT}) - message(SEND_ERROR "Please make sure that Xcode is installed and that the toolchain" - "is pointing to the correct path. Please run:" - "sudo xcode-select -s /Applications/Xcode.app/Contents/Developer" - "and see if that fixes the problem for you.") +if(NOT EXISTS ${CMAKE_OSX_SYSROOT}) + message( + SEND_ERROR "Please make sure that Xcode is installed and that the toolchain" + "is pointing to the correct path. Please run:" + "sudo xcode-select -s /Applications/Xcode.app/Contents/Developer" + "and see if that fixes the problem for you.") message(FATAL_ERROR "Invalid CMAKE_OSX_SYSROOT: ${CMAKE_OSX_SYSROOT} " - "does not exist.") + "does not exist.") elseif(DEFINED CMAKE_OSX_SYSROOT) - message(STATUS "Using manually set SDK path: ${CMAKE_OSX_SYSROOT} for platform: ${IOS_PLATFORM}") + message( + STATUS + "Using manually set SDK path: ${CMAKE_OSX_SYSROOT} for platform: ${IOS_PLATFORM}") else() - message(STATUS "Using SDK: ${CMAKE_OSX_SYSROOT} for platform: ${IOS_PLATFORM}") + message(STATUS "Using SDK: ${CMAKE_OSX_SYSROOT} for platform: ${IOS_PLATFORM}") endif() # Specify minimum version of deployment target. -if (NOT DEFINED IOS_DEPLOYMENT_TARGET) - if (IOS_PLATFORM STREQUAL "WATCHOS" OR IOS_PLATFORM STREQUAL "SIMULATOR_WATCHOS") - # Unless specified, SDK version 2.0 is used by default as minimum target version (watchOS). - set(IOS_DEPLOYMENT_TARGET "2.0" - CACHE STRING "Minimum iOS version to build for." ) +if(NOT DEFINED IOS_DEPLOYMENT_TARGET) + if(IOS_PLATFORM STREQUAL "WATCHOS" OR IOS_PLATFORM STREQUAL "SIMULATOR_WATCHOS") + # Unless specified, SDK version 2.0 is used by default as minimum target version + # (watchOS). + set(IOS_DEPLOYMENT_TARGET + "2.0" + CACHE STRING "Minimum iOS version to build for.") else() - # Unless specified, SDK version 10.0 is used by default as minimum target version (iOS, tvOS). - set(IOS_DEPLOYMENT_TARGET "10.0" - CACHE STRING "Minimum iOS version to build for." ) + # Unless specified, SDK version 10.0 is used by default as minimum target version + # (iOS, tvOS). + set(IOS_DEPLOYMENT_TARGET + "10.0" + CACHE STRING "Minimum iOS version to build for.") endif() - message(STATUS "Using the default min-version since IOS_DEPLOYMENT_TARGET not provided!") + message( + STATUS "Using the default min-version since IOS_DEPLOYMENT_TARGET not provided!") endif() # Use bitcode or not -if (NOT DEFINED ENABLE_BITCODE AND NOT IOS_ARCH MATCHES "((^|, )(i386|x86_64))+") +if(NOT DEFINED ENABLE_BITCODE AND NOT IOS_ARCH MATCHES "((^|, )(i386|x86_64))+") # Unless specified, enable bitcode support by default - set(ENABLE_BITCODE TRUE CACHE BOOL "Whether or not to enable bitcode") + set(ENABLE_BITCODE + TRUE + CACHE BOOL "Whether or not to enable bitcode") message(STATUS "Enabling bitcode support by default. ENABLE_BITCODE not provided!") endif() -if (NOT DEFINED ENABLE_BITCODE) - message(STATUS "Disabling bitcode support by default on simulators. 
ENABLE_BITCODE not provided for override!") +if(NOT DEFINED ENABLE_BITCODE) + message( + STATUS + "Disabling bitcode support by default on simulators. ENABLE_BITCODE not provided for override!" + ) endif() # Use ARC or not -if (NOT DEFINED ENABLE_ARC) +if(NOT DEFINED ENABLE_ARC) # Unless specified, enable ARC support by default - set(ENABLE_ARC TRUE CACHE BOOL "Whether or not to enable ARC") + set(ENABLE_ARC + TRUE + CACHE BOOL "Whether or not to enable ARC") message(STATUS "Enabling ARC support by default. ENABLE_ARC not provided!") endif() # Use hidden visibility or not -if (NOT DEFINED ENABLE_VISIBILITY) +if(NOT DEFINED ENABLE_VISIBILITY) # Unless specified, disable symbols visibility by default - set(ENABLE_VISIBILITY FALSE CACHE BOOL "Whether or not to hide symbols (-fvisibility=hidden)") - message(STATUS "Hiding symbols visibility by default. ENABLE_VISIBILITY not provided!") + set(ENABLE_VISIBILITY + FALSE + CACHE BOOL "Whether or not to hide symbols (-fvisibility=hidden)") + message( + STATUS "Hiding symbols visibility by default. ENABLE_VISIBILITY not provided!") endif() # Get the SDK version information. -execute_process(COMMAND xcodebuild -sdk ${CMAKE_OSX_SYSROOT} -version SDKVersion +execute_process( + COMMAND xcodebuild -sdk ${CMAKE_OSX_SYSROOT} -version SDKVersion OUTPUT_VARIABLE IOS_SDK_VERSION - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) -# Find the Developer root for the specific iOS platform being compiled for -# from CMAKE_OSX_SYSROOT. Should be ../../ from SDK specified in -# CMAKE_OSX_SYSROOT. There does not appear to be a direct way to obtain -# this information from xcrun or xcodebuild. -if (NOT CMAKE_IOS_DEVELOPER_ROOT) + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) +# Find the Developer root for the specific iOS platform being compiled for from +# CMAKE_OSX_SYSROOT. Should be ../../ from SDK specified in CMAKE_OSX_SYSROOT. There +# does not appear to be a direct way to obtain this information from xcrun or +# xcodebuild. +if(NOT CMAKE_IOS_DEVELOPER_ROOT) get_filename_component(IOS_PLATFORM_SDK_DIR ${CMAKE_OSX_SYSROOT} PATH) get_filename_component(CMAKE_IOS_DEVELOPER_ROOT ${IOS_PLATFORM_SDK_DIR} PATH) endif() -if (NOT EXISTS ${CMAKE_IOS_DEVELOPER_ROOT}) +if(NOT EXISTS ${CMAKE_IOS_DEVELOPER_ROOT}) message(FATAL_ERROR "Invalid CMAKE_IOS_DEVELOPER_ROOT: " - "${CMAKE_IOS_DEVELOPER_ROOT} does not exist.") + "${CMAKE_IOS_DEVELOPER_ROOT} does not exist.") endif() # Find the C & C++ compilers for the specified SDK. -if (NOT CMAKE_C_COMPILER) - execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang +if(NOT CMAKE_C_COMPILER) + execute_process( + COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang OUTPUT_VARIABLE CMAKE_C_COMPILER - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) message(STATUS "Using C compiler: ${CMAKE_C_COMPILER}") endif() -if (NOT CMAKE_CXX_COMPILER) - execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang++ +if(NOT CMAKE_CXX_COMPILER) + execute_process( + COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang++ OUTPUT_VARIABLE CMAKE_CXX_COMPILER - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) message(STATUS "Using CXX compiler: ${CMAKE_CXX_COMPILER}") endif() # Find (Apple's) libtool. 
-execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find libtool +execute_process( + COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find libtool OUTPUT_VARIABLE IOS_LIBTOOL - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) message(STATUS "Using libtool: ${IOS_LIBTOOL}") -# Configure libtool to be used instead of ar + ranlib to build static libraries. -# This is required on Xcode 7+, but should also work on previous versions of -# Xcode. +# Configure libtool to be used instead of ar + ranlib to build static libraries. This is +# required on Xcode 7+, but should also work on previous versions of Xcode. set(CMAKE_C_CREATE_STATIC_LIBRARY - "${IOS_LIBTOOL} -static -o ") + "${IOS_LIBTOOL} -static -o ") set(CMAKE_CXX_CREATE_STATIC_LIBRARY - "${IOS_LIBTOOL} -static -o ") + "${IOS_LIBTOOL} -static -o ") # Get the version of Darwin (OS X) of the host. -execute_process(COMMAND uname -r +execute_process( + COMMAND uname -r OUTPUT_VARIABLE CMAKE_HOST_SYSTEM_VERSION - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) # Standard settings. -set(CMAKE_SYSTEM_NAME Darwin CACHE INTERNAL "") -set(CMAKE_SYSTEM_VERSION ${IOS_SDK_VERSION} CACHE INTERNAL "") -set(UNIX TRUE CACHE BOOL "") -set(APPLE TRUE CACHE BOOL "") -set(IOS TRUE CACHE BOOL "") -set(CMAKE_AR ar CACHE FILEPATH "" FORCE) -set(CMAKE_RANLIB ranlib CACHE FILEPATH "" FORCE) -# Force unset of OS X-specific deployment target (otherwise autopopulated), -# required as of cmake 2.8.10. -set(CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING - "Must be empty for iOS builds." FORCE) +set(CMAKE_SYSTEM_NAME + Darwin + CACHE INTERNAL "") +set(CMAKE_SYSTEM_VERSION + ${IOS_SDK_VERSION} + CACHE INTERNAL "") +set(UNIX + TRUE + CACHE BOOL "") +set(APPLE + TRUE + CACHE BOOL "") +set(IOS + TRUE + CACHE BOOL "") +set(CMAKE_AR + ar + CACHE FILEPATH "" FORCE) +set(CMAKE_RANLIB + ranlib + CACHE FILEPATH "" FORCE) +# Force unset of OS X-specific deployment target (otherwise autopopulated), required as +# of cmake 2.8.10. +set(CMAKE_OSX_DEPLOYMENT_TARGET + "" + CACHE STRING "Must be empty for iOS builds." FORCE) # Set the architectures for which to build. -set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS") -# Change the type of target generated for try_compile() so it'll work when cross-compiling +set(CMAKE_OSX_ARCHITECTURES + ${IOS_ARCH} + CACHE STRING "Build architecture for iOS") +# Change the type of target generated for try_compile() so it'll work when +# cross-compiling set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) # All iOS/Darwin specific settings - some may be redundant. set(CMAKE_SHARED_LIBRARY_PREFIX "lib") @@ -332,7 +353,8 @@ set(CMAKE_MODULE_EXISTS 1) set(CMAKE_DL_LIBS "") set(CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ") set(CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ") -set(CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}") +set(CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG + "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}") set(CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}") if(IOS_ARCH MATCHES "((^|, )(arm64|arm64e|x86_64))+") @@ -350,35 +372,32 @@ message(STATUS "Building for minimum iOS version: ${IOS_DEPLOYMENT_TARGET}" # Note that only Xcode 7+ supports the newer more specific: # -m${XCODE_IOS_PLATFORM}-version-min flags, older versions of Xcode use: # -m(ios/ios-simulator)-version-min instead. 
-if (IOS_PLATFORM STREQUAL "OS" OR IOS_PLATFORM STREQUAL "OS64") - if (XCODE_VERSION VERSION_LESS 7.0) - set(XCODE_IOS_PLATFORM_VERSION_FLAGS - "-mios-version-min=${IOS_DEPLOYMENT_TARGET}") +if(IOS_PLATFORM STREQUAL "OS" OR IOS_PLATFORM STREQUAL "OS64") + if(XCODE_VERSION VERSION_LESS 7.0) + set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-mios-version-min=${IOS_DEPLOYMENT_TARGET}") else() # Xcode 7.0+ uses flags we can build directly from XCODE_IOS_PLATFORM. set(XCODE_IOS_PLATFORM_VERSION_FLAGS - "-m${XCODE_IOS_PLATFORM}-version-min=${IOS_DEPLOYMENT_TARGET}") + "-m${XCODE_IOS_PLATFORM}-version-min=${IOS_DEPLOYMENT_TARGET}") endif() -elseif (IOS_PLATFORM STREQUAL "TVOS") - set(XCODE_IOS_PLATFORM_VERSION_FLAGS - "-mtvos-version-min=${IOS_DEPLOYMENT_TARGET}") -elseif (IOS_PLATFORM STREQUAL "SIMULATOR_TVOS") - set(XCODE_IOS_PLATFORM_VERSION_FLAGS - "-mtvos-simulator-version-min=${IOS_DEPLOYMENT_TARGET}") -elseif (IOS_PLATFORM STREQUAL "WATCHOS") +elseif(IOS_PLATFORM STREQUAL "TVOS") + set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-mtvos-version-min=${IOS_DEPLOYMENT_TARGET}") +elseif(IOS_PLATFORM STREQUAL "SIMULATOR_TVOS") set(XCODE_IOS_PLATFORM_VERSION_FLAGS - "-mwatchos-version-min=${IOS_DEPLOYMENT_TARGET}") -elseif (IOS_PLATFORM STREQUAL "SIMULATOR_WATCHOS") + "-mtvos-simulator-version-min=${IOS_DEPLOYMENT_TARGET}") +elseif(IOS_PLATFORM STREQUAL "WATCHOS") + set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-mwatchos-version-min=${IOS_DEPLOYMENT_TARGET}") +elseif(IOS_PLATFORM STREQUAL "SIMULATOR_WATCHOS") set(XCODE_IOS_PLATFORM_VERSION_FLAGS - "-mwatchos-simulator-version-min=${IOS_DEPLOYMENT_TARGET}") + "-mwatchos-simulator-version-min=${IOS_DEPLOYMENT_TARGET}") else() # SIMULATOR or SIMULATOR64 both use -mios-simulator-version-min. set(XCODE_IOS_PLATFORM_VERSION_FLAGS - "-mios-simulator-version-min=${IOS_DEPLOYMENT_TARGET}") + "-mios-simulator-version-min=${IOS_DEPLOYMENT_TARGET}") endif() message(STATUS "Version flags set to: ${XCODE_IOS_PLATFORM_VERSION_FLAGS}") -if (ENABLE_BITCODE) +if(ENABLE_BITCODE) set(BITCODE "-fembed-bitcode") set(HEADER_PAD "") message(STATUS "Enabling bitcode support.") @@ -388,7 +407,7 @@ else() message(STATUS "Disabling bitcode support.") endif() -if (ENABLE_ARC) +if(ENABLE_ARC) set(FOBJC_ARC "-fobjc-arc") message(STATUS "Enabling ARC support.") else() @@ -396,7 +415,7 @@ else() message(STATUS "Disabling ARC support.") endif() -if (NOT ENABLE_VISIBILITY) +if(NOT ENABLE_VISIBILITY) set(VISIBILITY "-fvisibility=hidden") message(STATUS "Hiding symbols (-fvisibility=hidden).") else() @@ -404,20 +423,31 @@ else() endif() set(CMAKE_C_FLAGS -"${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${BITCODE} -fobjc-abi-version=2 ${FOBJC_ARC} ${CMAKE_C_FLAGS}") + "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${BITCODE} -fobjc-abi-version=2 ${FOBJC_ARC} ${CMAKE_C_FLAGS}" +) # Hidden visibilty is required for C++ on iOS. 
set(CMAKE_CXX_FLAGS -"${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${BITCODE} ${VISIBILITY} -fvisibility-inlines-hidden -fobjc-abi-version=2 ${FOBJC_ARC} ${CMAKE_CXX_FLAGS}") -set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS} -DNDEBUG -Os -ffast-math ${BITCODE} ${CMAKE_CXX_FLAGS_MINSIZEREL}") -set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS} -DNDEBUG -O2 -g -ffast-math ${BITCODE} ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}") -set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -DNDEBUG -O3 -ffast-math ${BITCODE} ${CMAKE_CXX_FLAGS_RELEASE}") -set(CMAKE_C_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first ${CMAKE_C_LINK_FLAGS}") -set(CMAKE_CXX_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first ${CMAKE_CXX_LINK_FLAGS}") + "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${BITCODE} ${VISIBILITY} -fvisibility-inlines-hidden -fobjc-abi-version=2 ${FOBJC_ARC} ${CMAKE_CXX_FLAGS}" +) +set(CMAKE_CXX_FLAGS_MINSIZEREL + "${CMAKE_CXX_FLAGS} -DNDEBUG -Os -ffast-math ${BITCODE} ${CMAKE_CXX_FLAGS_MINSIZEREL}" +) +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO + "${CMAKE_CXX_FLAGS} -DNDEBUG -O2 -g -ffast-math ${BITCODE} ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}" +) +set(CMAKE_CXX_FLAGS_RELEASE + "${CMAKE_CXX_FLAGS} -DNDEBUG -O3 -ffast-math ${BITCODE} ${CMAKE_CXX_FLAGS_RELEASE}") +set(CMAKE_C_LINK_FLAGS + "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first ${CMAKE_C_LINK_FLAGS}") +set(CMAKE_CXX_LINK_FLAGS + "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first ${CMAKE_CXX_LINK_FLAGS}" +) -# In order to ensure that the updated compiler flags are used in try_compile() -# tests, we have to forcibly set them in the CMake cache, not merely set them -# in the local scope. -list(APPEND VARS_TO_FORCE_IN_CACHE +# In order to ensure that the updated compiler flags are used in try_compile() tests, we +# have to forcibly set them in the CMake cache, not merely set them in the local scope. 
+list( + APPEND + VARS_TO_FORCE_IN_CACHE CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELWITHDEBINFO @@ -426,37 +456,40 @@ list(APPEND VARS_TO_FORCE_IN_CACHE CMAKE_C_LINK_FLAGS CMAKE_CXX_LINK_FLAGS) foreach(VAR_TO_FORCE ${VARS_TO_FORCE_IN_CACHE}) - set(${VAR_TO_FORCE} "${${VAR_TO_FORCE}}" CACHE STRING "") + set(${VAR_TO_FORCE} + "${${VAR_TO_FORCE}}" + CACHE STRING "") endforeach() set(CMAKE_PLATFORM_HAS_INSTALLNAME 1) -set (CMAKE_SHARED_LINKER_FLAGS "-rpath @executable_path/Frameworks -rpath @loader_path/Frameworks") +set(CMAKE_SHARED_LINKER_FLAGS + "-rpath @executable_path/Frameworks -rpath @loader_path/Frameworks") set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib ${HEADER_PAD}") set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle ${HEADER_PAD}") set(CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,") set(CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,") set(CMAKE_FIND_LIBRARY_SUFFIXES ".dylib" ".so" ".a") -# Hack: if a new cmake (which uses CMAKE_INSTALL_NAME_TOOL) runs on an old -# build tree (where install_name_tool was hardcoded) and where -# CMAKE_INSTALL_NAME_TOOL isn't in the cache and still cmake didn't fail in -# CMakeFindBinUtils.cmake (because it isn't rerun) hardcode -# CMAKE_INSTALL_NAME_TOOL here to install_name_tool, so it behaves as it did +# Hack: if a new cmake (which uses CMAKE_INSTALL_NAME_TOOL) runs on an old build tree +# (where install_name_tool was hardcoded) and where CMAKE_INSTALL_NAME_TOOL isn't in the +# cache and still cmake didn't fail in CMakeFindBinUtils.cmake (because it isn't rerun) +# hardcode CMAKE_INSTALL_NAME_TOOL here to install_name_tool, so it behaves as it did # before, Alex. -if (NOT DEFINED CMAKE_INSTALL_NAME_TOOL) +if(NOT DEFINED CMAKE_INSTALL_NAME_TOOL) find_program(CMAKE_INSTALL_NAME_TOOL install_name_tool) -endif (NOT DEFINED CMAKE_INSTALL_NAME_TOOL) +endif(NOT DEFINED CMAKE_INSTALL_NAME_TOOL) # Set the find root to the iOS developer roots and to user defined paths. -set(CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_OSX_SYSROOT} - ${CMAKE_PREFIX_PATH} CACHE STRING "iOS find search path root" FORCE) +set(CMAKE_FIND_ROOT_PATH + ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_OSX_SYSROOT} ${CMAKE_PREFIX_PATH} + CACHE STRING "iOS find search path root" FORCE) # Default to searching for frameworks first. set(CMAKE_FIND_FRAMEWORK FIRST) # Set up the default search directories for frameworks. set(CMAKE_SYSTEM_FRAMEWORK_PATH - ${CMAKE_OSX_SYSROOT}/System/Library/Frameworks - ${CMAKE_OSX_SYSROOT}/System/Library/PrivateFrameworks - ${CMAKE_OSX_SYSROOT}/Developer/Library/Frameworks) + ${CMAKE_OSX_SYSROOT}/System/Library/Frameworks + ${CMAKE_OSX_SYSROOT}/System/Library/PrivateFrameworks + ${CMAKE_OSX_SYSROOT}/Developer/Library/Frameworks) # Only search the specified iOS SDK, not the remainder of the host filesystem. set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY) set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) @@ -464,12 +497,14 @@ set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) # This little macro lets you set any XCode specific property. 
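A short usage sketch for the macro defined just below. The target name is hypothetical; the variant argument follows the list documented at the top of this file (All, Release, RelWithDebInfo, Debug, MinSizeRel):

# Apply a build setting to every variant of a (hypothetical) target:
set_xcode_property(my_ios_lib IPHONEOS_DEPLOYMENT_TARGET "10.0" "All")
# Apply a build setting to the Release variant only:
set_xcode_property(my_ios_lib GCC_GENERATE_DEBUGGING_SYMBOLS "NO" "Release")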
macro(set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE XCODE_RELVERSION) set(XCODE_RELVERSION_I "${XCODE_RELVERSION}") - if (XCODE_RELVERSION_I STREQUAL "All") - set_property(TARGET ${TARGET} PROPERTY - XCODE_ATTRIBUTE_${XCODE_PROPERTY} "${XCODE_VALUE}") + if(XCODE_RELVERSION_I STREQUAL "All") + set_property(TARGET ${TARGET} PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY} + "${XCODE_VALUE}") else() - set_property(TARGET ${TARGET} PROPERTY - XCODE_ATTRIBUTE_${XCODE_PROPERTY}[variant=${XCODE_RELVERSION_I}] "${XCODE_VALUE}") + set_property( + TARGET ${TARGET} + PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY}[variant=${XCODE_RELVERSION_I}] + "${XCODE_VALUE}") endif() endmacro(set_xcode_property) # This macro lets you find executable programs on the host system. diff --git a/toolchains/riscv64-linux-gnu.toolchain.cmake b/toolchains/riscv64-linux-gnu.toolchain.cmake index d90ad0c4..3100b0de 100644 --- a/toolchains/riscv64-linux-gnu.toolchain.cmake +++ b/toolchains/riscv64-linux-gnu.toolchain.cmake @@ -3,16 +3,18 @@ set(CMAKE_SYSTEM_PROCESSOR riscv64) set(RISCV_CROSS_BUILD_ARCH riscv64) if(DEFINED ENV{RISCV_TOOLCHAIN_ROOT}) - file(TO_CMAKE_PATH $ENV{RISCV_TOOLCHAIN_ROOT} RISCV_TOOLCHAIN_ROOT) + file(TO_CMAKE_PATH $ENV{RISCV_TOOLCHAIN_ROOT} RISCV_TOOLCHAIN_ROOT) else() - message(FATAL_ERROR "RISCV_TOOLCHAIN_ROOT env must be defined") + message(FATAL_ERROR "RISCV_TOOLCHAIN_ROOT env must be defined") endif() -set(RISCV_TOOLCHAIN_ROOT ${RISCV_TOOLCHAIN_ROOT} CACHE STRING "root path to riscv toolchain") +set(RISCV_TOOLCHAIN_ROOT + ${RISCV_TOOLCHAIN_ROOT} + CACHE STRING "root path to riscv toolchain") set(CMAKE_C_COMPILER "${RISCV_TOOLCHAIN_ROOT}/bin/riscv64-unknown-linux-gnu-gcc") set(CMAKE_CXX_COMPILER "${RISCV_TOOLCHAIN_ROOT}/bin/riscv64-unknown-linux-gnu-g++") set(CMAKE_FIND_ROOT_PATH "${RISCV_TOOLCHAIN_ROOT}/riscv64-unknown-linux-gnu") set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) -set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) \ No newline at end of file +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) diff --git a/tools/cmake_format_config.json b/tools/cmake_format_config.json new file mode 100644 index 00000000..a7fd7394 --- /dev/null +++ b/tools/cmake_format_config.json @@ -0,0 +1,311 @@ +{ + "_help_parse": "Options affecting listfile parsing", + "parse": { + "_help_additional_commands": [ + "Specify structure for custom cmake functions" + ], + "additional_commands": { + "foo": { + "flags": [ + "BAR", + "BAZ" + ], + "kwargs": { + "HEADERS": "*", + "SOURCES": "*", + "DEPENDS": "*" + } + } + }, + "_help_override_spec": [ + "Override configurations per-command where available" + ], + "override_spec": {}, + "_help_vartags": [ + "Specify variable tags." + ], + "vartags": [], + "_help_proptags": [ + "Specify property tags." 
+ ], + "proptags": [] + }, + "_help_format": "Options affecting formatting.", + "format": { + "_help_disable": [ + "Disable formatting entirely, making cmake-format a no-op" + ], + "disable": false, + "_help_line_width": [ + "How wide to allow formatted cmake files" + ], + "line_width": 88, + "_help_tab_size": [ + "How many spaces to tab for indent" + ], + "tab_size": 2, + "_help_use_tabchars": [ + "If true, lines are indented using tab characters (utf-8", + "0x09) instead of space characters (utf-8 0x20).", + "In cases where the layout would require a fractional tab", + "character, the behavior of the fractional indentation is", + "governed by " + ], + "use_tabchars": false, + "_help_fractional_tab_policy": [ + "If is True, then the value of this variable", + "indicates how fractional indentions are handled during", + "whitespace replacement. If set to 'use-space', fractional", + "indentation is left as spaces (utf-8 0x20). If set to", + "`round-up` fractional indentation is replaced with a single", + "tab character (utf-8 0x09) effectively shifting the column", + "to the next tabstop" + ], + "fractional_tab_policy": "use-space", + "_help_max_subgroups_hwrap": [ + "If an argument group contains more than this many sub-groups", + "(parg or kwarg groups) then force it to a vertical layout." + ], + "max_subgroups_hwrap": 2, + "_help_max_pargs_hwrap": [ + "If a positional argument group contains more than this many", + "arguments, then force it to a vertical layout." + ], + "max_pargs_hwrap": 6, + "_help_max_rows_cmdline": [ + "If a cmdline positional group consumes more than this many", + "lines without nesting, then invalidate the layout (and nest)" + ], + "max_rows_cmdline": 2, + "_help_separate_ctrl_name_with_space": [ + "If true, separate flow control names from their parentheses", + "with a space" + ], + "separate_ctrl_name_with_space": false, + "_help_separate_fn_name_with_space": [ + "If true, separate function names from parentheses with a", + "space" + ], + "separate_fn_name_with_space": false, + "_help_dangle_parens": [ + "If a statement is wrapped to more than one line, than dangle", + "the closing parenthesis on its own line." + ], + "dangle_parens": false, + "_help_dangle_align": [ + "If the trailing parenthesis must be 'dangled' on its on", + "line, then align it to this reference: `prefix`: the start", + "of the statement, `prefix-indent`: the start of the", + "statement, plus one indentation level, `child`: align to", + "the column of the arguments" + ], + "dangle_align": "prefix", + "_help_min_prefix_chars": [ + "If the statement spelling length (including space and", + "parenthesis) is smaller than this amount, then force reject", + "nested layouts." + ], + "min_prefix_chars": 4, + "_help_max_prefix_chars": [ + "If the statement spelling length (including space and", + "parenthesis) is larger than the tab width by more than this", + "amount, then force reject un-nested layouts." + ], + "max_prefix_chars": 10, + "_help_max_lines_hwrap": [ + "If a candidate layout is wrapped horizontally but it exceeds", + "this many lines, then reject the layout." + ], + "max_lines_hwrap": 2, + "_help_line_ending": [ + "What style line endings to use in the output." 
+ ], + "line_ending": "unix", + "_help_command_case": [ + "Format command names consistently as 'lower' or 'upper' case" + ], + "command_case": "canonical", + "_help_keyword_case": [ + "Format keywords consistently as 'lower' or 'upper' case" + ], + "keyword_case": "unchanged", + "_help_always_wrap": [ + "A list of command names which should always be wrapped" + ], + "always_wrap": [], + "_help_enable_sort": [ + "If true, the argument lists which are known to be sortable", + "will be sorted lexicographicall" + ], + "enable_sort": true, + "_help_autosort": [ + "If true, the parsers may infer whether or not an argument", + "list is sortable (without annotation)." + ], + "autosort": false, + "_help_require_valid_layout": [ + "By default, if cmake-format cannot successfully fit", + "everything into the desired linewidth it will apply the", + "last, most agressive attempt that it made. If this flag is", + "True, however, cmake-format will print error, exit with non-", + "zero status code, and write-out nothing" + ], + "require_valid_layout": false, + "_help_layout_passes": [ + "A dictionary mapping layout nodes to a list of wrap", + "decisions. See the documentation for more information." + ], + "layout_passes": {} + }, + "_help_markup": "Options affecting comment reflow and formatting.", + "markup": { + "_help_bullet_char": [ + "What character to use for bulleted lists" + ], + "bullet_char": "*", + "_help_enum_char": [ + "What character to use as punctuation after numerals in an", + "enumerated list" + ], + "enum_char": ".", + "_help_first_comment_is_literal": [ + "If comment markup is enabled, don't reflow the first comment", + "block in each listfile. Use this to preserve formatting of", + "your copyright/license statements." + ], + "first_comment_is_literal": false, + "_help_literal_comment_pattern": [ + "If comment markup is enabled, don't reflow any comment block", + "which matches this (regex) pattern. Default is `None`", + "(disabled)." + ], + "literal_comment_pattern": ".*INTERNAL.*", + "_help_fence_pattern": [ + "Regular expression to match preformat fences in comments", + "default= ``r'^\\s*([`~]{3}[`~]*)(.*)$'``" + ], + "fence_pattern": "^\\s*([`~]{3}[`~]*)(.*)$", + "_help_ruler_pattern": [ + "Regular expression to match rulers in comments default=", + "``r'^\\s*[^\\w\\s]{3}.*[^\\w\\s]{3}$'``" + ], + "ruler_pattern": "^\\s*[^\\w\\s]{3}.*[^\\w\\s]{3}$", + "_help_explicit_trailing_pattern": [ + "If a comment line matches starts with this pattern then it", + "is explicitly a trailing comment for the preceeding", + "argument. Default is '#<'" + ], + "explicit_trailing_pattern": "#<", + "_help_hashruler_min_length": [ + "If a comment line starts with at least this many consecutive", + "hash characters, then don't lstrip() them off. 
This allows", + "for lazy hash rulers where the first hash char is not", + "separated by space" + ], + "hashruler_min_length": 10, + "_help_canonicalize_hashrulers": [ + "If true, then insert a space between the first hash char and", + "remaining hash chars in a hash ruler, and normalize its", + "length to fill the column" + ], + "canonicalize_hashrulers": true, + "_help_enable_markup": [ + "enable comment markup parsing and reflow" + ], + "enable_markup": true + }, + "_help_lint": "Options affecting the linter", + "lint": { + "_help_disabled_codes": [ + "a list of lint codes to disable" + ], + "disabled_codes": [], + "_help_function_pattern": [ + "regular expression pattern describing valid function names" + ], + "function_pattern": "[0-9a-z_]+", + "_help_macro_pattern": [ + "regular expression pattern describing valid macro names" + ], + "macro_pattern": "[0-9A-Z_]+", + "_help_global_var_pattern": [ + "regular expression pattern describing valid names for", + "variables with global (cache) scope" + ], + "global_var_pattern": "[A-Z][0-9A-Z_]+", + "_help_internal_var_pattern": [ + "regular expression pattern describing valid names for", + "variables with global scope (but internal semantic)" + ], + "internal_var_pattern": "_[A-Z][0-9A-Z_]+", + "_help_local_var_pattern": [ + "regular expression pattern describing valid names for", + "variables with local scope" + ], + "local_var_pattern": "[a-z][a-z0-9_]+", + "_help_private_var_pattern": [ + "regular expression pattern describing valid names for", + "privatedirectory variables" + ], + "private_var_pattern": "_[0-9a-z_]+", + "_help_public_var_pattern": [ + "regular expression pattern describing valid names for public", + "directory variables" + ], + "public_var_pattern": "[A-Z][0-9A-Z_]+", + "_help_argument_var_pattern": [ + "regular expression pattern describing valid names for", + "function/macro arguments and loop variables." + ], + "argument_var_pattern": "[a-z][a-z0-9_]+", + "_help_keyword_pattern": [ + "regular expression pattern describing valid names for", + "keywords used in functions or macros" + ], + "keyword_pattern": "[A-Z][0-9A-Z_]+", + "_help_max_conditionals_custom_parser": [ + "In the heuristic for C0201, how many conditionals to match", + "within a loop in before considering the loop a parser." + ], + "max_conditionals_custom_parser": 2, + "_help_min_statement_spacing": [ + "Require at least this many newlines between statements" + ], + "min_statement_spacing": 1, + "_help_max_statement_spacing": [ + "Require no more than this many newlines between statements" + ], + "max_statement_spacing": 2, + "max_returns": 6, + "max_branches": 12, + "max_arguments": 5, + "max_localvars": 15, + "max_statements": 50 + }, + "_help_encode": "Options affecting file encoding", + "encode": { + "_help_emit_byteorder_mark": [ + "If true, emit the unicode byte-order mark (BOM) at the start", + "of the file" + ], + "emit_byteorder_mark": false, + "_help_input_encoding": [ + "Specify the encoding of the input file. Defaults to utf-8" + ], + "input_encoding": "utf-8", + "_help_output_encoding": [ + "Specify the encoding of the output file. Defaults to utf-8.", + "Note that cmake only claims to support utf-8 so be careful", + "when using anything else" + ], + "output_encoding": "utf-8" + }, + "_help_misc": "Miscellaneous configurations options.", + "misc": { + "_help_per_command": [ + "A dictionary containing any per-command configuration", + "overrides. Currently only `command_case` is supported." 
+    ],
+    "per_command": {}
+  }
+}
diff --git a/tools/cmakeformat.py b/tools/cmakeformat.py
new file mode 100755
index 00000000..7c06f6e6
--- /dev/null
+++ b/tools/cmakeformat.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+import argparse
+import os
+import subprocess
+from pathlib import Path
+
+CMAKE_FILS_DIRS = [
+    "test",
+    "dnn",
+    "tools",
+    "sdk",
+    "src",
+    "imperative",
+    "lite",
+    "cmake",
+    "toolchains",
+]
+
+
+def main():
+    os.chdir(str(Path(__file__).resolve().parent.parent))
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument("--check", action="store_true", help="check mode")
+    parser.add_argument(
+        "--cmake_files",
+        nargs="+",
+        default=None,
+        dest="cmake_files",
+        help="cmake files to format, please split with space",
+    )
+    args = parser.parse_args()
+
+    handle_files = []
+    if args.cmake_files:
+        handle_files = args.cmake_files
+        for cmake_file in handle_files:
+            assert os.path.isfile(
+                cmake_file
+            ), "error input --cmake_files, can not find file: {}".format(cmake_file)
+    else:
+        handle_files.append("CMakeLists.txt")
+        for cmake_file_dir in CMAKE_FILS_DIRS:
+            assert os.path.isdir(
+                cmake_file_dir
+            ), "{} is not a directory, may config error for CMAKE_FILS_DIRS".format(
+                cmake_file_dir
+            )
+            for cmake_file in [
+                os.path.join(root, file)
+                for root, dirs, files in os.walk(cmake_file_dir)
+                for file in files
+                if file.endswith("CMakeLists.txt") or file.endswith(".cmake")
+            ]:
+                print("find cmake_file: {}".format(cmake_file))
+                assert os.path.isfile(cmake_file), "code issue happened!!"
+                handle_files.append(cmake_file)
+
+    for cmake_file in handle_files:
+        handle_type = ["format", "--in-place"]
+        if args.check:
+            handle_type = ["check", "--check"]
+        cmd = "cmake-format -c tools/cmake_format_config.json {} {}".format(
+            handle_type[1], cmake_file
+        )
+        print("try {}: {} with command: {}".format(handle_type[0], cmake_file, cmd))
+        try:
+            subprocess.check_call(cmd, shell=True)
+        except Exception as exc:
+            print("run cmd {} failed".format(cmd))
+            if args.check:
+                print(
+                    'please run: "python3 tools/cmakeformat.py" to format cmake files'
+                )
+            else:
+                print("code issue happened!!, please FIXME!!")
+            raise exc
+
+
+if __name__ == "__main__":
+    subprocess.check_call("python3 -m pip install cmakelang==0.6.13 --user", shell=True)
+    main()
diff --git a/tools/format.py b/tools/format.py
index af0c0afd..e2d6921f 100755
--- a/tools/format.py
+++ b/tools/format.py
@@ -19,7 +19,8 @@ failed_files = Manager().list()
 
 def process_file(file, clang_format, write):
     source = open(file, "r").read()
-    source = re.sub(r"MGB_DEFINE(?P(.|\n)*?)// +{", "class MGB_DEFINE\g{", source)
+    source = re.sub(r"MGB_DEFINE(?P([^\\]|\n)*?)// *{", r"class MGB_DEFINE\g{", source)
+    source, count = re.subn(r"(?
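
For a sense of what the configuration above produces, the following snippet is a purely illustrative sketch (the DEMO_SRCS variable and the source file names are hypothetical and not part of this change). With line_width 88, tab_size 2, dangle_parens false and max_pargs_hwrap 6, cmake-format leaves short calls untouched but switches a call with more than six positional arguments (or one that overflows 88 columns) to a vertical layout, keeping the closing parenthesis on the last argument line, roughly:

    # before (hypothetical input, a single line wider than 88 columns)
    set(DEMO_SRCS main.cpp utils.cpp io.cpp net.cpp cpu.cpp gpu.cpp jit.cpp CACHE INTERNAL "demo sources")

    # after "cmake-format -c tools/cmake_format_config.json --in-place" (expected shape)
    set(DEMO_SRCS
        main.cpp
        utils.cpp
        io.cpp
        net.cpp
        cpu.cpp
        gpu.cpp
        jit.cpp
        CACHE INTERNAL "demo sources")

The new tools/cmakeformat.py wraps that invocation for the whole tree: running "python3 tools/cmakeformat.py" rewrites CMakeLists.txt and every CMake file under the listed directories in place, while "python3 tools/cmakeformat.py --check" only verifies formatting and re-raises the cmake-format failure (non-zero exit) for any file that would be rewritten.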